diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs
index eeb6538c0d..da216d4dd6 100644
--- a/PolyPilot.Tests/ProcessingWatchdogTests.cs
+++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs
@@ -2457,9 +2457,10 @@ public void WatchdogMaxToolAliveResets_BoundsMaxStuckTime()
     [Fact]
     public void CaseA_ExceedingMaxResets_FallsThroughToKill_InSource()
     {
-        // Verify the watchdog's Case A checks WatchdogMaxToolAliveResets and falls
-        // through when exceeded. This is the core fix for the stuck-session bug where
-        // a dead session's tool appears active but no events ever arrive.
+        // Verify the watchdog's Case A uses events.jsonl freshness checks and falls
+        // through when the file is stale and the confirmation cycle is exceeded.
+        // This is the core fix for the stuck-session bug where a dead session's tool
+        // appears active but no events ever arrive.
         var source = File.ReadAllText(
             Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
         var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
@@ -2467,10 +2468,10 @@ public void CaseA_ExceedingMaxResets_FallsThroughToKill_InSource()
         var endIdx = source.IndexOf("    private readonly ConcurrentDictionary", methodIdx);
         var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx);
 
-        // Case A must reference WatchdogMaxToolAliveResets
-        Assert.True(watchdogBody.Contains("WatchdogMaxToolAliveResets"),
-            "Case A must check WatchdogMaxToolAliveResets to cap consecutive resets");
-        // Case A must increment WatchdogCaseAResets
+        // Case A must check events.jsonl freshness
+        Assert.True(watchdogBody.Contains("events.jsonl"),
+            "Case A must check events.jsonl freshness to distinguish active tools from dead connections");
+        // Case A must increment WatchdogCaseAResets for confirmation cycles
         Assert.True(watchdogBody.Contains("WatchdogCaseAResets"),
             "Case A must track reset count via state.WatchdogCaseAResets");
     }
diff --git a/PolyPilot.Tests/SessionOrganizationTests.cs b/PolyPilot.Tests/SessionOrganizationTests.cs
index 2e1e5df14d..e123df7a22 100644
--- a/PolyPilot.Tests/SessionOrganizationTests.cs
+++ b/PolyPilot.Tests/SessionOrganizationTests.cs
@@ -1364,6 +1364,22 @@ public void BuiltInPresets_IncludePRReviewSquad()
         Assert.NotNull(prSquad.WorkerSystemPrompts);
         Assert.Equal(prSquad.WorkerModels.Length, prSquad.WorkerSystemPrompts!.Length);
     }
+
+    [Fact]
+    public void BuiltInPresets_IncludeSkillValidator()
+    {
+        var skillValidator = GroupPreset.BuiltIn.FirstOrDefault(p => p.Name == "Skill Validator");
+        Assert.NotNull(skillValidator);
+        Assert.Equal(2, skillValidator!.WorkerModels.Length);
+        Assert.Equal(MultiAgentMode.OrchestratorReflect, skillValidator.Mode);
+        Assert.Equal("⚖️", skillValidator.Emoji);
+        Assert.NotNull(skillValidator.SharedContext);
+        Assert.NotNull(skillValidator.RoutingContext);
+        Assert.NotNull(skillValidator.WorkerSystemPrompts);
+        Assert.Equal(2, skillValidator.WorkerSystemPrompts!.Length);
+        Assert.All(skillValidator.WorkerSystemPrompts, p => Assert.False(string.IsNullOrWhiteSpace(p)));
+        Assert.NotNull(skillValidator.MaxReflectIterations);
+    }
 }
 
 public class GroupModelAnalyzerTests
@@ -1766,7 +1782,7 @@ public class MultiAgentScenarioTests
     /// 
     /// User flow:
     ///   1. Click 🚀 Preset in sidebar toolbar
-    ///   2. Preset picker appears showing 2 built-in templates
+    ///   2. Preset picker appears showing 3 built-in templates
     ///   3. Select "PR Review Squad" (📋)
     ///   4. System creates: Orchestrator (claude-opus-4.6) + 5 Workers
     ///   5. Sidebar shows group with mode selector set to "🎯 Orchestrator"
@@ -1777,7 +1793,7 @@ public void Scenario_CreateGroupFromPreset()
     {
         // Step 1-2: User sees built-in presets
         var presets = GroupPreset.BuiltIn;
-        Assert.Equal(2, presets.Length);
+        Assert.Equal(3, presets.Length);
 
         // Step 3: User picks "PR Review Squad"
         var prReview = presets.First(p => p.Name == "PR Review Squad");
diff --git a/PolyPilot/Models/ModelCapabilities.cs b/PolyPilot/Models/ModelCapabilities.cs
index 13634ab818..c8773e8695 100644
--- a/PolyPilot/Models/ModelCapabilities.cs
+++ b/PolyPilot/Models/ModelCapabilities.cs
@@ -344,6 +344,197 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg
                 """,
             MaxReflectIterations = 10,
         },
+
+        new GroupPreset(
+            "Skill Validator", "Two evaluators assess your skill from different angles — dotnet empirical testing vs. Anthropic design review — orchestrator builds consensus",
+            "⚖️", MultiAgentMode.OrchestratorReflect,
+            "claude-opus-4.6", new[] { "claude-sonnet-4.6", "claude-sonnet-4.6" })
+        {
+            WorkerSystemPrompts = new[]
+            {
+                """
+                You are the Dotnet Skill Validator. You evaluate skills empirically using the methodology from the dotnet/skills skill-validator tool.
+
+                ## Your Evaluation Approach
+
+                For each skill under evaluation, perform these steps:
+
+                ### 1. Inspect the Skill Definition
+                - Read the SKILL.md file carefully
+                - Read the skill's `tests/eval.yaml` if present
+                - Identify what scenarios the skill claims to improve
+
+                ### 2. Empirical Baseline Comparison
+                - For each scenario in the skill's eval.yaml (or infer representative scenarios from SKILL.md):
+                  a. Describe what the agent response would look like WITHOUT the skill (baseline)
+                  b. Describe what the agent response looks like WITH the skill
+                  c. Assess: token efficiency, tool call count, task completion, error rate
+
+                ### 3. Pairwise Comparative Judgment
+                - Compare baseline vs. skill-augmented performance side-by-side
+                - Apply position-swap bias mitigation: consider both orderings in your judgment
+                - Rate improvement on a scale of -1 (degraded) to +1 (strong improvement) with confidence
+
+                ### 4. Statistical Assessment
+                - Estimate variance across the scenario set
+                - Flag scenarios where the skill helps most vs. least
+                - Identify scenarios where the skill might cause regressions
+
+                ### 5. Verdict
+                Format your verdict as:
+                ```
+                ## Dotnet Validator Verdict
+                **Overall Score**: X/10
+                **Confidence**: High / Medium / Low
+                **Verdict**: KEEP / IMPROVE / REMOVE
+
+                ### Strengths
+                - [specific strengths with evidence]
+
+                ### Weaknesses
+                - [specific weaknesses with evidence]
+
+                ### Suggested Improvements
+                - [concrete, actionable suggestions]
+                ```
+
+                ## Rules
+                - Be data-driven: cite specific scenarios and observed behaviors
+                - Focus on measurable outcomes: does it reduce tokens? improve task completion? reduce errors?
+                - Don't recommend keeping a skill that shows no measurable improvement in your empirical analysis
+                """,
+
+                """
+                You are the Anthropic Skill Evaluator. You evaluate skills through the lens of prompt engineering quality, trigger accuracy, and agent guidance design.
+
+                ## Your Evaluation Approach
+
+                For each skill under evaluation, assess these dimensions:
+
+                ### 1. Description Quality (Trigger Accuracy)
+                - Is the description specific enough to trigger reliably for its intended use cases?
+                - Is it too broad — will it trigger for unintended tasks?
+                - Does it include enough trigger phrases/keywords to match user intent?
+                - Rate trigger precision (0-10) and recall (0-10)
+
+                ### 2. Instruction Clarity
+                - Are the instructions in SKILL.md clear, actionable, and unambiguous?
+                - Do they tell the agent *exactly* what to do and in what order?
+                - Are there missing edge cases or situations the skill doesn't handle?
+                - Does the skill avoid over-constraining the agent in ways that limit helpfulness?
+
+                ### 3. Scope Appropriateness
+                - Is the skill focused on a single, well-defined capability?
+                - Is the skill too broad (trying to do too much) or too narrow (not useful enough)?
+                - Does the skill overlap significantly with other skills? (potential conflicts)
+
+                ### 4. Test Coverage
+                - Does the eval.yaml cover the happy path?
+                - Does it cover edge cases and failure modes?
+                - Are negative test cases present (things the skill should NOT do)?
+
+                ### 5. Verdict
+                Format your verdict as:
+                ```
+                ## Anthropic Evaluator Verdict
+                **Overall Score**: X/10
+                **Trigger Precision**: X/10 | **Trigger Recall**: X/10
+                **Verdict**: KEEP / IMPROVE / REMOVE
+
+                ### Strengths
+                - [specific strengths]
+
+                ### Weaknesses
+                - [specific weaknesses]
+
+                ### Suggested Improvements
+                - [concrete rewrites or additions with examples]
+                ```
+
+                ## Rules
+                - Be concrete: quote specific lines from SKILL.md when critiquing
+                - Focus on prompt quality, not empirical test results
+                - Suggest specific rewrites, not vague advice like "be clearer"
+                """,
+            },
+            SharedContext = """
+                ## Skill Evaluation Standards
+
+                Both evaluators assess the same skill and produce independent verdicts.
+                A good skill must satisfy BOTH evaluators to be marked KEEP.
+
+                ### What makes a skill worth keeping
+                - Measurable improvement in task completion (Dotnet validator perspective)
+                - Clear, precise description that triggers reliably (Anthropic evaluator perspective)
+                - Focused scope — does one thing well
+                - Actionable instructions that guide the agent without over-constraining it
+                - Adequate test coverage
+
+                ### What warrants IMPROVE
+                - Good intent but fixable gaps (bad trigger description, missing scenarios, ambiguous instructions)
+                - One evaluator says KEEP but the other says REMOVE with specific concerns
+
+                ### What warrants REMOVE
+                - No measurable improvement in empirical testing
+                - Trigger description too broad/narrow to be useful
+                - Instructions that would cause regressions or confusion
+
+                ### Consensus Rule
+                - KEEP requires both evaluators to say KEEP or one KEEP + one IMPROVE
+                - IMPROVE if the evaluators disagree or both say IMPROVE
+                - REMOVE if either evaluator says REMOVE with strong evidence
+                """,
+            RoutingContext = """
+                ## Skill Validator Orchestration
+
+                You orchestrate two skill evaluators who assess skills from different angles. Your role is to:
+                1. Dispatch the skill to BOTH evaluators simultaneously (or sequentially if needed)
+                2. Collect their verdicts
+                3. Identify where they agree and disagree
+                4. Build a consensus verdict explaining which suggestions to adopt and why
+
+                ### Worker Names
+                - **worker-1** = Dotnet Skill Validator (empirical, outcome-focused)
+                - **worker-2** = Anthropic Skill Evaluator (prompt-design-focused)
+
+                ### Dispatch Pattern
+                1. **First dispatch**: Send the skill content to BOTH workers. Include the full SKILL.md content and eval.yaml if available.
+                2. **After both complete**: Compare their verdicts. Identify agreements (high confidence) and disagreements (requires judgment).
+                3. **Build consensus**: Produce a final report that explains which worker's suggestions are adopted, which are declined, and why.
+
+                ### Consensus Report Format
+                ```
+                ## Skill Validator Consensus Report: [Skill Name]
+
+                ### Summary
+                **Dotnet Verdict**: KEEP/IMPROVE/REMOVE (X/10)
+                **Anthropic Verdict**: KEEP/IMPROVE/REMOVE (X/10)
+                **Consensus**: KEEP / IMPROVE / REMOVE
+
+                ### Points of Agreement (High Confidence)
+                - [issues both evaluators flagged]
+
+                ### Points of Disagreement (Requires Judgment)
+                - [Dotnet says X, Anthropic says Y — adopted: Z because ...]
+
+                ### Adopted Suggestions
+                - [suggestions we are recommending, with rationale]
+
+                ### Declined Suggestions
+                - [suggestions we are NOT adopting, with rationale]
+
+                ### Final Recommendation
+                [1-2 sentence actionable summary]
+                ```
+
+                ### Rules
+                - Always explain WHY suggestions are adopted or declined
+                - Where evaluators disagree, explain the tradeoff and make a reasoned judgment
+                - Highlight suggestions both evaluators agree on as high-confidence improvements
+                - NEVER emit [[GROUP_REFLECT_COMPLETE]] until both evaluators have responded and a consensus report is produced
+                """,
+            MaxReflectIterations = 6,
+        },
     };
 }
 
diff --git a/PolyPilot/Services/CopilotService.Codespace.cs b/PolyPilot/Services/CopilotService.Codespace.cs
index 943535958e..69e5288a22 100644
--- a/PolyPilot/Services/CopilotService.Codespace.cs
+++ b/PolyPilot/Services/CopilotService.Codespace.cs
@@ -788,6 +788,7 @@ private async Task ResumeCodespaceSessionsAsync(SessionGroup group, Cancellation
                 state.Info.WorkingDirectory = codespaceWorkDir;
 
                 CancelProcessingWatchdog(state);
+                CancelToolHealthCheck(state);
                 var newState = new SessionState
                 {
                     Session = newSession,
diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs
index 6d3ee23701..bde7ecaf91 100644
--- a/PolyPilot/Services/CopilotService.Events.cs
+++ b/PolyPilot/Services/CopilotService.Events.cs
@@ -308,6 +308,9 @@ void Invoke(Action action)
                 if (toolStart.Data == null) break;
                 Interlocked.Increment(ref state.ActiveToolCallCount);
                 state.HasUsedToolsThisTurn = true; // volatile field — no explicit barrier needed
+                // Record tool start time and schedule health check
+                Interlocked.Exchange(ref state.ToolStartedAtTicks, DateTime.UtcNow.Ticks);
+                ScheduleToolHealthCheck(state, sessionName);
                 if (state.Info.ProcessingPhase < 3)
                 {
                     state.Info.ProcessingPhase = 3; // Working
@@ -362,6 +365,8 @@ void Invoke(Action action)
             case ToolExecutionCompleteEvent toolDone:
                 if (toolDone.Data == null) break;
                 Interlocked.Decrement(ref state.ActiveToolCallCount);
+                // Cancel the tool health check timer since tool completed normally
+                CancelToolHealthCheck(state);
                 Interlocked.Increment(ref state.Info._toolCallCount);
                 var completeCallId = toolDone.Data.ToolCallId ?? "";
                 var completeToolName = toolDone.Data?.GetType().GetProperty("ToolName")?.GetValue(toolDone.Data)?.ToString();
@@ -444,6 +449,7 @@ void Invoke(Action action)
             case AssistantTurnStartEvent:
                 // Cancel any pending TurnEnd→Idle fallback — another agent round is starting
                 CancelTurnEndFallback(state);
+                state.FallbackCanceledByTurnStart = true;
                 state.HasReceivedDeltasThisTurn = false;
                 var phaseAdvancedToThinking = state.Info.ProcessingPhase < 2;
                 if (phaseAdvancedToThinking) state.Info.ProcessingPhase = 2; // Thinking
@@ -464,7 +470,14 @@ void Invoke(Action action)
                 }
                 // Schedule a delayed CompleteResponse in case SessionIdleEvent never arrives (SDK bug #299).
                 // Cancelled by AssistantTurnStartEvent (another round starting) or SessionIdleEvent (normal path).
+                // If TurnStart previously canceled the fallback, this re-arms it — creating the
+                // self-healing loop: TurnEnd → TurnStart cancel → TurnEnd re-arm → fallback fires.
                 {
+                    if (state.FallbackCanceledByTurnStart)
+                    {
+                        Debug($"[TURNEND-FALLBACK] '{sessionName}' re-arming fallback timer (was canceled by TurnStart)");
+                        state.FallbackCanceledByTurnStart = false;
+                    }
                     var turnEndGen = Interlocked.Read(ref state.ProcessingGeneration);
                     var idleFallbackCts = new CancellationTokenSource();
                     // Capture token BEFORE publishing so CancelTurnEndFallback on another thread
@@ -651,6 +664,7 @@ await notifService.SendNotificationAsync(
                 var errMsg = Models.ErrorMessageHelper.HumanizeMessage(err.Data?.Message ?? "Unknown error");
                 CancelProcessingWatchdog(state);
                 CancelTurnEndFallback(state);
+                CancelToolHealthCheck(state);
                 Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
                 state.HasUsedToolsThisTurn = false;
                 Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
@@ -862,9 +876,12 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
         CancelProcessingWatchdog(state);
         // Also cancel any pending TurnEnd→Idle fallback — CompleteResponse is now executing
         CancelTurnEndFallback(state);
+        CancelToolHealthCheck(state);
         Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
         state.HasUsedToolsThisTurn = false;
+        state.FallbackCanceledByTurnStart = false;
         Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
+        Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
         state.Info.IsResumed = false; // Clear after first successful turn
         var response = state.CurrentResponse.ToString();
         if (!string.IsNullOrWhiteSpace(response))
@@ -1390,6 +1407,11 @@ private void HandleReflectionAdvanceResult(SessionState state, string response,
     /// <summary>If no SDK events arrive for this many seconds while a tool is actively executing, the session is considered stuck.
     /// This is much longer because legitimate tool executions (e.g., running UI tests, long builds) can take many minutes.</summary>
     internal const int WatchdogToolExecutionTimeoutSeconds = 600;
+    /// <summary>After the first Case A reset (tool running + server alive but no events), switch to this
+    /// shorter timeout for subsequent checks. This accelerates dead connection detection while still
+    /// allowing the first 600s for legitimate long-running tools. Total max stuck time becomes
+    /// 600s + (WatchdogMaxToolAliveResets × 60s) ≈ 12 minutes instead of 40 minutes.</summary>
+    internal const int WatchdogToolEscalationTimeoutSeconds = 60;
     // Sessions that USED tools but have none actively running — the model may be
     // thinking between tool rounds, but 600s is too long for a likely-dead session.
     internal const int WatchdogUsedToolsIdleTimeoutSeconds = 180;
@@ -1408,8 +1430,16 @@ private void HandleReflectionAdvanceResult(SessionState state, string response,
     /// session's transport-level connection is broken (ConnectionLostException). Without this cap,
     /// Case A resets LastEventAtTicks indefinitely, and ProcessingStartedAt resets on each app
     /// restart — so neither the inactivity nor the max-time safety net ever fires.
-    /// (3+1) resets × 600s effective timeout ≈ 40 minutes max of Case A resets.</summary>
-    internal const int WatchdogMaxToolAliveResets = 3;
+    /// With escalation timeout (60s after first reset), total max stuck time is:
+    /// 600s (first) + 2 × 60s (escalation) = 720s ≈ 12 minutes.</summary>
+    internal const int WatchdogMaxToolAliveResets = 2;
+
+    /// <summary>
+    /// Milliseconds after a tool starts to perform the first health check. If no events have
+    /// arrived since tool start, we verify the connection is still alive. This detects dead
+    /// connections within ~30s instead of waiting for the 600s watchdog timeout.
+    /// </summary>
+    internal const int ToolHealthCheckIntervalMs = 30_000;
 
     /// <summary>
     /// Milliseconds to wait after AssistantTurnEndEvent before firing CompleteResponse
@@ -1436,6 +1466,144 @@ private static void CancelProcessingWatchdog(SessionState state)
         }
     }
 
+    /// <summary>
+    /// Cancels and disposes any pending tool health check timer.
+    /// </summary>
+    private static void CancelToolHealthCheck(SessionState state)
+    {
+        var prev = Interlocked.Exchange(ref state.ToolHealthCheckTimer, null);
+        prev?.Dispose();
+    }
+
+    /// <summary>
+    /// Schedules a tool health check to run after ToolHealthCheckIntervalMs.
+    /// If the tool completes before the timer fires, the timer is cancelled.
+    /// If the timer fires and no events have arrived since tool start, we check
+    /// if the connection is still alive and trigger recovery if it's dead.
+    /// </summary>
+    private void ScheduleToolHealthCheck(SessionState state, string sessionName)
+    {
+        // Skip in demo/remote mode where we can't probe the local server
+        if (IsDemoMode || IsRemoteMode) return;
+
+        var checkGeneration = Interlocked.Read(ref state.ProcessingGeneration);
+        var timer = new Timer(_ =>
+        {
+            try
+            {
+                // Verify we're still on the same turn
+                if (Interlocked.Read(ref state.ProcessingGeneration) != checkGeneration) return;
+                if (!state.Info.IsProcessing) return;
+
+                var activeTools = Volatile.Read(ref state.ActiveToolCallCount);
+                if (activeTools <= 0) return; // Tool completed normally
+
+                // Check if any events arrived since tool start
+                var lastEventTicks = Interlocked.Read(ref state.LastEventAtTicks);
+                var toolStartTicks = Interlocked.Read(ref state.ToolStartedAtTicks);
+                var eventsSinceToolStart = lastEventTicks > toolStartTicks;
+
+                if (eventsSinceToolStart)
+                {
+                    // Events are still flowing - tool is legitimately running
+                    // Schedule another check
+                    Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
+                    Debug($"[TOOL-HEALTH] '{sessionName}' events flowing — rescheduling health check");
+                    ScheduleToolHealthCheck(state, sessionName);
+                    return;
+                }
+
+                // No events since tool start. Check if server is alive.
+                var serverAlive = _serverManager.IsServerRunning;
+                if (!serverAlive)
+                {
+                    Debug($"[TOOL-HEALTH] '{sessionName}' server is DEAD — triggering immediate recovery");
+                    TriggerToolHealthRecovery(state, sessionName, "server not responding");
+                    return;
+                }
+
+                // Server TCP port is alive, but no events for 30s. The connection might be dead.
+                // Increment the stale check counter and check if we should recover.
+                var staleChecks = Interlocked.Increment(ref state.ToolHealthStaleChecks);
+                if (staleChecks > WatchdogMaxToolAliveResets)
+                {
+                    Debug($"[TOOL-HEALTH] '{sessionName}' {staleChecks} stale health checks — assuming dead connection");
+                    TriggerToolHealthRecovery(state, sessionName, "no events after multiple health checks (connection likely dead)");
+                    return;
+                }
+
+                Debug($"[TOOL-HEALTH] '{sessionName}' no events for {ToolHealthCheckIntervalMs/1000}s, server alive — " +
+                      $"check {staleChecks}/{WatchdogMaxToolAliveResets}, scheduling another check");
+                ScheduleToolHealthCheck(state, sessionName);
+            }
+            catch (Exception ex)
+            {
+                Debug($"[TOOL-HEALTH] '{sessionName}' check failed: {ex.Message}");
+            }
+        }, null, Timeout.Infinite, Timeout.Infinite); // Don't start yet — store first to avoid race
+
+        var prev = Interlocked.Exchange(ref state.ToolHealthCheckTimer, timer);
+        prev?.Dispose();
+        timer.Change(ToolHealthCheckIntervalMs, Timeout.Infinite); // Now start
+    }
+
+    /// <summary>
+    /// Triggers recovery when tool health check detects a dead connection.
+    /// Clears the stuck processing state and notifies the user.
+    /// </summary>
+    private void TriggerToolHealthRecovery(SessionState state, string sessionName, string reason)
+    {
+        CancelToolHealthCheck(state);
+        CancelProcessingWatchdog(state);
+        CancelTurnEndFallback(state);
+
+        var activeTools = Volatile.Read(ref state.ActiveToolCallCount);
+        Debug($"[TOOL-HEALTH] '{sessionName}' triggering recovery: {reason} (activeTools={activeTools})");
+
+        InvokeOnUI(() =>
+        {
+            if (!state.Info.IsProcessing) return;
+
+            OnError?.Invoke(sessionName, $"Tool execution stuck ({reason}). Session recovered automatically.");
+
+            // Full cleanup mirroring CompleteResponse — missing fields here caused stuck sessions
+            Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
+            state.HasUsedToolsThisTurn = false;
+            state.FallbackCanceledByTurnStart = false;
+            Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
+            Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
+            Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
+
+            // Build full response: flushed mid-turn text + remaining current text
+            var response = state.CurrentResponse.ToString();
+            var fullResponse = state.FlushedResponse.Length > 0
+                ? (string.IsNullOrEmpty(response)
+                    ? state.FlushedResponse.ToString()
+                    : state.FlushedResponse + "\n\n" + response)
+                : response;
+
+            state.CurrentResponse.Clear();
+            state.FlushedResponse.Clear();
+            state.PendingReasoningMessages.Clear();
+
+            state.Info.IsProcessing = false;
+            state.Info.IsResumed = false;
+            Interlocked.Exchange(ref state.SendingFlag, 0);
+            state.Info.ProcessingStartedAt = null;
+            state.Info.ToolCallCount = 0;
+            state.Info.ProcessingPhase = 0;
+            state.Info.ClearPermissionDenials();
+
+            state.ResponseCompletion?.TrySetResult(fullResponse);
+
+            Debug($"[TOOL-HEALTH-COMPLETE] '{sessionName}' recovery finished (responseLen={fullResponse.Length})");
+
+            var summary = fullResponse.Length > 0 ? (fullResponse.Length > 100 ? fullResponse[..100] + "..." : fullResponse) : "";
+            OnSessionComplete?.Invoke(sessionName, summary);
+            OnStateChanged?.Invoke();
+        });
+    }
+
     /// <summary>
     /// Cancels and disposes any pending TurnEnd→Idle fallback timer on the state.
     /// Mirrors the CancelProcessingWatchdog pattern: Cancel + Dispose to avoid
@@ -1551,13 +1719,22 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
 
                 var useToolTimeout = hasActiveTool || (state.Info.IsResumed && !useResumeQuiescence);
                 var useUsedToolsTimeout = !useToolTimeout && hasUsedTools && !hasActiveTool;
+                
+                // After the first Case A reset (tool running + server alive but no events arrived),
+                // switch to the escalation timeout. This allows the first 600s for legitimate long-running
+                // tools, but speeds up dead connection detection on subsequent checks.
+                var caseAResets = Volatile.Read(ref state.WatchdogCaseAResets);
+                var useEscalationTimeout = useToolTimeout && caseAResets > 0;
+                
                 var effectiveTimeout = useResumeQuiescence
                     ? WatchdogResumeQuiescenceTimeoutSeconds
-                    : useToolTimeout
-                        ? WatchdogToolExecutionTimeoutSeconds
-                        : useUsedToolsTimeout
-                            ? WatchdogUsedToolsIdleTimeoutSeconds
-                            : WatchdogInactivityTimeoutSeconds;
+                    : useEscalationTimeout
+                        ? WatchdogToolEscalationTimeoutSeconds
+                        : useToolTimeout
+                            ? WatchdogToolExecutionTimeoutSeconds
+                            : useUsedToolsTimeout
+                                ? WatchdogUsedToolsIdleTimeoutSeconds
+                                : WatchdogInactivityTimeoutSeconds;
 
                 // Safety net: check absolute max processing time, but only if events have also
                 // gone stale. If events are still flowing (elapsed < effectiveTimeout), the session
@@ -1599,25 +1776,58 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
                     {
                         if (hasActiveTool && !IsDemoMode && !IsRemoteMode)
                         {
-                            // Case A: check server TCP port
+                            // Case A: check server TCP port + events.jsonl freshness
                             var serverAlive = _serverManager.IsServerRunning;
                             if (serverAlive)
                             {
-                                var resets = Interlocked.Increment(ref state.WatchdogCaseAResets);
-                                if (resets > WatchdogMaxToolAliveResets)
+                                // Check if the CLI is actively writing events for this session.
+                                // events.jsonl is written by the CLI process, not our app.
+                                // If recently modified → tool is actively running → wait indefinitely.
+                                // If stale → connection is likely dead → kill immediately.
+                                var eventsFileActive = false;
+                                try
                                 {
-                                    // Too many consecutive resets with no real SDK events — the
-                                    // session's JSON-RPC connection is likely dead even though the
-                                    // shared persistent server is still alive. Fall through to kill.
-                                    Debug($"[WATCHDOG] '{sessionName}' Case A reset cap exceeded ({resets}/{WatchdogMaxToolAliveResets}) " +
-                                          $"— killing despite server alive (elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
+                                    var sessionId = state.Info.SessionId;
+                                    if (!string.IsNullOrEmpty(sessionId))
+                                    {
+                                        var eventsPath = Path.Combine(SessionStatePath, sessionId, "events.jsonl");
+                                        if (File.Exists(eventsPath))
+                                        {
+                                            var lastWrite = File.GetLastWriteTimeUtc(eventsPath);
+                                            var fileAge = (DateTime.UtcNow - lastWrite).TotalSeconds;
+                                            eventsFileActive = fileAge < 60; // modified within last 60s
+                                        }
+                                    }
                                 }
-                                else
+                                catch { /* filesystem errors → fall through to reset-cap logic */ }
+
+                                if (eventsFileActive)
                                 {
-                                    Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " +
-                                          $"(reset #{resets}/{WatchdogMaxToolAliveResets}, timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)");
+                                    // Events file is fresh — tool is actively running. Wait indefinitely.
+                                    Debug($"[WATCHDOG] '{sessionName}' tool is running and events.jsonl is fresh — waiting indefinitely " +
+                                          $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
                                     Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
-                                    continue; // keep waiting — don't kill
+                                    Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); // reset counter since tool is active
+                                    continue;
+                                }
+                                else
+                                {
+                                    // Events file is stale or missing — connection is likely dead.
+                                    // Use the reset-cap as a safety buffer (1 more cycle to confirm).
+                                    var resets = Interlocked.Increment(ref state.WatchdogCaseAResets);
+                                    if (resets > 1) // Only need 1 confirmation cycle since we have file evidence
+                                    {
+                                        Debug($"[WATCHDOG] '{sessionName}' events.jsonl stale and reset count {resets} > 1 " +
+                                              $"— killing despite server alive (elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
+                                        // fall through to kill
+                                    }
+                                    else
+                                    {
+                                        Debug($"[WATCHDOG] '{sessionName}' events.jsonl stale but giving 1 more cycle " +
+                                              $"(reset #{resets}, elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
+                                        Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
+                                        continue;
+                                    }
                                 }
                             }
                             else
@@ -1673,6 +1883,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
                             return;
                         }
                         CancelProcessingWatchdog(state);
+                        CancelToolHealthCheck(state);
                         Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
                         state.HasUsedToolsThisTurn = false;
                         Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
@@ -1810,6 +2021,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN
             // Cancel old watchdog AND TurnEnd fallback BEFORE creating new state
             CancelProcessingWatchdog(state);
             CancelTurnEndFallback(state);
+            CancelToolHealthCheck(state);
 
             // Bug B fix: Cancel the old ResponseCompletion TCS so the original
             // SendPromptAsync awaiter doesn't hang forever.
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index 80daafb69e..86f313047e 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -1,4 +1,5 @@
 using System.Collections.Concurrent;
+using System.Diagnostics;
 using System.Text.Json;
 using System.Text.RegularExpressions;
 using PolyPilot.Models;
@@ -29,8 +30,11 @@ public partial class CopilotService
 {
     public event Action<string, OrchestratorPhase, string?>? OnOrchestratorPhaseChanged; // groupId, phase, detail
 
-    /// <summary>Maximum time a single worker is allowed to run before being cancelled.</summary>
-    private static readonly TimeSpan WorkerExecutionTimeout = TimeSpan.FromMinutes(10);
+    /// <summary>Maximum time a single worker is allowed to run before being cancelled.
+    /// Set high (60 min) because the smart watchdog (events.jsonl freshness) handles dead
+    /// session detection in ~90s. This is only an absolute backstop.</summary>
+    private static readonly TimeSpan WorkerExecutionTimeout = TimeSpan.FromMinutes(60);
+    private static readonly TimeSpan WorkerExecutionTimeoutRemote = TimeSpan.FromMinutes(10);
 
     // Per-session semaphores to prevent concurrent model switches during rapid dispatch
     private readonly ConcurrentDictionary<string, SemaphoreSlim> _modelSwitchLocks = new();
@@ -1151,8 +1155,14 @@ private async Task SendViaOrchestratorAsync(string groupId, List<string> members
         {
             InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.WaitingForWorkers, null));
 
-            var workerTasks = assignments.Select(a =>
-                ExecuteWorkerAsync(a.WorkerName, a.Task, prompt, cancellationToken));
+            Debug($"[DISPATCH] Staggering {assignments.Count} workers with 1s delay");
+            var workerTasks = new List<Task<WorkerResult>>();
+            foreach (var a in assignments)
+            {
+                workerTasks.Add(ExecuteWorkerAsync(a.WorkerName, a.Task, prompt, cancellationToken));
+                if (workerTasks.Count < assignments.Count)
+                    await Task.Delay(1000, cancellationToken);
+            }
             var results = await Task.WhenAll(workerTasks);
 
             // Phase 4: Synthesize — send worker results back to orchestrator
@@ -1360,43 +1370,54 @@ private async Task<WorkerResult> ExecuteWorkerAsync(string workerName, string ta
 
         var workerPrompt = $"{identity}{worktreeNote}\n\nYour response will be collected and synthesized with other workers' responses.\n\n{sharedPrefix}## Original User Request (context)\n{originalPrompt}\n\n## Your Assigned Task\n{task}";
 
-        try
+        const int maxRetries = 2;
+        for (int attempt = 1; attempt <= maxRetries; attempt++)
         {
-            Debug($"[DISPATCH] Worker '{workerName}' starting (prompt len={workerPrompt.Length})");
-            var response = await SendPromptAndWaitAsync(workerName, workerPrompt, cancellationToken, originalPrompt: originalPrompt);
-
-            // Worker revival: empty response means the session died (e.g., dead SSE stream
-            // after reconnect). Create a fresh session and retry once.
-            if (string.IsNullOrWhiteSpace(response))
+            try
             {
-                Debug($"[DISPATCH] Worker '{workerName}' returned empty — attempting fresh session revival");
-                if (_sessions.TryGetValue(workerName, out var deadState))
-                {
-                    try { await deadState.Session.DisposeAsync(); } catch { }
+                Debug($"[DISPATCH] Worker '{workerName}' starting (prompt len={workerPrompt.Length}, attempt={attempt})");
+                var response = await SendPromptAndWaitAsync(workerName, workerPrompt, cancellationToken, originalPrompt: originalPrompt);
 
-                    var workerMeta = GetSessionMeta(workerName);
-                    var client = GetClientForGroup(workerMeta?.GroupId);
-                    if (client != null)
+                // Worker revival: empty response means the session died (e.g., dead SSE stream
+                // after reconnect). Create a fresh session and retry once.
+                if (string.IsNullOrWhiteSpace(response))
+                {
+                    Debug($"[DISPATCH] Worker '{workerName}' returned empty — attempting fresh session revival");
+                    if (_sessions.TryGetValue(workerName, out var deadState))
                     {
-                        var freshConfig = BuildFreshSessionConfig(deadState);
-                        var freshSession = await client.CreateSessionAsync(freshConfig, cancellationToken);
-                        deadState.Info.SessionId = freshSession.SessionId;
-                        var freshState = new SessionState { Session = freshSession, Info = deadState.Info };
-                        _sessions[workerName] = freshState;
-                        Debug($"[DISPATCH] Worker '{workerName}' revived with fresh session '{freshSession.SessionId}'");
-                        response = await SendPromptAndWaitAsync(workerName, workerPrompt, cancellationToken, originalPrompt: originalPrompt);
+                        try { await deadState.Session.DisposeAsync(); } catch { }
+
+                        var workerMeta = GetSessionMeta(workerName);
+                        var client = GetClientForGroup(workerMeta?.GroupId);
+                        if (client != null)
+                        {
+                            var freshConfig = BuildFreshSessionConfig(deadState);
+                            var freshSession = await client.CreateSessionAsync(freshConfig, cancellationToken);
+                            deadState.Info.SessionId = freshSession.SessionId;
+                            var freshState = new SessionState { Session = freshSession, Info = deadState.Info };
+                            _sessions[workerName] = freshState;
+                            Debug($"[DISPATCH] Worker '{workerName}' revived with fresh session '{freshSession.SessionId}'");
+                            response = await SendPromptAndWaitAsync(workerName, workerPrompt, cancellationToken, originalPrompt: originalPrompt);
+                        }
                     }
                 }
-            }
 
-            Debug($"[DISPATCH] Worker '{workerName}' completed (response len={response?.Length ?? 0}, elapsed={sw.Elapsed.TotalSeconds:F1}s)");
-            return new WorkerResult(workerName, response, true, null, sw.Elapsed);
-        }
-        catch (Exception ex)
-        {
-            Debug($"[DISPATCH] Worker '{workerName}' FAILED: {ex.GetType().Name}: {ex.Message} (elapsed={sw.Elapsed.TotalSeconds:F1}s)");
-            return new WorkerResult(workerName, null, false, ex.Message, sw.Elapsed);
+                Debug($"[DISPATCH] Worker '{workerName}' completed (response len={response?.Length ?? 0}, elapsed={sw.Elapsed.TotalSeconds:F1}s)");
+                return new WorkerResult(workerName, response, true, null, sw.Elapsed);
+            }
+            catch (Exception ex) when (attempt < maxRetries && IsConnectionError(ex))
+            {
+                Debug($"[DISPATCH] Worker '{workerName}' attempt {attempt} failed with {ex.GetType().Name} — retrying in 2s");
+                await Task.Delay(2000, cancellationToken);
+                continue;
+            }
+            catch (Exception ex)
+            {
+                Debug($"[DISPATCH] Worker '{workerName}' FAILED: {ex.GetType().Name}: {ex.Message} (elapsed={sw.Elapsed.TotalSeconds:F1}s)");
+                return new WorkerResult(workerName, null, false, ex.Message, sw.Elapsed);
+            }
         }
+        throw new UnreachableException(); // for loop always returns or continues
     }
 
     private async Task<string> SendPromptAndWaitAsync(string sessionName, string prompt, CancellationToken cancellationToken, string? originalPrompt = null)
@@ -1405,7 +1426,7 @@ private async Task<string> SendPromptAndWaitAsync(string sessionName, string pro
         // Do NOT capture state and await its TCS separately: reconnection replaces the state
         // object, orphaning the old TCS and causing a 10-minute hang.
         using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
-        cts.CancelAfter(WorkerExecutionTimeout);
+        cts.CancelAfter(IsDemoMode || IsRemoteMode ? WorkerExecutionTimeoutRemote : WorkerExecutionTimeout);
         // Wire CTS to ResponseCompletion TCS so the 10-minute timeout actually cancels the await.
         // Must look up from _sessions dict (not captured ref) since reconnect replaces state.
         await using var ctsReg = cts.Token.Register(() =>
@@ -1641,6 +1662,69 @@ private async Task MonitorAndSynthesizeAsync(PendingOrchestration pending, Cance
             InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(pending.GroupId, OrchestratorPhase.Dispatching,
                 $"Re-dispatching {unstartedWorkers.Count} worker(s)"));
 
+            // BUG FIX: Clear stuck IsProcessing state on workers before re-dispatch.
+            // After an app restart, workers may have IsProcessing=true from a previous
+            // incomplete turn (e.g., tool.execution_start with no tool.execution_complete).
+            // This prevents SendPromptAsync from accepting new prompts. Clear it here
+            // so re-dispatch can actually send the prompt.
+            foreach (var workerName in unstartedWorkers)
+            {
+                if (_sessions.TryGetValue(workerName, out var workerState) && workerState.Info.IsProcessing)
+                {
+                    Debug($"[DISPATCH] Clearing stuck IsProcessing on '{workerName}' before re-dispatch");
+                    // Cancel timers first (thread-safe — use Interlocked internally)
+                    CancelProcessingWatchdog(workerState);
+                    CancelTurnEndFallback(workerState);
+                    CancelToolHealthCheck(workerState);
+
+                    // Must run on UI thread per INV-2; use TCS to synchronize
+                    var tcs = new TaskCompletionSource<bool>();
+                    InvokeOnUI(() =>
+                    {
+                        try
+                        {
+                            if (!workerState.Info.IsProcessing) { tcs.TrySetResult(true); return; }
+
+                            // Full cleanup mirroring CompleteResponse / tool-health recovery
+                            Interlocked.Exchange(ref workerState.ActiveToolCallCount, 0);
+                            Interlocked.Exchange(ref workerState.SendingFlag, 0);
+                            Interlocked.Exchange(ref workerState.SuccessfulToolCountThisTurn, 0);
+                            Interlocked.Exchange(ref workerState.WatchdogCaseAResets, 0);
+                            workerState.HasUsedToolsThisTurn = false;
+                            workerState.FallbackCanceledByTurnStart = false;
+                            workerState.Info.IsResumed = false;
+                            workerState.Info.ProcessingStartedAt = null;
+                            workerState.Info.ToolCallCount = 0;
+                            workerState.Info.ProcessingPhase = 0;
+                            workerState.Info.ClearPermissionDenials();
+
+                            var response = workerState.CurrentResponse.ToString();
+                            var fullResponse = workerState.FlushedResponse.Length > 0
+                                ? (string.IsNullOrEmpty(response)
+                                    ? workerState.FlushedResponse.ToString()
+                                    : workerState.FlushedResponse + "\n\n" + response)
+                                : response;
+
+                            workerState.CurrentResponse.Clear();
+                            workerState.FlushedResponse.Clear();
+                            workerState.PendingReasoningMessages.Clear();
+                            workerState.Info.IsProcessing = false;
+
+                            workerState.ResponseCompletion?.TrySetResult(fullResponse);
+                            var summary = fullResponse.Length > 0 ? (fullResponse.Length > 100 ? fullResponse[..100] + "..." : fullResponse) : "";
+                            OnSessionComplete?.Invoke(workerName, summary);
+                            OnStateChanged?.Invoke();
+                            tcs.TrySetResult(true);
+                        }
+                        catch (Exception ex)
+                        {
+                            tcs.TrySetException(ex);
+                        }
+                    });
+                    await tcs.Task;
+                }
+            }
+
             // Build a generic task from the original prompt for re-dispatched workers.
             // Materialize as an array so we can inspect individual task results even on partial failure.
             var redispatchTaskArray = unstartedWorkers
diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs
index dde17728ca..9bdb631d78 100644
--- a/PolyPilot/Services/CopilotService.cs
+++ b/PolyPilot/Services/CopilotService.cs
@@ -445,6 +445,18 @@ private class SessionState
         /// WatchdogMaxToolAliveResets to prevent infinite resets when the session's JSON-RPC
         /// connection is dead but the shared persistent server is still alive.</summary>
         public int WatchdogCaseAResets;
+        /// <summary>True if the TurnEnd→Idle fallback was canceled by an AssistantTurnStartEvent.
+        /// Used for diagnostic logging: when the next TurnEnd re-arms the fallback, the log shows
+        /// the self-healing loop in action (TurnEnd → TurnStart cancel → TurnEnd re-arm).</summary>
+        public volatile bool FallbackCanceledByTurnStart;
+        /// <summary>Timer that fires shortly after a tool starts to verify the connection is still alive.
+        /// If no tool completion event arrives within ToolHealthCheckIntervalMs, we do an active health
+        /// check to detect dead connections early (instead of waiting for the 600s watchdog timeout).</summary>
+        public Timer? ToolHealthCheckTimer;
+        public int ToolHealthStaleChecks; // Separate from WatchdogCaseAResets — health check's own stale counter
+        /// <summary>Timestamp when the most recent tool started. Used by the tool health check to
+        /// determine if a tool has been running too long without any events.</summary>
+        public long ToolStartedAtTicks;
     }
 
     private void Debug(string message)
@@ -834,6 +846,7 @@ public async Task ReconnectAsync(ConnectionSettings settings, CancellationToken
         {
             CancelProcessingWatchdog(state);
             CancelTurnEndFallback(state);
+            CancelToolHealthCheck(state);
             try { if (state.Session != null) await state.Session.DisposeAsync(); } catch { }
         }
         _sessions.Clear();
@@ -2413,6 +2426,7 @@ public async Task<string> SendPromptAsync(string sessionName, string prompt, Lis
         state.Info.ClearPermissionDenials();
         Interlocked.Exchange(ref state.ActiveToolCallCount, 0); // Reset stale tool count from previous turn
         state.HasUsedToolsThisTurn = false; // Reset stale tool flag from previous turn
+        state.FallbackCanceledByTurnStart = false;
         Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
         // Cancel any pending TurnEnd→Idle fallback from the previous turn
         CancelTurnEndFallback(state);
@@ -2624,6 +2638,7 @@ public async Task<string> SendPromptAsync(string sessionName, string prompt, Lis
                     // Cancel old watchdog AND TurnEnd fallback BEFORE creating new state — they share Info/TCS
                     CancelProcessingWatchdog(state);
                     CancelTurnEndFallback(state);
+                    CancelToolHealthCheck(state);
                     Debug($"[RECONNECT] '{sessionName}' replacing state (old handler will be orphaned, " +
                           $"old session disposed, new session={newSession.SessionId})");
                     var newState = new SessionState
@@ -2636,6 +2651,12 @@ public async Task<string> SendPromptAsync(string sessionName, string prompt, Lis
                     // orphaned old state can't pass generation checks on the new state.
                     Interlocked.Exchange(ref newState.ProcessingGeneration,
                         Interlocked.Read(ref state.ProcessingGeneration));
+                    // Reset tool tracking for the NEW connection. The old connection's
+                    // tool state is stale — no tools have run on this connection yet.
+                    // Without this, HasUsedToolsThisTurn=true from the dead connection
+                    // inflates the watchdog timeout from 120s to 600s, making stuck
+                    // sessions wait 5x longer than necessary to recover.
+                    newState.HasUsedToolsThisTurn = false;
                     Interlocked.Exchange(ref newState.ActiveToolCallCount, 0);
                     Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, 0);
                     newState.IsMultiAgentSession = state.IsMultiAgentSession;
@@ -2682,6 +2703,7 @@ public async Task<string> SendPromptAsync(string sessionName, string prompt, Lis
                     OnError?.Invoke(sessionName, $"Session disconnected and reconnect failed: {Models.ErrorMessageHelper.Humanize(retryEx)}");
                     CancelProcessingWatchdog(state);
                     CancelTurnEndFallback(state);
+                    CancelToolHealthCheck(state);
                     FlushCurrentResponse(state);
                     Debug($"[ERROR] '{sessionName}' reconnect+retry failed, clearing IsProcessing");
                     Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
@@ -2703,6 +2725,7 @@ public async Task<string> SendPromptAsync(string sessionName, string prompt, Lis
                 OnError?.Invoke(sessionName, $"SendAsync failed: {Models.ErrorMessageHelper.Humanize(ex)}");
                 CancelProcessingWatchdog(state);
                 CancelTurnEndFallback(state);
+                CancelToolHealthCheck(state);
                 FlushCurrentResponse(state);
                 Debug($"[ERROR] '{sessionName}' SendAsync failed, clearing IsProcessing (error={ex.Message})");
                 Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
@@ -2866,6 +2889,7 @@ public async Task AbortSessionAsync(string sessionName, bool markAsInterrupted =
         // Cancel any pending TurnEnd→Idle fallback so it doesn't fire CompleteResponse after abort
         CancelTurnEndFallback(state);
         CancelProcessingWatchdog(state);
+        CancelToolHealthCheck(state);
         state.FlushedResponse.Clear();
         state.PendingReasoningMessages.Clear();
         state.Info.ClearPermissionDenials(); // INV-1: clear on all termination paths
@@ -2955,6 +2979,7 @@ await InvokeOnUIAsync(() =>
                 OnError?.Invoke(sessionName, $"Soft steer failed: {Models.ErrorMessageHelper.Humanize(ex)}");
                 CancelProcessingWatchdog(state);
                 CancelTurnEndFallback(state);
+                CancelToolHealthCheck(state);
                 FlushCurrentResponse(state);
                 Debug($"[STEER-ERROR] '{sessionName}' soft steer SendAsync failed, clearing IsProcessing (error={ex.Message})");
                 Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
@@ -3408,6 +3433,7 @@ internal async Task<bool> CloseSessionCoreAsync(string name, bool notifyUi)
         // Cancel any pending timers so they don't fire on torn-down state after session removal
         CancelProcessingWatchdog(state);
         CancelTurnEndFallback(state);
+        CancelToolHealthCheck(state);
 
         // Dispose the SDK session AFTER UI has updated — DisposeAsync talks to the CLI
         // process and may trigger additional SDK events on background threads. Running it
@@ -3458,6 +3484,7 @@ public async ValueTask DisposeAsync()
         {
             CancelProcessingWatchdog(state);
             CancelTurnEndFallback(state);
+            CancelToolHealthCheck(state);
             if (state.Session is not null)
                 try { await state.Session.DisposeAsync(); } catch { }
         }