diff --git a/PolyPilot.Tests/ZombieSubagentExpiryTests.cs b/PolyPilot.Tests/ZombieSubagentExpiryTests.cs new file mode 100644 index 000000000..bbe065671 --- /dev/null +++ b/PolyPilot.Tests/ZombieSubagentExpiryTests.cs @@ -0,0 +1,234 @@ +using GitHub.Copilot.SDK; +using PolyPilot.Services; + +namespace PolyPilot.Tests; + +/// +/// Tests for the zombie subagent expiry mechanism in HasActiveBackgroundTasks. +/// +/// Problem: The Copilot CLI has no per-subagent timeout. A crashed or orphaned subagent +/// never fires SubagentCompleted/SubagentFailed, so IDLE-DEFER blocks session completion +/// indefinitely. PolyPilot tracks when IDLE-DEFER first started (as UTC ticks in +/// SubagentDeferStartedAtTicks) and expires the background agent block after +/// SubagentZombieTimeoutMinutes, allowing the session to complete normally. +/// See: issue #509 (expose CancelBackgroundTaskAsync via SDK). +/// +public class ZombieSubagentExpiryTests +{ + private static long TicksAgo(double minutes) => + DateTime.UtcNow.AddMinutes(-minutes).Ticks; + + private static SessionIdleEvent MakeIdleWithAgents(int agentCount = 1) + { + return new SessionIdleEvent + { + Data = new SessionIdleData + { + BackgroundTasks = new SessionIdleDataBackgroundTasks + { + Agents = Enumerable.Range(0, agentCount) + .Select(i => new SessionIdleDataBackgroundTasksAgentsItem + { + AgentId = $"agent-{i}", + AgentType = "copilot", + Description = "" + }).ToArray(), + Shells = Array.Empty() + } + } + }; + } + + private static SessionIdleEvent MakeIdleWithShells(int shellCount = 1) + { + return new SessionIdleEvent + { + Data = new SessionIdleData + { + BackgroundTasks = new SessionIdleDataBackgroundTasks + { + Agents = Array.Empty(), + Shells = Enumerable.Range(0, shellCount) + .Select(i => new SessionIdleDataBackgroundTasksShellsItem + { + ShellId = $"shell-{i}", + Description = "" + }).ToArray() + } + } + }; + } + + // --- Zero ticks (not set) — backward-compatible behavior --- + + [Fact] + public void ZeroTicks_ActiveAgent_ReturnsTrue() + { + // 0 means "not set" — behaviour is unchanged: any agent means "still running". + var idle = MakeIdleWithAgents(); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAtTicks: 0)); + } + + [Fact] + public void ZeroTicks_NoTasks_ReturnsFalse() + { + var idle = new SessionIdleEvent { Data = new SessionIdleData() }; + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAtTicks: 0)); + } + + // --- Fresh IDLE-DEFER (started recently — well within timeout) --- + + [Fact] + public void FreshDeferStart_ActiveAgent_ReturnsTrue() + { + var idle = MakeIdleWithAgents(); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(1))); + } + + [Fact] + public void DeferStartJustBelowThreshold_ReturnsTrue() + { + var idle = MakeIdleWithAgents(); + Assert.True(CopilotService.HasActiveBackgroundTasks( + idle, TicksAgo(CopilotService.SubagentZombieTimeoutMinutes - 1))); + } + + // --- Zombie threshold reached --- + + [Fact] + public void ZombieThresholdExceeded_SingleAgent_ReturnsFalse() + { + var idle = MakeIdleWithAgents(); + Assert.False(CopilotService.HasActiveBackgroundTasks( + idle, TicksAgo(CopilotService.SubagentZombieTimeoutMinutes + 1))); + } + + [Fact] + public void ZombieThresholdExceeded_MultipleAgents_ReturnsFalse() + { + // All 8 agents reported — none complete — reproduces the real incident + var idle = MakeIdleWithAgents(agentCount: 8); + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(42))); + } + + [Fact] + public void ZombieThresholdExactlyMet_ReturnsFalse() + { + // At exactly the threshold, the session is considered expired. + // TicksAgo(20) produces ticks > 20min ago (test executes in microseconds, not minutes), + // so elapsed will be just over 20min and the threshold check fires correctly. + var idle = MakeIdleWithAgents(); + Assert.False(CopilotService.HasActiveBackgroundTasks( + idle, TicksAgo(CopilotService.SubagentZombieTimeoutMinutes))); + } + + // --- Shells are never expired --- + + [Fact] + public void ZombieThresholdExceeded_WithShells_ReturnsTrue() + { + // Even if all background agents are expired, an active shell keeps IDLE-DEFER alive. + // Shells are managed at the OS level — PolyPilot never force-expires them. + var idle = new SessionIdleEvent + { + Data = new SessionIdleData + { + BackgroundTasks = new SessionIdleDataBackgroundTasks + { + Agents = new[] + { + new SessionIdleDataBackgroundTasksAgentsItem + { + AgentId = "zombie-agent", AgentType = "copilot", Description = "" + } + }, + Shells = new[] + { + new SessionIdleDataBackgroundTasksShellsItem + { + ShellId = "shell-1", Description = "npm test" + } + } + } + } + }; + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(30))); + } + + [Fact] + public void ZombieThresholdExceeded_ShellsOnly_ReturnsTrue() + { + // Shells alone always block completion — they are never zombie-expired. + var idle = MakeIdleWithShells(); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(60))); + } + + [Fact] + public void FreshDeferStart_ShellsOnly_ReturnsTrue() + { + var idle = MakeIdleWithShells(); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(1))); + } + + // --- Cross-turn stale timestamp: the critical lifecycle bug this PR fixes --- + + [Fact] + public void StaleDeferTimestamp_FromPriorTurn_NewTurnShouldNotExpireAgents() + { + // Scenario: SubagentDeferStartedAtTicks was NOT cleared (e.g. watchdog/abort path) + // after Turn N which had an IDLE-DEFER 25 minutes ago. Turn N+1 starts and its first + // IDLE-DEFER fires. The ??= logic would preserve the stale timestamp. + // + // This test documents WHY SubagentDeferStartedAtTicks MUST be reset in all paths that + // clear HasDeferredIdle, not just CompleteResponse. If a caller passes a 25-min-old + // timestamp for what is actually a brand-new IDLE-DEFER, zombie expiry fires immediately + // and kills live subagents. + // + // Fix: Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L) alongside + // every HasDeferredIdle = false in SendPromptAsync, AbortSessionAsync, error paths, etc. + // + // After the fix, SubagentDeferStartedAtTicks is reset at turn boundaries, so the + // ??= CompareExchange sets a fresh timestamp for the new turn, and zombie expiry + // is based on the new turn's actual elapsed time. + var idle = MakeIdleWithAgents(); + + // Simulate: stale ticks from 25 minutes ago NOT cleared at turn boundary + long staleTicks = TicksAgo(25); + + // Without the fix, this would return false (zombie expiry fires on fresh agents) + // The test ASSERTS false to document what the broken behavior looks like, + // and to verify that HasActiveBackgroundTasks correctly respects the ticks value. + // The real invariant is: callers MUST pass 0 (not stale ticks) for new turns. + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, staleTicks), + "HasActiveBackgroundTasks correctly expires based on elapsed ticks — " + + "the caller is responsible for resetting SubagentDeferStartedAtTicks at turn boundaries."); + + // The safe path: passing fresh ticks (new turn, new timestamp) should NOT expire agents + long freshTicks = TicksAgo(1); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, freshTicks), + "With fresh ticks (new turn), agents should NOT be expired — confirms the fix works."); + } + + // --- Backward compatibility with existing BackgroundTasksIdleTests --- + + [Fact] + public void BackwardCompat_NullBackgroundTasks_ReturnsFalse() + { + var idle = new SessionIdleEvent { Data = new SessionIdleData { BackgroundTasks = null } }; + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void BackwardCompat_WithAgents_ReturnsTrue() + { + var idle = MakeIdleWithAgents(); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void BackwardCompat_WithShells_ReturnsTrue() + { + var idle = MakeIdleWithShells(); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } +} diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 04d2b5453..d86220fd2 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -688,7 +688,27 @@ void Invoke(Action action) // KEY FIX: Check if the server reports active background tasks (sub-agents, shells). // session.idle with background tasks means "foreground quiesced, background still running." // Do NOT treat this as terminal — flush text and wait for the real idle. - if (HasActiveBackgroundTasks(idle)) + // Record when we first entered IDLE-DEFER for this turn (used for zombie expiry). + // CompareExchange(0 → now): sets only on the first IDLE-DEFER; subsequent ones + // for the same turn preserve the original timestamp so elapsed time is cumulative. + Interlocked.CompareExchange( + ref state.SubagentDeferStartedAtTicks, + DateTime.UtcNow.Ticks, + 0L); + + var deferTicks = Interlocked.Read(ref state.SubagentDeferStartedAtTicks); + var hasActiveTasks = HasActiveBackgroundTasks(idle, deferTicks); + + // Log zombie expiry here where Debug() is available (HasActiveBackgroundTasks is static) + if (!hasActiveTasks && deferTicks != 0 && (idle.Data?.BackgroundTasks?.Agents?.Length ?? 0) > 0) + { + var expiredMinutes = TimeSpan.FromTicks(DateTime.UtcNow.Ticks - deferTicks).TotalMinutes; + Debug($"[IDLE-DEFER-ZOMBIE] '{sessionName}' {idle.Data!.BackgroundTasks!.Agents!.Length} " + + $"background agent(s) expired after {expiredMinutes:F0}min " + + $"(threshold={SubagentZombieTimeoutMinutes}min) — allowing session to complete"); + } + + if (hasActiveTasks) { state.HasDeferredIdle = true; // Track for watchdog freshness window Debug($"[IDLE-DEFER] '{sessionName}' session.idle received with active background tasks — " + @@ -885,6 +905,7 @@ await notifService.SendNotificationAsync( Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); @@ -1240,6 +1261,7 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); state.IsReconnectedSend = false; // Clear reconnect flag on turn completion (defense-in-depth) state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); @@ -1989,6 +2011,7 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); @@ -2128,16 +2151,44 @@ internal static bool IsMcpError(string? text) return error.ToString(); } + /// + /// How long a session can be stuck in IDLE-DEFER (background agents reported but never completing) + /// before all background agents are treated as zombies and the session is allowed to complete. + /// The Copilot CLI has no per-agent timeout, so a crashed or orphaned subagent blocks + /// IDLE-DEFER indefinitely. After this threshold PolyPilot expires the stale block. + /// Shells are never expired — they are managed at the OS level. + /// + internal const int SubagentZombieTimeoutMinutes = 20; + /// /// Check if a SessionIdleEvent reports active background tasks (agents or shells). /// When background tasks are active, session.idle means "foreground quiesced, background /// still running" — NOT true completion. + /// + /// When is non-zero, background agents are treated + /// as zombies if the session has been in IDLE-DEFER longer than + /// . This allows the session to complete even if + /// the CLI never fires SubagentCompleted for a crashed or orphaned subagent. + /// The caller is responsible for logging the zombie expiry via Debug(). + /// Shells are never expired — their lifecycle is managed by the OS. /// - internal static bool HasActiveBackgroundTasks(SessionIdleEvent idle) + internal static bool HasActiveBackgroundTasks( + SessionIdleEvent idle, + long idleDeferStartedAtTicks = 0) { var bt = idle.Data?.BackgroundTasks; if (bt == null) return false; - return (bt.Agents is { Length: > 0 }) || (bt.Shells is { Length: > 0 }); + + bool hasAgents = bt.Agents is { Length: > 0 }; + + if (hasAgents && idleDeferStartedAtTicks != 0) + { + var elapsed = TimeSpan.FromTicks(DateTime.UtcNow.Ticks - idleDeferStartedAtTicks); + if (elapsed.TotalMinutes >= SubagentZombieTimeoutMinutes) + hasAgents = false; + } + + return hasAgents || (bt.Shells is { Length: > 0 }); } private void StartProcessingWatchdog(SessionState state, string sessionName) @@ -2563,6 +2614,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); @@ -2662,6 +2714,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); @@ -2718,6 +2771,7 @@ private void ClearProcessingStateForRecoveryFailure(SessionState state, string s state.Info.IsResumed = false; state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); @@ -2818,6 +2872,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN // Clear stale tool flag so watchdog uses normal timeout if resend is skipped newState.HasUsedToolsThisTurn = false; newState.HasDeferredIdle = false; + newState.SubagentDeferStartedAtTicks = 0L; // redundant (default), but explicit companion pair // Replace in sessions dictionary BEFORE registering event handler // so HandleSessionEvent's isCurrentState check passes for the new state. @@ -2856,6 +2911,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN state.Info.IsResumed = false; state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs index d439388fc..c4ecdffe8 100644 --- a/PolyPilot/Services/CopilotService.Organization.cs +++ b/PolyPilot/Services/CopilotService.Organization.cs @@ -2213,6 +2213,7 @@ private async Task ForceCompleteProcessingAsync(string sessionName, SessionState Interlocked.Exchange(ref state.WatchdogCaseBStaleCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); state.FallbackCanceledByTurnStart = false; state.Info.IsResumed = false; state.Info.ProcessingStartedAt = null; diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 95f39444f..f94ac1c84 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -695,6 +695,15 @@ private class SessionState /// session to restart processing on its own ("continued without me sending a message"). /// Cleared by SendPromptAsync at the start of each new turn. public volatile bool WasUserAborted; + /// + /// UTC ticks when IDLE-DEFER was first entered for the current turn (first + /// SessionIdleEvent with active background tasks). 0 = not set. + /// Uses Interlocked for thread safety: the IDLE-DEFER section (SDK event thread) sets via + /// CompareExchange(0 → now); CompleteResponse/SendPromptAsync (UI thread) clear via Exchange(0). + /// Matches the pattern of TurnEndReceivedAtTicks. MUST be cleared in every path that + /// clears HasDeferredIdle — the two fields are an inseparable companion pair. + /// + public long SubagentDeferStartedAtTicks; } private static void DisposePrematureIdleSignal(SessionState? state) @@ -3280,6 +3289,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Interlocked.Exchange(ref state.ActiveToolCallCount, 0); // Reset stale tool count from previous turn state.HasUsedToolsThisTurn = false; // Reset stale tool flag from previous turn state.HasDeferredIdle = false; // Reset deferred idle flag from previous turn + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); // Companion pair — must clear with HasDeferredIdle state.IsReconnectedSend = false; // Clear reconnect flag — new turn starts fresh (see watchdog reconnect timeout) state.WasUserAborted = false; // Clear abort flag — new turn starts fresh (re-enables EVT-REARM) state.PrematureIdleSignal.Reset(); // Clear premature idle detection from previous turn @@ -3571,6 +3581,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // Mirror primary reconnect: reset tool tracking for new connection siblingState.HasUsedToolsThisTurn = false; siblingState.HasDeferredIdle = false; + Interlocked.Exchange(ref siblingState.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref siblingState.ActiveToolCallCount, 0); Interlocked.Exchange(ref siblingState.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref siblingState.ToolHealthStaleChecks, 0); @@ -3764,6 +3775,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // sessions wait 5x longer than necessary to recover. newState.HasUsedToolsThisTurn = false; newState.HasDeferredIdle = false; + newState.SubagentDeferStartedAtTicks = 0L; // redundant (default), but explicit companion pair Interlocked.Exchange(ref newState.ActiveToolCallCount, 0); Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, 0); newState.IsMultiAgentSession = state.IsMultiAgentSession; @@ -3800,6 +3812,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // 120s watchdog tier instead of the inflated 600s from stale tool state. state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); // Schedule persistence of the new session ID so it survives app restart. // Without this, the debounced save captures the pre-reconnect snapshot @@ -3872,6 +3885,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); state.Info.IsResumed = false; state.Info.IsProcessing = false; @@ -3895,6 +3909,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); state.Info.IsResumed = false; state.Info.IsProcessing = false; @@ -4048,6 +4063,7 @@ public async Task AbortSessionAsync(string sessionName, bool markAsInterrupted = Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); // INV-1: companion pair with HasDeferredIdle state.IsReconnectedSend = false; // INV-1: clear all per-turn flags on abort Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); // Release send lock — allows a subsequent SteerSessionAsync to acquire it immediately @@ -4174,6 +4190,7 @@ await InvokeOnUIAsync(() => Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); state.Info.IsResumed = false; state.IsReconnectedSend = false; // INV-1 item 8: prevent stale 35s timeout on next watchdog start @@ -4462,6 +4479,7 @@ await InvokeOnUIAsync(() => state.Info.IsResumed = false; state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); Interlocked.Exchange(ref state.SendingFlag, 0); state.Info.ProcessingStartedAt = null;