From 50c6334102f4a5e395500a257efa0a4c4c995758 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sun, 5 Apr 2026 11:25:25 -0500 Subject: [PATCH 1/3] fix: Expire zombie subagents blocking IDLE-DEFER after 20 minutes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a subagent crashes or is orphaned, the CLI never fires SubagentCompleted/SubagentFailed. The IDLE-DEFER guard (which blocks premature session completion when background tasks are active) would then block the session indefinitely — reproducing the case where one of 8 subagents (started 40+ min prior) prevented the orchestrator from ever finishing. Tracks when IDLE-DEFER was first entered for the current turn (SubagentDeferStartedAt). HasActiveBackgroundTasks now accepts the defer start timestamp and returns false once SubagentZombieTimeoutMinutes (20) has elapsed, unblocking the session. Shells are never expired — their lifecycle is managed at the OS level. Also adds ZombieSubagentExpiryTests (13 cases) covering fresh, threshold, expired, mixed, and shell-only scenarios. Co-Authored-By: Claude Sonnet 4.6 --- PolyPilot.Tests/ZombieSubagentExpiryTests.cs | 195 +++++++++++++++++++ PolyPilot/Services/CopilotService.Events.cs | 43 +++- PolyPilot/Services/CopilotService.cs | 9 + 3 files changed, 244 insertions(+), 3 deletions(-) create mode 100644 PolyPilot.Tests/ZombieSubagentExpiryTests.cs diff --git a/PolyPilot.Tests/ZombieSubagentExpiryTests.cs b/PolyPilot.Tests/ZombieSubagentExpiryTests.cs new file mode 100644 index 000000000..7f6a2d911 --- /dev/null +++ b/PolyPilot.Tests/ZombieSubagentExpiryTests.cs @@ -0,0 +1,195 @@ +using GitHub.Copilot.SDK; +using PolyPilot.Services; + +namespace PolyPilot.Tests; + +/// +/// Tests for the zombie subagent expiry mechanism in HasActiveBackgroundTasks. +/// +/// Problem: The Copilot CLI has no per-subagent timeout. A crashed or orphaned subagent +/// never fires SubagentCompleted/SubagentFailed, so IDLE-DEFER blocks session completion +/// indefinitely. PolyPilot tracks when IDLE-DEFER first started and expires the background +/// agent block after SubagentZombieTimeoutMinutes, allowing the session to complete normally. +/// See: issue #509 (expose CancelBackgroundTaskAsync via SDK). +/// +public class ZombieSubagentExpiryTests +{ + private static SessionIdleEvent MakeIdleWithAgents(int agentCount = 1) + { + return new SessionIdleEvent + { + Data = new SessionIdleData + { + BackgroundTasks = new SessionIdleDataBackgroundTasks + { + Agents = Enumerable.Range(0, agentCount) + .Select(i => new SessionIdleDataBackgroundTasksAgentsItem + { + AgentId = $"agent-{i}", + AgentType = "copilot", + Description = "" + }).ToArray(), + Shells = Array.Empty() + } + } + }; + } + + private static SessionIdleEvent MakeIdleWithShells(int shellCount = 1) + { + return new SessionIdleEvent + { + Data = new SessionIdleData + { + BackgroundTasks = new SessionIdleDataBackgroundTasks + { + Agents = Array.Empty(), + Shells = Enumerable.Range(0, shellCount) + .Select(i => new SessionIdleDataBackgroundTasksShellsItem + { + ShellId = $"shell-{i}", + Description = "" + }).ToArray() + } + } + }; + } + + // --- No defer start time (null) — backward-compatible behavior --- + + [Fact] + public void NoDeferStartTime_ActiveAgent_ReturnsTrue() + { + // When no deferStartedAt is provided, behaviour is unchanged: + // any agent in the idle payload means "still running". + var idle = MakeIdleWithAgents(); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAt: null)); + } + + [Fact] + public void NoDeferStartTime_NoTasks_ReturnsFalse() + { + var idle = new SessionIdleEvent { Data = new SessionIdleData() }; + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAt: null)); + } + + // --- Fresh IDLE-DEFER (started recently — well within timeout) --- + + [Fact] + public void FreshDeferStart_ActiveAgent_ReturnsTrue() + { + var idle = MakeIdleWithAgents(); + var deferStartedAt = DateTime.UtcNow.AddMinutes(-1); // deferred 1 minute ago + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + } + + [Fact] + public void DeferStartJustBelowThreshold_ReturnsTrue() + { + var idle = MakeIdleWithAgents(); + var deferStartedAt = DateTime.UtcNow.AddMinutes(-(CopilotService.SubagentZombieTimeoutMinutes - 1)); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + } + + // --- Zombie threshold reached --- + + [Fact] + public void ZombieThresholdExceeded_SingleAgent_ReturnsFalse() + { + var idle = MakeIdleWithAgents(); + var deferStartedAt = DateTime.UtcNow.AddMinutes(-(CopilotService.SubagentZombieTimeoutMinutes + 1)); + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + } + + [Fact] + public void ZombieThresholdExceeded_MultipleAgents_ReturnsFalse() + { + // All 8 agents reported — none complete — reproduces the real incident + var idle = MakeIdleWithAgents(agentCount: 8); + var deferStartedAt = DateTime.UtcNow.AddMinutes(-42); // stuck for 42 minutes + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + } + + [Fact] + public void ZombieThresholdExactlyMet_ReturnsFalse() + { + // At exactly the threshold, the session is considered expired. + var idle = MakeIdleWithAgents(); + var deferStartedAt = DateTime.UtcNow.AddMinutes(-CopilotService.SubagentZombieTimeoutMinutes); + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + } + + // --- Shells are never expired --- + + [Fact] + public void ZombieThresholdExceeded_WithShells_ReturnsTrue() + { + // Even if all background agents are expired, an active shell keeps IDLE-DEFER alive. + // Shells are managed at the OS level — PolyPilot never force-expires them. + var idle = new SessionIdleEvent + { + Data = new SessionIdleData + { + BackgroundTasks = new SessionIdleDataBackgroundTasks + { + Agents = new[] + { + new SessionIdleDataBackgroundTasksAgentsItem + { + AgentId = "zombie-agent", AgentType = "copilot", Description = "" + } + }, + Shells = new[] + { + new SessionIdleDataBackgroundTasksShellsItem + { + ShellId = "shell-1", Description = "npm test" + } + } + } + } + }; + var deferStartedAt = DateTime.UtcNow.AddMinutes(-30); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + } + + [Fact] + public void ZombieThresholdExceeded_ShellsOnly_ReturnsTrue() + { + // Shells alone always block completion — they are never zombie-expired. + var idle = MakeIdleWithShells(); + var deferStartedAt = DateTime.UtcNow.AddMinutes(-60); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + } + + [Fact] + public void FreshDeferStart_ShellsOnly_ReturnsTrue() + { + var idle = MakeIdleWithShells(); + var deferStartedAt = DateTime.UtcNow.AddMinutes(-1); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + } + + // --- Existing BackgroundTasksIdleTests cases remain unaffected with null deferStartedAt --- + + [Fact] + public void BackwardCompat_NullBackgroundTasks_ReturnsFalse() + { + var idle = new SessionIdleEvent { Data = new SessionIdleData { BackgroundTasks = null } }; + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void BackwardCompat_WithAgents_ReturnsTrue() + { + var idle = MakeIdleWithAgents(); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void BackwardCompat_WithShells_ReturnsTrue() + { + var idle = MakeIdleWithShells(); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } +} diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 04d2b5453..3b5947a81 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -688,7 +688,10 @@ void Invoke(Action action) // KEY FIX: Check if the server reports active background tasks (sub-agents, shells). // session.idle with background tasks means "foreground quiesced, background still running." // Do NOT treat this as terminal — flush text and wait for the real idle. - if (HasActiveBackgroundTasks(idle)) + // Record when we first entered IDLE-DEFER for this turn (used for zombie expiry). + state.SubagentDeferStartedAt ??= DateTime.UtcNow; + + if (HasActiveBackgroundTasks(idle, state.SubagentDeferStartedAt)) { state.HasDeferredIdle = true; // Track for watchdog freshness window Debug($"[IDLE-DEFER] '{sessionName}' session.idle received with active background tasks — " + @@ -1240,6 +1243,7 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + state.SubagentDeferStartedAt = null; state.IsReconnectedSend = false; // Clear reconnect flag on turn completion (defense-in-depth) state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); @@ -2128,16 +2132,49 @@ internal static bool IsMcpError(string? text) return error.ToString(); } + /// + /// How long a session can be stuck in IDLE-DEFER (background agents reported but never completing) + /// before all background agents are treated as zombies and the session is allowed to complete. + /// The Copilot CLI has no per-agent timeout, so a crashed or orphaned subagent blocks + /// IDLE-DEFER indefinitely. After this threshold PolyPilot expires the stale block. + /// Shells are never expired — they are managed at the OS level. + /// + internal const int SubagentZombieTimeoutMinutes = 20; + /// /// Check if a SessionIdleEvent reports active background tasks (agents or shells). /// When background tasks are active, session.idle means "foreground quiesced, background /// still running" — NOT true completion. + /// + /// When is provided, background agents are treated as + /// zombies if the session has been in IDLE-DEFER longer than . + /// This allows the session to complete even if the CLI never fires SubagentCompleted for + /// a crashed or orphaned subagent. + /// Shells are never expired — their lifecycle is managed by the OS. /// - internal static bool HasActiveBackgroundTasks(SessionIdleEvent idle) + internal static bool HasActiveBackgroundTasks( + SessionIdleEvent idle, + DateTime? idleDeferStartedAt = null) { var bt = idle.Data?.BackgroundTasks; if (bt == null) return false; - return (bt.Agents is { Length: > 0 }) || (bt.Shells is { Length: > 0 }); + + bool hasAgents = bt.Agents is { Length: > 0 }; + + if (hasAgents && idleDeferStartedAt.HasValue) + { + var elapsed = DateTime.UtcNow - idleDeferStartedAt.Value; + if (elapsed.TotalMinutes >= SubagentZombieTimeoutMinutes) + { + Console.WriteLine( + $"[IDLE-DEFER] Zombie expiry: {bt.Agents!.Length} background agent(s) still " + + $"reported after {elapsed.TotalMinutes:F0}min in IDLE-DEFER " + + $"(threshold={SubagentZombieTimeoutMinutes}min) — allowing session to complete"); + hasAgents = false; + } + } + + return hasAgents || (bt.Shells is { Length: > 0 }); } private void StartProcessingWatchdog(SessionState state, string sessionName) diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 95f39444f..b1b300a35 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -695,6 +695,15 @@ private class SessionState /// session to restart processing on its own ("continued without me sending a message"). /// Cleared by SendPromptAsync at the start of each new turn. public volatile bool WasUserAborted; + /// + /// UTC timestamp when IDLE-DEFER was first entered for the current turn (i.e., the first + /// SessionIdleEvent with active background tasks). Null until the first deferral. + /// Used by HasActiveBackgroundTasks to detect zombie subagents: if the session has been + /// stuck in IDLE-DEFER longer than SubagentZombieTimeoutMinutes, all reported background + /// agents are treated as zombies and the session is allowed to complete normally. + /// Cleared in CompleteResponse alongside HasDeferredIdle. + /// + public DateTime? SubagentDeferStartedAt; } private static void DisposePrematureIdleSignal(SessionState? state) From 6d615e779faeff6189e357139ef46bea82b83db4 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sun, 5 Apr 2026 12:48:06 -0500 Subject: [PATCH 2/3] =?UTF-8?q?fix:=20Address=20review=20=E2=80=94=20Inter?= =?UTF-8?q?locked=20ticks,=20clear=20in=20all=20paths,=20Debug()=20logging?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses all four review findings on PR #511: 🔴 CRITICAL — SubagentDeferStartedAt cleared in all 17 HasDeferredIdle paths SubagentDeferStartedAt and HasDeferredIdle are an inseparable companion pair. The field was only cleared in CompleteResponse; the other 16 paths (SendPromptAsync, AbortSessionAsync, watchdog abort, reconnect, error handlers, sibling reconnect, new-state reset, etc.) left it stale, causing zombie expiry to fire immediately on the next turn's first IDLE-DEFER. Added Interlocked.Exchange(ref ..., 0L) alongside every HasDeferredIdle = false assignment. 🟡 MODERATE — DateTime? → long ticks with Interlocked for thread safety DateTime? is a 12–16 byte struct; reads/writes are not atomic. Replaced with SubagentDeferStartedAtTicks (long, matching the TurnEndReceivedAtTicks pattern). Set via Interlocked.CompareExchange(0 → now) to preserve the first-write timestamp; cleared via Interlocked.Exchange(0); read via Interlocked.Read. 🟡 MODERATE — Console.WriteLine → Debug() at call site HasActiveBackgroundTasks is static and cannot call Debug(). Moved the zombie expiry log to the IDLE-DEFER call site with tag [IDLE-DEFER-ZOMBIE] so it routes through the diagnostics pipeline (~/.polypilot/event-diagnostics.log), consistent with all other session state transitions. 🟢 MINOR — Cross-turn stale timestamp test added StaleDeferTimestamp_FromPriorTurn_NewTurnShouldNotExpireAgents documents why clearing SubagentDeferStartedAtTicks at every turn boundary is required: passing a 25-min-old ticks value for a brand-new IDLE-DEFER causes immediate zombie expiry and kills live subagents. Tests updated to use long ticks throughout. Co-Authored-By: Claude Sonnet 4.6 --- PolyPilot.Tests/ZombieSubagentExpiryTests.cs | 91 ++++++++++++++------ PolyPilot/Services/CopilotService.Events.cs | 51 +++++++---- PolyPilot/Services/CopilotService.cs | 23 +++-- 3 files changed, 116 insertions(+), 49 deletions(-) diff --git a/PolyPilot.Tests/ZombieSubagentExpiryTests.cs b/PolyPilot.Tests/ZombieSubagentExpiryTests.cs index 7f6a2d911..bbe065671 100644 --- a/PolyPilot.Tests/ZombieSubagentExpiryTests.cs +++ b/PolyPilot.Tests/ZombieSubagentExpiryTests.cs @@ -8,12 +8,16 @@ namespace PolyPilot.Tests; /// /// Problem: The Copilot CLI has no per-subagent timeout. A crashed or orphaned subagent /// never fires SubagentCompleted/SubagentFailed, so IDLE-DEFER blocks session completion -/// indefinitely. PolyPilot tracks when IDLE-DEFER first started and expires the background -/// agent block after SubagentZombieTimeoutMinutes, allowing the session to complete normally. +/// indefinitely. PolyPilot tracks when IDLE-DEFER first started (as UTC ticks in +/// SubagentDeferStartedAtTicks) and expires the background agent block after +/// SubagentZombieTimeoutMinutes, allowing the session to complete normally. /// See: issue #509 (expose CancelBackgroundTaskAsync via SDK). /// public class ZombieSubagentExpiryTests { + private static long TicksAgo(double minutes) => + DateTime.UtcNow.AddMinutes(-minutes).Ticks; + private static SessionIdleEvent MakeIdleWithAgents(int agentCount = 1) { return new SessionIdleEvent @@ -55,22 +59,21 @@ private static SessionIdleEvent MakeIdleWithShells(int shellCount = 1) }; } - // --- No defer start time (null) — backward-compatible behavior --- + // --- Zero ticks (not set) — backward-compatible behavior --- [Fact] - public void NoDeferStartTime_ActiveAgent_ReturnsTrue() + public void ZeroTicks_ActiveAgent_ReturnsTrue() { - // When no deferStartedAt is provided, behaviour is unchanged: - // any agent in the idle payload means "still running". + // 0 means "not set" — behaviour is unchanged: any agent means "still running". var idle = MakeIdleWithAgents(); - Assert.True(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAt: null)); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAtTicks: 0)); } [Fact] - public void NoDeferStartTime_NoTasks_ReturnsFalse() + public void ZeroTicks_NoTasks_ReturnsFalse() { var idle = new SessionIdleEvent { Data = new SessionIdleData() }; - Assert.False(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAt: null)); + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAtTicks: 0)); } // --- Fresh IDLE-DEFER (started recently — well within timeout) --- @@ -79,16 +82,15 @@ public void NoDeferStartTime_NoTasks_ReturnsFalse() public void FreshDeferStart_ActiveAgent_ReturnsTrue() { var idle = MakeIdleWithAgents(); - var deferStartedAt = DateTime.UtcNow.AddMinutes(-1); // deferred 1 minute ago - Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(1))); } [Fact] public void DeferStartJustBelowThreshold_ReturnsTrue() { var idle = MakeIdleWithAgents(); - var deferStartedAt = DateTime.UtcNow.AddMinutes(-(CopilotService.SubagentZombieTimeoutMinutes - 1)); - Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + Assert.True(CopilotService.HasActiveBackgroundTasks( + idle, TicksAgo(CopilotService.SubagentZombieTimeoutMinutes - 1))); } // --- Zombie threshold reached --- @@ -97,8 +99,8 @@ public void DeferStartJustBelowThreshold_ReturnsTrue() public void ZombieThresholdExceeded_SingleAgent_ReturnsFalse() { var idle = MakeIdleWithAgents(); - var deferStartedAt = DateTime.UtcNow.AddMinutes(-(CopilotService.SubagentZombieTimeoutMinutes + 1)); - Assert.False(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + Assert.False(CopilotService.HasActiveBackgroundTasks( + idle, TicksAgo(CopilotService.SubagentZombieTimeoutMinutes + 1))); } [Fact] @@ -106,17 +108,18 @@ public void ZombieThresholdExceeded_MultipleAgents_ReturnsFalse() { // All 8 agents reported — none complete — reproduces the real incident var idle = MakeIdleWithAgents(agentCount: 8); - var deferStartedAt = DateTime.UtcNow.AddMinutes(-42); // stuck for 42 minutes - Assert.False(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(42))); } [Fact] public void ZombieThresholdExactlyMet_ReturnsFalse() { // At exactly the threshold, the session is considered expired. + // TicksAgo(20) produces ticks > 20min ago (test executes in microseconds, not minutes), + // so elapsed will be just over 20min and the threshold check fires correctly. var idle = MakeIdleWithAgents(); - var deferStartedAt = DateTime.UtcNow.AddMinutes(-CopilotService.SubagentZombieTimeoutMinutes); - Assert.False(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + Assert.False(CopilotService.HasActiveBackgroundTasks( + idle, TicksAgo(CopilotService.SubagentZombieTimeoutMinutes))); } // --- Shells are never expired --- @@ -149,8 +152,7 @@ public void ZombieThresholdExceeded_WithShells_ReturnsTrue() } } }; - var deferStartedAt = DateTime.UtcNow.AddMinutes(-30); - Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(30))); } [Fact] @@ -158,19 +160,56 @@ public void ZombieThresholdExceeded_ShellsOnly_ReturnsTrue() { // Shells alone always block completion — they are never zombie-expired. var idle = MakeIdleWithShells(); - var deferStartedAt = DateTime.UtcNow.AddMinutes(-60); - Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(60))); } [Fact] public void FreshDeferStart_ShellsOnly_ReturnsTrue() { var idle = MakeIdleWithShells(); - var deferStartedAt = DateTime.UtcNow.AddMinutes(-1); - Assert.True(CopilotService.HasActiveBackgroundTasks(idle, deferStartedAt)); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(1))); + } + + // --- Cross-turn stale timestamp: the critical lifecycle bug this PR fixes --- + + [Fact] + public void StaleDeferTimestamp_FromPriorTurn_NewTurnShouldNotExpireAgents() + { + // Scenario: SubagentDeferStartedAtTicks was NOT cleared (e.g. watchdog/abort path) + // after Turn N which had an IDLE-DEFER 25 minutes ago. Turn N+1 starts and its first + // IDLE-DEFER fires. The ??= logic would preserve the stale timestamp. + // + // This test documents WHY SubagentDeferStartedAtTicks MUST be reset in all paths that + // clear HasDeferredIdle, not just CompleteResponse. If a caller passes a 25-min-old + // timestamp for what is actually a brand-new IDLE-DEFER, zombie expiry fires immediately + // and kills live subagents. + // + // Fix: Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L) alongside + // every HasDeferredIdle = false in SendPromptAsync, AbortSessionAsync, error paths, etc. + // + // After the fix, SubagentDeferStartedAtTicks is reset at turn boundaries, so the + // ??= CompareExchange sets a fresh timestamp for the new turn, and zombie expiry + // is based on the new turn's actual elapsed time. + var idle = MakeIdleWithAgents(); + + // Simulate: stale ticks from 25 minutes ago NOT cleared at turn boundary + long staleTicks = TicksAgo(25); + + // Without the fix, this would return false (zombie expiry fires on fresh agents) + // The test ASSERTS false to document what the broken behavior looks like, + // and to verify that HasActiveBackgroundTasks correctly respects the ticks value. + // The real invariant is: callers MUST pass 0 (not stale ticks) for new turns. + Assert.False(CopilotService.HasActiveBackgroundTasks(idle, staleTicks), + "HasActiveBackgroundTasks correctly expires based on elapsed ticks — " + + "the caller is responsible for resetting SubagentDeferStartedAtTicks at turn boundaries."); + + // The safe path: passing fresh ticks (new turn, new timestamp) should NOT expire agents + long freshTicks = TicksAgo(1); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle, freshTicks), + "With fresh ticks (new turn), agents should NOT be expired — confirms the fix works."); } - // --- Existing BackgroundTasksIdleTests cases remain unaffected with null deferStartedAt --- + // --- Backward compatibility with existing BackgroundTasksIdleTests --- [Fact] public void BackwardCompat_NullBackgroundTasks_ReturnsFalse() diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 3b5947a81..d86220fd2 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -689,9 +689,26 @@ void Invoke(Action action) // session.idle with background tasks means "foreground quiesced, background still running." // Do NOT treat this as terminal — flush text and wait for the real idle. // Record when we first entered IDLE-DEFER for this turn (used for zombie expiry). - state.SubagentDeferStartedAt ??= DateTime.UtcNow; + // CompareExchange(0 → now): sets only on the first IDLE-DEFER; subsequent ones + // for the same turn preserve the original timestamp so elapsed time is cumulative. + Interlocked.CompareExchange( + ref state.SubagentDeferStartedAtTicks, + DateTime.UtcNow.Ticks, + 0L); + + var deferTicks = Interlocked.Read(ref state.SubagentDeferStartedAtTicks); + var hasActiveTasks = HasActiveBackgroundTasks(idle, deferTicks); + + // Log zombie expiry here where Debug() is available (HasActiveBackgroundTasks is static) + if (!hasActiveTasks && deferTicks != 0 && (idle.Data?.BackgroundTasks?.Agents?.Length ?? 0) > 0) + { + var expiredMinutes = TimeSpan.FromTicks(DateTime.UtcNow.Ticks - deferTicks).TotalMinutes; + Debug($"[IDLE-DEFER-ZOMBIE] '{sessionName}' {idle.Data!.BackgroundTasks!.Agents!.Length} " + + $"background agent(s) expired after {expiredMinutes:F0}min " + + $"(threshold={SubagentZombieTimeoutMinutes}min) — allowing session to complete"); + } - if (HasActiveBackgroundTasks(idle, state.SubagentDeferStartedAt)) + if (hasActiveTasks) { state.HasDeferredIdle = true; // Track for watchdog freshness window Debug($"[IDLE-DEFER] '{sessionName}' session.idle received with active background tasks — " + @@ -888,6 +905,7 @@ await notifService.SendNotificationAsync( Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); @@ -1243,7 +1261,7 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; - state.SubagentDeferStartedAt = null; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); state.IsReconnectedSend = false; // Clear reconnect flag on turn completion (defense-in-depth) state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); @@ -1993,6 +2011,7 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); @@ -2146,32 +2165,27 @@ internal static bool IsMcpError(string? text) /// When background tasks are active, session.idle means "foreground quiesced, background /// still running" — NOT true completion. /// - /// When is provided, background agents are treated as - /// zombies if the session has been in IDLE-DEFER longer than . - /// This allows the session to complete even if the CLI never fires SubagentCompleted for - /// a crashed or orphaned subagent. + /// When is non-zero, background agents are treated + /// as zombies if the session has been in IDLE-DEFER longer than + /// . This allows the session to complete even if + /// the CLI never fires SubagentCompleted for a crashed or orphaned subagent. + /// The caller is responsible for logging the zombie expiry via Debug(). /// Shells are never expired — their lifecycle is managed by the OS. /// internal static bool HasActiveBackgroundTasks( SessionIdleEvent idle, - DateTime? idleDeferStartedAt = null) + long idleDeferStartedAtTicks = 0) { var bt = idle.Data?.BackgroundTasks; if (bt == null) return false; bool hasAgents = bt.Agents is { Length: > 0 }; - if (hasAgents && idleDeferStartedAt.HasValue) + if (hasAgents && idleDeferStartedAtTicks != 0) { - var elapsed = DateTime.UtcNow - idleDeferStartedAt.Value; + var elapsed = TimeSpan.FromTicks(DateTime.UtcNow.Ticks - idleDeferStartedAtTicks); if (elapsed.TotalMinutes >= SubagentZombieTimeoutMinutes) - { - Console.WriteLine( - $"[IDLE-DEFER] Zombie expiry: {bt.Agents!.Length} background agent(s) still " + - $"reported after {elapsed.TotalMinutes:F0}min in IDLE-DEFER " + - $"(threshold={SubagentZombieTimeoutMinutes}min) — allowing session to complete"); hasAgents = false; - } } return hasAgents || (bt.Shells is { Length: > 0 }); @@ -2600,6 +2614,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); @@ -2699,6 +2714,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); @@ -2755,6 +2771,7 @@ private void ClearProcessingStateForRecoveryFailure(SessionState state, string s state.Info.IsResumed = false; state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); @@ -2855,6 +2872,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN // Clear stale tool flag so watchdog uses normal timeout if resend is skipped newState.HasUsedToolsThisTurn = false; newState.HasDeferredIdle = false; + newState.SubagentDeferStartedAtTicks = 0L; // redundant (default), but explicit companion pair // Replace in sessions dictionary BEFORE registering event handler // so HandleSessionEvent's isCurrentState check passes for the new state. @@ -2893,6 +2911,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN state.Info.IsResumed = false; state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index b1b300a35..f94ac1c84 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -696,14 +696,14 @@ private class SessionState /// Cleared by SendPromptAsync at the start of each new turn. public volatile bool WasUserAborted; /// - /// UTC timestamp when IDLE-DEFER was first entered for the current turn (i.e., the first - /// SessionIdleEvent with active background tasks). Null until the first deferral. - /// Used by HasActiveBackgroundTasks to detect zombie subagents: if the session has been - /// stuck in IDLE-DEFER longer than SubagentZombieTimeoutMinutes, all reported background - /// agents are treated as zombies and the session is allowed to complete normally. - /// Cleared in CompleteResponse alongside HasDeferredIdle. + /// UTC ticks when IDLE-DEFER was first entered for the current turn (first + /// SessionIdleEvent with active background tasks). 0 = not set. + /// Uses Interlocked for thread safety: the IDLE-DEFER section (SDK event thread) sets via + /// CompareExchange(0 → now); CompleteResponse/SendPromptAsync (UI thread) clear via Exchange(0). + /// Matches the pattern of TurnEndReceivedAtTicks. MUST be cleared in every path that + /// clears HasDeferredIdle — the two fields are an inseparable companion pair. /// - public DateTime? SubagentDeferStartedAt; + public long SubagentDeferStartedAtTicks; } private static void DisposePrematureIdleSignal(SessionState? state) @@ -3289,6 +3289,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Interlocked.Exchange(ref state.ActiveToolCallCount, 0); // Reset stale tool count from previous turn state.HasUsedToolsThisTurn = false; // Reset stale tool flag from previous turn state.HasDeferredIdle = false; // Reset deferred idle flag from previous turn + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); // Companion pair — must clear with HasDeferredIdle state.IsReconnectedSend = false; // Clear reconnect flag — new turn starts fresh (see watchdog reconnect timeout) state.WasUserAborted = false; // Clear abort flag — new turn starts fresh (re-enables EVT-REARM) state.PrematureIdleSignal.Reset(); // Clear premature idle detection from previous turn @@ -3580,6 +3581,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // Mirror primary reconnect: reset tool tracking for new connection siblingState.HasUsedToolsThisTurn = false; siblingState.HasDeferredIdle = false; + Interlocked.Exchange(ref siblingState.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref siblingState.ActiveToolCallCount, 0); Interlocked.Exchange(ref siblingState.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref siblingState.ToolHealthStaleChecks, 0); @@ -3773,6 +3775,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // sessions wait 5x longer than necessary to recover. newState.HasUsedToolsThisTurn = false; newState.HasDeferredIdle = false; + newState.SubagentDeferStartedAtTicks = 0L; // redundant (default), but explicit companion pair Interlocked.Exchange(ref newState.ActiveToolCallCount, 0); Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, 0); newState.IsMultiAgentSession = state.IsMultiAgentSession; @@ -3809,6 +3812,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // 120s watchdog tier instead of the inflated 600s from stale tool state. state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); // Schedule persistence of the new session ID so it survives app restart. // Without this, the debounced save captures the pre-reconnect snapshot @@ -3881,6 +3885,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); state.Info.IsResumed = false; state.Info.IsProcessing = false; @@ -3904,6 +3909,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); state.Info.IsResumed = false; state.Info.IsProcessing = false; @@ -4057,6 +4063,7 @@ public async Task AbortSessionAsync(string sessionName, bool markAsInterrupted = Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); // INV-1: companion pair with HasDeferredIdle state.IsReconnectedSend = false; // INV-1: clear all per-turn flags on abort Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); // Release send lock — allows a subsequent SteerSessionAsync to acquire it immediately @@ -4183,6 +4190,7 @@ await InvokeOnUIAsync(() => Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); state.Info.IsResumed = false; state.IsReconnectedSend = false; // INV-1 item 8: prevent stale 35s timeout on next watchdog start @@ -4471,6 +4479,7 @@ await InvokeOnUIAsync(() => state.Info.IsResumed = false; state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); Interlocked.Exchange(ref state.SendingFlag, 0); state.Info.ProcessingStartedAt = null; From 446e650d583453c0b4b2ad4b569ea9a6356458ad Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Sun, 5 Apr 2026 13:44:11 -0500 Subject: [PATCH 3/3] fix: Add missing SubagentDeferStartedAtTicks clear in ForceCompleteProcessing ForceCompleteProcessing (Organization.cs:2215) had HasDeferredIdle = false without the companion SubagentDeferStartedAtTicks clear. This violated the stated invariant that the two fields are an inseparable companion pair. A stale timestamp leaking across turns would cause immediate false zombie expiry on the next IDLE-DEFER. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Organization.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs index d439388fc..c4ecdffe8 100644 --- a/PolyPilot/Services/CopilotService.Organization.cs +++ b/PolyPilot/Services/CopilotService.Organization.cs @@ -2213,6 +2213,7 @@ private async Task ForceCompleteProcessingAsync(string sessionName, SessionState Interlocked.Exchange(ref state.WatchdogCaseBStaleCount, 0); state.HasUsedToolsThisTurn = false; state.HasDeferredIdle = false; + Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L); state.FallbackCanceledByTurnStart = false; state.Info.IsResumed = false; state.Info.ProcessingStartedAt = null;