Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 234 additions & 0 deletions PolyPilot.Tests/ZombieSubagentExpiryTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
using GitHub.Copilot.SDK;
using PolyPilot.Services;

namespace PolyPilot.Tests;

/// <summary>
/// Tests for the zombie subagent expiry mechanism in HasActiveBackgroundTasks.
///
/// Problem: The Copilot CLI has no per-subagent timeout. A crashed or orphaned subagent
/// never fires SubagentCompleted/SubagentFailed, so IDLE-DEFER blocks session completion
/// indefinitely. PolyPilot tracks when IDLE-DEFER first started (as UTC ticks in
/// SubagentDeferStartedAtTicks) and expires the background agent block after
/// SubagentZombieTimeoutMinutes, allowing the session to complete normally.
/// See: issue #509 (expose CancelBackgroundTaskAsync via SDK).
/// </summary>
public class ZombieSubagentExpiryTests
{
private static long TicksAgo(double minutes) =>
DateTime.UtcNow.AddMinutes(-minutes).Ticks;

private static SessionIdleEvent MakeIdleWithAgents(int agentCount = 1)
{
return new SessionIdleEvent
{
Data = new SessionIdleData
{
BackgroundTasks = new SessionIdleDataBackgroundTasks
{
Agents = Enumerable.Range(0, agentCount)
.Select(i => new SessionIdleDataBackgroundTasksAgentsItem
{
AgentId = $"agent-{i}",
AgentType = "copilot",
Description = ""
}).ToArray(),
Shells = Array.Empty<SessionIdleDataBackgroundTasksShellsItem>()
}
}
};
}

private static SessionIdleEvent MakeIdleWithShells(int shellCount = 1)
{
return new SessionIdleEvent
{
Data = new SessionIdleData
{
BackgroundTasks = new SessionIdleDataBackgroundTasks
{
Agents = Array.Empty<SessionIdleDataBackgroundTasksAgentsItem>(),
Shells = Enumerable.Range(0, shellCount)
.Select(i => new SessionIdleDataBackgroundTasksShellsItem
{
ShellId = $"shell-{i}",
Description = ""
}).ToArray()
}
}
};
}

// --- Zero ticks (not set) — backward-compatible behavior ---

[Fact]
public void ZeroTicks_ActiveAgent_ReturnsTrue()
{
// 0 means "not set" — behaviour is unchanged: any agent means "still running".
var idle = MakeIdleWithAgents();
Assert.True(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAtTicks: 0));
}

[Fact]
public void ZeroTicks_NoTasks_ReturnsFalse()
{
var idle = new SessionIdleEvent { Data = new SessionIdleData() };
Assert.False(CopilotService.HasActiveBackgroundTasks(idle, idleDeferStartedAtTicks: 0));
}

// --- Fresh IDLE-DEFER (started recently — well within timeout) ---

[Fact]
public void FreshDeferStart_ActiveAgent_ReturnsTrue()
{
var idle = MakeIdleWithAgents();
Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(1)));
}

[Fact]
public void DeferStartJustBelowThreshold_ReturnsTrue()
{
var idle = MakeIdleWithAgents();
Assert.True(CopilotService.HasActiveBackgroundTasks(
idle, TicksAgo(CopilotService.SubagentZombieTimeoutMinutes - 1)));
}

// --- Zombie threshold reached ---

[Fact]
public void ZombieThresholdExceeded_SingleAgent_ReturnsFalse()
{
var idle = MakeIdleWithAgents();
Assert.False(CopilotService.HasActiveBackgroundTasks(
idle, TicksAgo(CopilotService.SubagentZombieTimeoutMinutes + 1)));
}

[Fact]
public void ZombieThresholdExceeded_MultipleAgents_ReturnsFalse()
{
// All 8 agents reported — none complete — reproduces the real incident
var idle = MakeIdleWithAgents(agentCount: 8);
Assert.False(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(42)));
}

[Fact]
public void ZombieThresholdExactlyMet_ReturnsFalse()
{
// At exactly the threshold, the session is considered expired.
// TicksAgo(20) produces ticks > 20min ago (test executes in microseconds, not minutes),
// so elapsed will be just over 20min and the threshold check fires correctly.
var idle = MakeIdleWithAgents();
Assert.False(CopilotService.HasActiveBackgroundTasks(
idle, TicksAgo(CopilotService.SubagentZombieTimeoutMinutes)));
}

// --- Shells are never expired ---

[Fact]
public void ZombieThresholdExceeded_WithShells_ReturnsTrue()
{
// Even if all background agents are expired, an active shell keeps IDLE-DEFER alive.
// Shells are managed at the OS level — PolyPilot never force-expires them.
var idle = new SessionIdleEvent
{
Data = new SessionIdleData
{
BackgroundTasks = new SessionIdleDataBackgroundTasks
{
Agents = new[]
{
new SessionIdleDataBackgroundTasksAgentsItem
{
AgentId = "zombie-agent", AgentType = "copilot", Description = ""
}
},
Shells = new[]
{
new SessionIdleDataBackgroundTasksShellsItem
{
ShellId = "shell-1", Description = "npm test"
}
}
}
}
};
Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(30)));
}

[Fact]
public void ZombieThresholdExceeded_ShellsOnly_ReturnsTrue()
{
// Shells alone always block completion — they are never zombie-expired.
var idle = MakeIdleWithShells();
Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(60)));
}

[Fact]
public void FreshDeferStart_ShellsOnly_ReturnsTrue()
{
var idle = MakeIdleWithShells();
Assert.True(CopilotService.HasActiveBackgroundTasks(idle, TicksAgo(1)));
}

// --- Cross-turn stale timestamp: the critical lifecycle bug this PR fixes ---

[Fact]
public void StaleDeferTimestamp_FromPriorTurn_NewTurnShouldNotExpireAgents()
{
// Scenario: SubagentDeferStartedAtTicks was NOT cleared (e.g. watchdog/abort path)
// after Turn N which had an IDLE-DEFER 25 minutes ago. Turn N+1 starts and its first
// IDLE-DEFER fires. The ??= logic would preserve the stale timestamp.
//
// This test documents WHY SubagentDeferStartedAtTicks MUST be reset in all paths that
// clear HasDeferredIdle, not just CompleteResponse. If a caller passes a 25-min-old
// timestamp for what is actually a brand-new IDLE-DEFER, zombie expiry fires immediately
// and kills live subagents.
//
// Fix: Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L) alongside
// every HasDeferredIdle = false in SendPromptAsync, AbortSessionAsync, error paths, etc.
//
// After the fix, SubagentDeferStartedAtTicks is reset at turn boundaries, so the
// ??= CompareExchange sets a fresh timestamp for the new turn, and zombie expiry
// is based on the new turn's actual elapsed time.
var idle = MakeIdleWithAgents();

// Simulate: stale ticks from 25 minutes ago NOT cleared at turn boundary
long staleTicks = TicksAgo(25);

// Without the fix, this would return false (zombie expiry fires on fresh agents)
// The test ASSERTS false to document what the broken behavior looks like,
// and to verify that HasActiveBackgroundTasks correctly respects the ticks value.
// The real invariant is: callers MUST pass 0 (not stale ticks) for new turns.
Assert.False(CopilotService.HasActiveBackgroundTasks(idle, staleTicks),
"HasActiveBackgroundTasks correctly expires based on elapsed ticks — " +
"the caller is responsible for resetting SubagentDeferStartedAtTicks at turn boundaries.");

// The safe path: passing fresh ticks (new turn, new timestamp) should NOT expire agents
long freshTicks = TicksAgo(1);
Assert.True(CopilotService.HasActiveBackgroundTasks(idle, freshTicks),
"With fresh ticks (new turn), agents should NOT be expired — confirms the fix works.");
}

// --- Backward compatibility with existing BackgroundTasksIdleTests ---

[Fact]
public void BackwardCompat_NullBackgroundTasks_ReturnsFalse()
{
var idle = new SessionIdleEvent { Data = new SessionIdleData { BackgroundTasks = null } };
Assert.False(CopilotService.HasActiveBackgroundTasks(idle));
}

[Fact]
public void BackwardCompat_WithAgents_ReturnsTrue()
{
var idle = MakeIdleWithAgents();
Assert.True(CopilotService.HasActiveBackgroundTasks(idle));
}

[Fact]
public void BackwardCompat_WithShells_ReturnsTrue()
{
var idle = MakeIdleWithShells();
Assert.True(CopilotService.HasActiveBackgroundTasks(idle));
}
}
62 changes: 59 additions & 3 deletions PolyPilot/Services/CopilotService.Events.cs
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,27 @@ void Invoke(Action action)
// KEY FIX: Check if the server reports active background tasks (sub-agents, shells).
// session.idle with background tasks means "foreground quiesced, background still running."
// Do NOT treat this as terminal — flush text and wait for the real idle.
if (HasActiveBackgroundTasks(idle))
// Record when we first entered IDLE-DEFER for this turn (used for zombie expiry).
// CompareExchange(0 → now): sets only on the first IDLE-DEFER; subsequent ones
// for the same turn preserve the original timestamp so elapsed time is cumulative.
Interlocked.CompareExchange(
ref state.SubagentDeferStartedAtTicks,
DateTime.UtcNow.Ticks,
0L);

var deferTicks = Interlocked.Read(ref state.SubagentDeferStartedAtTicks);
var hasActiveTasks = HasActiveBackgroundTasks(idle, deferTicks);

// Log zombie expiry here where Debug() is available (HasActiveBackgroundTasks is static)
if (!hasActiveTasks && deferTicks != 0 && (idle.Data?.BackgroundTasks?.Agents?.Length ?? 0) > 0)
{
var expiredMinutes = TimeSpan.FromTicks(DateTime.UtcNow.Ticks - deferTicks).TotalMinutes;
Debug($"[IDLE-DEFER-ZOMBIE] '{sessionName}' {idle.Data!.BackgroundTasks!.Agents!.Length} " +
$"background agent(s) expired after {expiredMinutes:F0}min " +
$"(threshold={SubagentZombieTimeoutMinutes}min) — allowing session to complete");
}

if (hasActiveTasks)
{
state.HasDeferredIdle = true; // Track for watchdog freshness window
Debug($"[IDLE-DEFER] '{sessionName}' session.idle received with active background tasks — " +
Expand Down Expand Up @@ -885,6 +905,7 @@ await notifService.SendNotificationAsync(
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
state.HasUsedToolsThisTurn = false;
state.HasDeferredIdle = false;
Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L);
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
Interlocked.Exchange(ref state.EventCountThisTurn, 0);
Expand Down Expand Up @@ -1240,6 +1261,7 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
state.HasUsedToolsThisTurn = false;
state.HasDeferredIdle = false;
Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L);
state.IsReconnectedSend = false; // Clear reconnect flag on turn completion (defense-in-depth)
state.FallbackCanceledByTurnStart = false;
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Expand Down Expand Up @@ -1989,6 +2011,7 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
state.HasUsedToolsThisTurn = false;
state.HasDeferredIdle = false;
Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L);
state.FallbackCanceledByTurnStart = false;
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
Expand Down Expand Up @@ -2128,16 +2151,44 @@ internal static bool IsMcpError(string? text)
return error.ToString();
}

/// <summary>
/// How long a session can be stuck in IDLE-DEFER (background agents reported but never completing)
/// before all background agents are treated as zombies and the session is allowed to complete.
/// The Copilot CLI has no per-agent timeout, so a crashed or orphaned subagent blocks
/// IDLE-DEFER indefinitely. After this threshold PolyPilot expires the stale block.
/// Shells are never expired — they are managed at the OS level.
/// </summary>
internal const int SubagentZombieTimeoutMinutes = 20;

/// <summary>
/// Check if a SessionIdleEvent reports active background tasks (agents or shells).
/// When background tasks are active, session.idle means "foreground quiesced, background
/// still running" — NOT true completion.
///
/// When <paramref name="idleDeferStartedAtTicks"/> is non-zero, background agents are treated
/// as zombies if the session has been in IDLE-DEFER longer than
/// <see cref="SubagentZombieTimeoutMinutes"/>. This allows the session to complete even if
/// the CLI never fires SubagentCompleted for a crashed or orphaned subagent.
/// The caller is responsible for logging the zombie expiry via <c>Debug()</c>.
/// Shells are never expired — their lifecycle is managed by the OS.
/// </summary>
internal static bool HasActiveBackgroundTasks(SessionIdleEvent idle)
internal static bool HasActiveBackgroundTasks(
SessionIdleEvent idle,
long idleDeferStartedAtTicks = 0)
{
var bt = idle.Data?.BackgroundTasks;
if (bt == null) return false;
return (bt.Agents is { Length: > 0 }) || (bt.Shells is { Length: > 0 });

bool hasAgents = bt.Agents is { Length: > 0 };

if (hasAgents && idleDeferStartedAtTicks != 0)
{
var elapsed = TimeSpan.FromTicks(DateTime.UtcNow.Ticks - idleDeferStartedAtTicks);
if (elapsed.TotalMinutes >= SubagentZombieTimeoutMinutes)
hasAgents = false;
}

return hasAgents || (bt.Shells is { Length: > 0 });
}

private void StartProcessingWatchdog(SessionState state, string sessionName)
Expand Down Expand Up @@ -2563,6 +2614,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
state.HasUsedToolsThisTurn = false;
state.HasDeferredIdle = false;
Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L);
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
Interlocked.Exchange(ref state.EventCountThisTurn, 0);
Expand Down Expand Up @@ -2662,6 +2714,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
state.HasUsedToolsThisTurn = false;
state.HasDeferredIdle = false;
Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L);
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
Interlocked.Exchange(ref state.EventCountThisTurn, 0);
Expand Down Expand Up @@ -2718,6 +2771,7 @@ private void ClearProcessingStateForRecoveryFailure(SessionState state, string s
state.Info.IsResumed = false;
state.HasUsedToolsThisTurn = false;
state.HasDeferredIdle = false;
Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L);
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
Expand Down Expand Up @@ -2818,6 +2872,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN
// Clear stale tool flag so watchdog uses normal timeout if resend is skipped
newState.HasUsedToolsThisTurn = false;
newState.HasDeferredIdle = false;
newState.SubagentDeferStartedAtTicks = 0L; // redundant (default), but explicit companion pair

// Replace in sessions dictionary BEFORE registering event handler
// so HandleSessionEvent's isCurrentState check passes for the new state.
Expand Down Expand Up @@ -2856,6 +2911,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN
state.Info.IsResumed = false;
state.HasUsedToolsThisTurn = false;
state.HasDeferredIdle = false;
Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L);
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
Expand Down
1 change: 1 addition & 0 deletions PolyPilot/Services/CopilotService.Organization.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2213,6 +2213,7 @@ private async Task ForceCompleteProcessingAsync(string sessionName, SessionState
Interlocked.Exchange(ref state.WatchdogCaseBStaleCount, 0);
state.HasUsedToolsThisTurn = false;
state.HasDeferredIdle = false;
Interlocked.Exchange(ref state.SubagentDeferStartedAtTicks, 0L);
state.FallbackCanceledByTurnStart = false;
state.Info.IsResumed = false;
state.Info.ProcessingStartedAt = null;
Expand Down
Loading