Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion PolyPilot.Tests/ChatExperienceSafetyTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@ public void CompleteResponse_Source_ClearsSendingFlag()
Assert.True(crIdx > 0);

// Within CompleteResponse, SendingFlag must be cleared (may be 80+ lines into method)
var afterCR = source.Substring(crIdx, Math.Min(5000, source.Length - crIdx));
var afterCR = source.Substring(crIdx, Math.Min(6000, source.Length - crIdx));
Assert.Contains("SendingFlag", afterCR);
}

Expand Down
1 change: 1 addition & 0 deletions PolyPilot.Tests/TestSetup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ internal static void Initialize()
TestBaseDir = Path.Combine(Path.GetTempPath(), "polypilot-tests-" + Environment.ProcessId);
Directory.CreateDirectory(TestBaseDir);
CopilotService.SetBaseDirForTesting(TestBaseDir);
CopilotService.SetCaptureDirForTesting(Path.Combine(TestBaseDir, "zero-idle-captures"));
RepoManager.SetBaseDirForTesting(TestBaseDir);
PromptLibraryService.SetUserPromptsDirForTesting(Path.Combine(TestBaseDir, "prompts"));
}
Expand Down
116 changes: 116 additions & 0 deletions PolyPilot.Tests/ZeroIdleCaptureTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
using System.Text.Json;
using PolyPilot.Services;

namespace PolyPilot.Tests;

/// <summary>
/// Tests for the zero-idle capture diagnostics feature (#359).
/// Validates capture file format, field population, and retention purge.
/// </summary>
[Collection("ZeroIdleCapture")]
public class ZeroIdleCaptureTests : IDisposable
{
private readonly string _testDir;

public ZeroIdleCaptureTests()
{
_testDir = Path.Combine(Path.GetTempPath(), $"polypilot-zic-test-{Guid.NewGuid():N}");
Directory.CreateDirectory(_testDir);
CopilotService.SetCaptureDirForTesting(_testDir);
}

public void Dispose()
{
CopilotService.ResetCaptureDir();
try { Directory.Delete(_testDir, recursive: true); } catch { }
}

// ── Purge logic ─────────────────────────────────────────────────────────

[Fact]
public void PurgeOldCaptures_KeepsOnlyMostRecent()
{
// Create 5 capture files with sortable names
for (int i = 0; i < 5; i++)
{
var name = $"capture_2026-03-{10 + i:D2}T12-00-00_sess{i:D4}.json";
File.WriteAllText(Path.Combine(_testDir, name), "{}");
}

// Purge keeping only 2
CopilotService.PurgeOldCaptures(keepCount: 2);

var remaining = Directory.GetFiles(_testDir, "capture_*.json");
Assert.Equal(2, remaining.Length);
// Newest files should survive (sorted descending, skip 2 = delete oldest 3)
Assert.Contains(remaining, f => f.Contains("capture_2026-03-14"));
Assert.Contains(remaining, f => f.Contains("capture_2026-03-13"));
}

[Fact]
public void PurgeOldCaptures_NoOpWhenFewFiles()
{
File.WriteAllText(Path.Combine(_testDir, "capture_2026-03-11T12-00-00_abcd1234.json"), "{}");

CopilotService.PurgeOldCaptures(keepCount: 100);

Assert.Single(Directory.GetFiles(_testDir, "capture_*.json"));
}

[Fact]
public void PurgeOldCaptures_NoOpWhenDirMissing()
{
CopilotService.SetCaptureDirForTesting("/nonexistent/path/zic");

// Should not throw
CopilotService.PurgeOldCaptures();
}

// ── EnableVerboseEventTracing setting ────────────────────────────────────

[Fact]
public void EnableVerboseEventTracing_DefaultsToFalse()
{
var settings = new PolyPilot.Models.ConnectionSettings();
Assert.False(settings.EnableVerboseEventTracing);
}

[Fact]
public void EnableVerboseEventTracing_RoundTripsViaJson()
{
var settings = new PolyPilot.Models.ConnectionSettings
{
EnableVerboseEventTracing = true
};
var json = JsonSerializer.Serialize(settings);
var deserialized = JsonSerializer.Deserialize<PolyPilot.Models.ConnectionSettings>(json);
Assert.NotNull(deserialized);
Assert.True(deserialized!.EnableVerboseEventTracing);
}

// ── SessionState field existence ────────────────────────────────────────

[Fact]
public void AgentSessionInfo_HasEventCountThisTurn_FieldExists()
{
// EventCountThisTurn and TurnEndReceivedAtTicks are on the private SessionState class.
// We verify CopilotService has the capture/purge methods exposed.
// PurgeOldCaptures is internal static — if it compiles with our test, the fields exist.
Assert.NotNull(typeof(CopilotService).GetMethod("PurgeOldCaptures",
System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.NonPublic));
}

[Fact]
public void CopilotService_HasSetCaptureDirForTesting()
{
Assert.NotNull(typeof(CopilotService).GetMethod("SetCaptureDirForTesting",
System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.NonPublic));
}

[Fact]
public void CopilotService_HasResetCaptureDir()
{
Assert.NotNull(typeof(CopilotService).GetMethod("ResetCaptureDir",
System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.NonPublic));
}
}
3 changes: 3 additions & 0 deletions PolyPilot/MauiProgram.cs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ public static MauiApp CreateMauiApp()
PluginLoader.LoadEnabledProviders(builder.Services, pluginSettings.Plugins.Enabled);
#endif

// Startup cleanup: purge old zero-idle captures (keep last 100)
try { CopilotService.PurgeOldCaptures(); } catch { }

return builder.Build();
}

Expand Down
5 changes: 5 additions & 0 deletions PolyPilot/Models/ConnectionSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ public class ConnectionSettings
public bool EnableSessionNotifications { get; set; } = false;
public bool MuteWorkerNotifications { get; set; } = false;
public bool CodespacesEnabled { get; set; } = false;
/// <summary>
/// When true, logs every SDK event type to event-diagnostics.log (not just lifecycle events).
/// Useful for investigating zero-idle sessions (#299) — reveals the exact last event before silence.
/// </summary>
public bool EnableVerboseEventTracing { get; set; } = false;

/// <summary>
/// Normalizes a remote URL by ensuring it has an http(s):// scheme.
Expand Down
151 changes: 151 additions & 0 deletions PolyPilot/Services/CopilotService.Events.cs
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,9 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt)
Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
state.Info.LastUpdatedAt = DateTime.Now;
}
// Count every event for zero-idle diagnostics (#299)
Interlocked.Increment(ref state.EventCountThisTurn);

var sessionName = state.Info.Name;
var isCurrentState = _sessions.TryGetValue(sessionName, out var current) && ReferenceEquals(current, state);

Expand All @@ -237,6 +240,13 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt)
$"(IsProcessing={state.Info.IsProcessing}, isCurrentState={isCurrentState}, " +
$"thread={Environment.CurrentManagedThreadId})");
}
// Verbose event tracing: log ALL event types when enabled (for zero-idle investigation #299).
// This reveals the exact last event before silence — was it ToolExecutionComplete? AssistantMessage?
else if (_currentSettings?.EnableVerboseEventTracing == true)
{
Debug($"[EVT-TRACE] '{sessionName}' {evt.GetType().Name} " +
$"(eventCount={state.EventCountThisTurn}, thread={Environment.CurrentManagedThreadId})");
}

// Warn if receiving events on an orphaned (replaced) state object.
// After the generation-carry fix, stale callbacks on orphaned state would have
Expand Down Expand Up @@ -463,6 +473,7 @@ void Invoke(Action action)
break;

case AssistantTurnEndEvent:
Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, DateTime.UtcNow.Ticks);
try { CompleteReasoningMessages(state, sessionName); }
catch (Exception ex)
{
Expand Down Expand Up @@ -517,10 +528,12 @@ void Invoke(Action action)
return;
}
Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs + TurnEndIdleToolFallbackAdditionalMs}ms after TurnEnd (tools used) — firing CompleteResponse");
CaptureZeroIdleDiagnostics(state, sessionName, toolsUsed: true);
InvokeOnUI(() => CompleteResponse(state, turnEndGen));
return;
}
Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs}ms after TurnEnd — firing CompleteResponse");
CaptureZeroIdleDiagnostics(state, sessionName, toolsUsed: false);
InvokeOnUI(() => CompleteResponse(state, turnEndGen));
}
catch (OperationCanceledException) { /* expected on cancellation */ }
Expand Down Expand Up @@ -882,6 +895,8 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
state.FallbackCanceledByTurnStart = false;
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
Interlocked.Exchange(ref state.EventCountThisTurn, 0);
Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
state.Info.IsResumed = false; // Clear after first successful turn
var response = state.CurrentResponse.ToString();
if (!string.IsNullOrWhiteSpace(response))
Expand Down Expand Up @@ -2156,4 +2171,140 @@ public async Task RecoverSessionAsync(string sessionName)
await TryRecoverPermissionAsync(state, sessionName);
}
}

// ── Zero-idle capture diagnostics (#299) ────────────────────────────────

private static string? _zeroIdleCaptureDir;
private static string ZeroIdleCaptureDir
{
get
{
lock (_pathLock)
return _zeroIdleCaptureDir ??= Path.Combine(PolyPilotBaseDir, "zero-idle-captures");
}
}

// For testing: override the capture directory
internal static void SetCaptureDirForTesting(string dir) => _zeroIdleCaptureDir = dir;
internal static void ResetCaptureDir() => _zeroIdleCaptureDir = null;

/// <summary>
/// Writes a diagnostic capture file when the TurnEnd→Idle fallback fires,
/// meaning SessionIdleEvent was not received (SDK bug #299).
/// Includes session state snapshot, event counts, and last events from events.jsonl.
/// Never throws — capture failures are swallowed to Console.Error.
/// </summary>
private void CaptureZeroIdleDiagnostics(SessionState state, string sessionName, bool toolsUsed)
{
try
{
var captureDir = ZeroIdleCaptureDir;
Directory.CreateDirectory(captureDir);

var sessionId = state.Info.SessionId ?? "unknown";
var now = DateTime.UtcNow;
var turnEndTicks = Interlocked.Read(ref state.TurnEndReceivedAtTicks);
var turnEndAge = turnEndTicks > 0
? (now - new DateTime(turnEndTicks, DateTimeKind.Utc)).TotalSeconds
: -1;
var lastEventAge = (now - new DateTime(Interlocked.Read(ref state.LastEventAtTicks), DateTimeKind.Utc)).TotalSeconds;

// Read last 50 events from events.jsonl
var eventsFile = Path.Combine(SessionStatePath, sessionId, "events.jsonl");
var recentEvents = new List<object>();
try
{
if (File.Exists(eventsFile))
{
var allEvents = ParseEventLogFile(eventsFile);
var tail = allEvents.Count > 50 ? allEvents.GetRange(allEvents.Count - 50, 50) : allEvents;
foreach (var (ts, type, detail) in tail)
recentEvents.Add(new { timestamp = ts, type, detail });
}
}
catch { /* best-effort */ }

// Count concurrent sessions
var totalSessions = _sessions.Count;
var processingSessions = _sessions.Values.Count(s => s.Info.IsProcessing);

var capture = new Dictionary<string, object?>
{
["capture_timestamp"] = now.ToString("O"),
["trigger"] = "IDLE_FALLBACK",
["session"] = new Dictionary<string, object?>
{
["session_id"] = sessionId,
["session_name"] = sessionName,
["model"] = state.Info.Model,
["history_size"] = state.Info.MessageCount,
["is_multi_agent"] = state.IsMultiAgentSession,
},
["processing_state"] = new Dictionary<string, object?>
{
["is_processing"] = state.Info.IsProcessing,
["processing_phase"] = state.Info.ProcessingPhase,
["active_tool_call_count"] = Volatile.Read(ref state.ActiveToolCallCount),
["has_used_tools_this_turn"] = state.HasUsedToolsThisTurn,
["successful_tool_count"] = state.SuccessfulToolCountThisTurn,
["event_count_this_turn"] = Volatile.Read(ref state.EventCountThisTurn),
["processing_generation"] = Interlocked.Read(ref state.ProcessingGeneration),
["has_received_deltas"] = state.HasReceivedDeltasThisTurn,
},
["timing"] = new Dictionary<string, object?>
{
["turn_end_age_seconds"] = Math.Round(turnEndAge, 2),
["last_event_age_seconds"] = Math.Round(lastEventAge, 2),
["fallback_tools_used"] = toolsUsed,
["fallback_wait_ms"] = toolsUsed
? TurnEndIdleFallbackMs + TurnEndIdleToolFallbackAdditionalMs
: TurnEndIdleFallbackMs,
},
["concurrency"] = new Dictionary<string, object?>
{
["total_sessions"] = totalSessions,
["processing_sessions"] = processingSessions,
},
["events_jsonl_tail"] = recentEvents,
};

var json = System.Text.Json.JsonSerializer.Serialize(capture,
new System.Text.Json.JsonSerializerOptions { WriteIndented = true });
var fileName = $"capture_{now:yyyy-MM-ddTHH-mm-ss}_{sessionId[..Math.Min(8, sessionId.Length)]}.json";
File.WriteAllText(Path.Combine(captureDir, fileName), json);

Debug($"[ZERO-IDLE] '{sessionName}' capture written: {fileName} " +
$"(events={state.EventCountThisTurn}, historySize={state.Info.History.Count}, " +
$"turnEndAge={turnEndAge:F1}s, tools={toolsUsed})");
}
catch (Exception ex)
{
Console.Error.WriteLine($"[ZeroIdleCapture] Failed to write capture: {ex.Message}");
}
}

/// <summary>
/// Deletes old zero-idle capture files, keeping the most recent 100.
/// Called once on startup. Never throws.
/// </summary>
internal static void PurgeOldCaptures(int keepCount = 100)
{
try
{
var dir = ZeroIdleCaptureDir;
if (!Directory.Exists(dir)) return;
var files = Directory.GetFiles(dir, "capture_*.json")
.OrderByDescending(f => f)
.Skip(keepCount)
.ToList();
foreach (var file in files)
{
try { File.Delete(file); } catch { }
}
}
catch (Exception ex)
{
Console.Error.WriteLine($"[ZeroIdleCapture] Failed to purge old captures: {ex.Message}");
}
}
}
12 changes: 11 additions & 1 deletion PolyPilot/Services/CopilotService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ internal static void SetBaseDirForTesting(string path)
_copilotBaseDir = null;
_sessionStatePath = null;
_pendingOrchestrationFile = null;
_zeroIdleCaptureDir = null;
}
}

Expand Down Expand Up @@ -457,6 +458,13 @@ private class SessionState
/// <summary>Timestamp when the most recent tool started. Used by the tool health check to
/// determine if a tool has been running too long without any events.</summary>
public long ToolStartedAtTicks;
/// <summary>Count of SDK events received during the current processing turn.
/// Incremented in HandleSessionEvent, reset in SendPromptAsync and CompleteResponse.
/// Used by zero-idle capture to quantify how much activity occurred before silence.</summary>
public int EventCountThisTurn;
/// <summary>Timestamp (UTC ticks) when AssistantTurnEndEvent was received.
/// Used by zero-idle capture to measure fallback wait duration.</summary>
public long TurnEndReceivedAtTicks;
}

private void Debug(string message)
Expand All @@ -471,7 +479,7 @@ private void Debug(string message)
message.StartsWith("[COMPLETE") || message.StartsWith("[SEND") ||
message.StartsWith("[RECONNECT") || message.StartsWith("[UI-ERR") ||
message.StartsWith("[DISPATCH") || message.StartsWith("[WATCHDOG") ||
message.StartsWith("[HEALTH") ||
message.StartsWith("[HEALTH") || message.StartsWith("[ZERO-IDLE") ||
message.Contains("watchdog"))
{
try
Expand Down Expand Up @@ -2428,6 +2436,8 @@ public async Task<string> SendPromptAsync(string sessionName, string prompt, Lis
state.HasUsedToolsThisTurn = false; // Reset stale tool flag from previous turn
state.FallbackCanceledByTurnStart = false;
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.EventCountThisTurn, 0); // Reset event counter for zero-idle capture
Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
// Cancel any pending TurnEnd→Idle fallback from the previous turn
CancelTurnEndFallback(state);
state.IsMultiAgentSession = IsSessionInMultiAgentGroup(sessionName); // Cache for watchdog (UI thread safe)
Expand Down