diff --git a/PolyPilot.Tests/ChatExperienceSafetyTests.cs b/PolyPilot.Tests/ChatExperienceSafetyTests.cs index 1bb045ab2c..6bbcfd4840 100644 --- a/PolyPilot.Tests/ChatExperienceSafetyTests.cs +++ b/PolyPilot.Tests/ChatExperienceSafetyTests.cs @@ -842,7 +842,7 @@ public void CompleteResponse_Source_ClearsSendingFlag() Assert.True(crIdx > 0); // Within CompleteResponse, SendingFlag must be cleared (may be 80+ lines into method) - var afterCR = source.Substring(crIdx, Math.Min(5000, source.Length - crIdx)); + var afterCR = source.Substring(crIdx, Math.Min(6000, source.Length - crIdx)); Assert.Contains("SendingFlag", afterCR); } diff --git a/PolyPilot.Tests/TestSetup.cs b/PolyPilot.Tests/TestSetup.cs index 10574ead53..38ed8a4985 100644 --- a/PolyPilot.Tests/TestSetup.cs +++ b/PolyPilot.Tests/TestSetup.cs @@ -25,6 +25,7 @@ internal static void Initialize() TestBaseDir = Path.Combine(Path.GetTempPath(), "polypilot-tests-" + Environment.ProcessId); Directory.CreateDirectory(TestBaseDir); CopilotService.SetBaseDirForTesting(TestBaseDir); + CopilotService.SetCaptureDirForTesting(Path.Combine(TestBaseDir, "zero-idle-captures")); RepoManager.SetBaseDirForTesting(TestBaseDir); PromptLibraryService.SetUserPromptsDirForTesting(Path.Combine(TestBaseDir, "prompts")); } diff --git a/PolyPilot.Tests/ZeroIdleCaptureTests.cs b/PolyPilot.Tests/ZeroIdleCaptureTests.cs new file mode 100644 index 0000000000..6ae8321940 --- /dev/null +++ b/PolyPilot.Tests/ZeroIdleCaptureTests.cs @@ -0,0 +1,116 @@ +using System.Text.Json; +using PolyPilot.Services; + +namespace PolyPilot.Tests; + +/// +/// Tests for the zero-idle capture diagnostics feature (#359). +/// Validates capture file format, field population, and retention purge. +/// +[Collection("ZeroIdleCapture")] +public class ZeroIdleCaptureTests : IDisposable +{ + private readonly string _testDir; + + public ZeroIdleCaptureTests() + { + _testDir = Path.Combine(Path.GetTempPath(), $"polypilot-zic-test-{Guid.NewGuid():N}"); + Directory.CreateDirectory(_testDir); + CopilotService.SetCaptureDirForTesting(_testDir); + } + + public void Dispose() + { + CopilotService.ResetCaptureDir(); + try { Directory.Delete(_testDir, recursive: true); } catch { } + } + + // ── Purge logic ───────────────────────────────────────────────────────── + + [Fact] + public void PurgeOldCaptures_KeepsOnlyMostRecent() + { + // Create 5 capture files with sortable names + for (int i = 0; i < 5; i++) + { + var name = $"capture_2026-03-{10 + i:D2}T12-00-00_sess{i:D4}.json"; + File.WriteAllText(Path.Combine(_testDir, name), "{}"); + } + + // Purge keeping only 2 + CopilotService.PurgeOldCaptures(keepCount: 2); + + var remaining = Directory.GetFiles(_testDir, "capture_*.json"); + Assert.Equal(2, remaining.Length); + // Newest files should survive (sorted descending, skip 2 = delete oldest 3) + Assert.Contains(remaining, f => f.Contains("capture_2026-03-14")); + Assert.Contains(remaining, f => f.Contains("capture_2026-03-13")); + } + + [Fact] + public void PurgeOldCaptures_NoOpWhenFewFiles() + { + File.WriteAllText(Path.Combine(_testDir, "capture_2026-03-11T12-00-00_abcd1234.json"), "{}"); + + CopilotService.PurgeOldCaptures(keepCount: 100); + + Assert.Single(Directory.GetFiles(_testDir, "capture_*.json")); + } + + [Fact] + public void PurgeOldCaptures_NoOpWhenDirMissing() + { + CopilotService.SetCaptureDirForTesting("/nonexistent/path/zic"); + + // Should not throw + CopilotService.PurgeOldCaptures(); + } + + // ── EnableVerboseEventTracing setting ──────────────────────────────────── + + [Fact] + public void EnableVerboseEventTracing_DefaultsToFalse() + { + var settings = new PolyPilot.Models.ConnectionSettings(); + Assert.False(settings.EnableVerboseEventTracing); + } + + [Fact] + public void EnableVerboseEventTracing_RoundTripsViaJson() + { + var settings = new PolyPilot.Models.ConnectionSettings + { + EnableVerboseEventTracing = true + }; + var json = JsonSerializer.Serialize(settings); + var deserialized = JsonSerializer.Deserialize(json); + Assert.NotNull(deserialized); + Assert.True(deserialized!.EnableVerboseEventTracing); + } + + // ── SessionState field existence ──────────────────────────────────────── + + [Fact] + public void AgentSessionInfo_HasEventCountThisTurn_FieldExists() + { + // EventCountThisTurn and TurnEndReceivedAtTicks are on the private SessionState class. + // We verify CopilotService has the capture/purge methods exposed. + // PurgeOldCaptures is internal static — if it compiles with our test, the fields exist. + Assert.NotNull(typeof(CopilotService).GetMethod("PurgeOldCaptures", + System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.NonPublic)); + } + + [Fact] + public void CopilotService_HasSetCaptureDirForTesting() + { + Assert.NotNull(typeof(CopilotService).GetMethod("SetCaptureDirForTesting", + System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.NonPublic)); + } + + [Fact] + public void CopilotService_HasResetCaptureDir() + { + Assert.NotNull(typeof(CopilotService).GetMethod("ResetCaptureDir", + System.Reflection.BindingFlags.Static | System.Reflection.BindingFlags.NonPublic)); + } +} diff --git a/PolyPilot/MauiProgram.cs b/PolyPilot/MauiProgram.cs index 846a5b4c0b..295d28a884 100644 --- a/PolyPilot/MauiProgram.cs +++ b/PolyPilot/MauiProgram.cs @@ -128,6 +128,9 @@ public static MauiApp CreateMauiApp() PluginLoader.LoadEnabledProviders(builder.Services, pluginSettings.Plugins.Enabled); #endif + // Startup cleanup: purge old zero-idle captures (keep last 100) + try { CopilotService.PurgeOldCaptures(); } catch { } + return builder.Build(); } diff --git a/PolyPilot/Models/ConnectionSettings.cs b/PolyPilot/Models/ConnectionSettings.cs index e9a5e995f2..ebc55c13f1 100644 --- a/PolyPilot/Models/ConnectionSettings.cs +++ b/PolyPilot/Models/ConnectionSettings.cs @@ -89,6 +89,11 @@ public class ConnectionSettings public bool EnableSessionNotifications { get; set; } = false; public bool MuteWorkerNotifications { get; set; } = false; public bool CodespacesEnabled { get; set; } = false; + /// + /// When true, logs every SDK event type to event-diagnostics.log (not just lifecycle events). + /// Useful for investigating zero-idle sessions (#299) — reveals the exact last event before silence. + /// + public bool EnableVerboseEventTracing { get; set; } = false; /// /// Normalizes a remote URL by ensuring it has an http(s):// scheme. diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index bde7ecaf91..c41b85236f 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -225,6 +225,9 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt) Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); state.Info.LastUpdatedAt = DateTime.Now; } + // Count every event for zero-idle diagnostics (#299) + Interlocked.Increment(ref state.EventCountThisTurn); + var sessionName = state.Info.Name; var isCurrentState = _sessions.TryGetValue(sessionName, out var current) && ReferenceEquals(current, state); @@ -237,6 +240,13 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt) $"(IsProcessing={state.Info.IsProcessing}, isCurrentState={isCurrentState}, " + $"thread={Environment.CurrentManagedThreadId})"); } + // Verbose event tracing: log ALL event types when enabled (for zero-idle investigation #299). + // This reveals the exact last event before silence — was it ToolExecutionComplete? AssistantMessage? + else if (_currentSettings?.EnableVerboseEventTracing == true) + { + Debug($"[EVT-TRACE] '{sessionName}' {evt.GetType().Name} " + + $"(eventCount={state.EventCountThisTurn}, thread={Environment.CurrentManagedThreadId})"); + } // Warn if receiving events on an orphaned (replaced) state object. // After the generation-carry fix, stale callbacks on orphaned state would have @@ -463,6 +473,7 @@ void Invoke(Action action) break; case AssistantTurnEndEvent: + Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, DateTime.UtcNow.Ticks); try { CompleteReasoningMessages(state, sessionName); } catch (Exception ex) { @@ -517,10 +528,12 @@ void Invoke(Action action) return; } Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs + TurnEndIdleToolFallbackAdditionalMs}ms after TurnEnd (tools used) — firing CompleteResponse"); + CaptureZeroIdleDiagnostics(state, sessionName, toolsUsed: true); InvokeOnUI(() => CompleteResponse(state, turnEndGen)); return; } Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs}ms after TurnEnd — firing CompleteResponse"); + CaptureZeroIdleDiagnostics(state, sessionName, toolsUsed: false); InvokeOnUI(() => CompleteResponse(state, turnEndGen)); } catch (OperationCanceledException) { /* expected on cancellation */ } @@ -882,6 +895,8 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); + Interlocked.Exchange(ref state.EventCountThisTurn, 0); + Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0); state.Info.IsResumed = false; // Clear after first successful turn var response = state.CurrentResponse.ToString(); if (!string.IsNullOrWhiteSpace(response)) @@ -2156,4 +2171,140 @@ public async Task RecoverSessionAsync(string sessionName) await TryRecoverPermissionAsync(state, sessionName); } } + + // ── Zero-idle capture diagnostics (#299) ──────────────────────────────── + + private static string? _zeroIdleCaptureDir; + private static string ZeroIdleCaptureDir + { + get + { + lock (_pathLock) + return _zeroIdleCaptureDir ??= Path.Combine(PolyPilotBaseDir, "zero-idle-captures"); + } + } + + // For testing: override the capture directory + internal static void SetCaptureDirForTesting(string dir) => _zeroIdleCaptureDir = dir; + internal static void ResetCaptureDir() => _zeroIdleCaptureDir = null; + + /// + /// Writes a diagnostic capture file when the TurnEnd→Idle fallback fires, + /// meaning SessionIdleEvent was not received (SDK bug #299). + /// Includes session state snapshot, event counts, and last events from events.jsonl. + /// Never throws — capture failures are swallowed to Console.Error. + /// + private void CaptureZeroIdleDiagnostics(SessionState state, string sessionName, bool toolsUsed) + { + try + { + var captureDir = ZeroIdleCaptureDir; + Directory.CreateDirectory(captureDir); + + var sessionId = state.Info.SessionId ?? "unknown"; + var now = DateTime.UtcNow; + var turnEndTicks = Interlocked.Read(ref state.TurnEndReceivedAtTicks); + var turnEndAge = turnEndTicks > 0 + ? (now - new DateTime(turnEndTicks, DateTimeKind.Utc)).TotalSeconds + : -1; + var lastEventAge = (now - new DateTime(Interlocked.Read(ref state.LastEventAtTicks), DateTimeKind.Utc)).TotalSeconds; + + // Read last 50 events from events.jsonl + var eventsFile = Path.Combine(SessionStatePath, sessionId, "events.jsonl"); + var recentEvents = new List(); + try + { + if (File.Exists(eventsFile)) + { + var allEvents = ParseEventLogFile(eventsFile); + var tail = allEvents.Count > 50 ? allEvents.GetRange(allEvents.Count - 50, 50) : allEvents; + foreach (var (ts, type, detail) in tail) + recentEvents.Add(new { timestamp = ts, type, detail }); + } + } + catch { /* best-effort */ } + + // Count concurrent sessions + var totalSessions = _sessions.Count; + var processingSessions = _sessions.Values.Count(s => s.Info.IsProcessing); + + var capture = new Dictionary + { + ["capture_timestamp"] = now.ToString("O"), + ["trigger"] = "IDLE_FALLBACK", + ["session"] = new Dictionary + { + ["session_id"] = sessionId, + ["session_name"] = sessionName, + ["model"] = state.Info.Model, + ["history_size"] = state.Info.MessageCount, + ["is_multi_agent"] = state.IsMultiAgentSession, + }, + ["processing_state"] = new Dictionary + { + ["is_processing"] = state.Info.IsProcessing, + ["processing_phase"] = state.Info.ProcessingPhase, + ["active_tool_call_count"] = Volatile.Read(ref state.ActiveToolCallCount), + ["has_used_tools_this_turn"] = state.HasUsedToolsThisTurn, + ["successful_tool_count"] = state.SuccessfulToolCountThisTurn, + ["event_count_this_turn"] = Volatile.Read(ref state.EventCountThisTurn), + ["processing_generation"] = Interlocked.Read(ref state.ProcessingGeneration), + ["has_received_deltas"] = state.HasReceivedDeltasThisTurn, + }, + ["timing"] = new Dictionary + { + ["turn_end_age_seconds"] = Math.Round(turnEndAge, 2), + ["last_event_age_seconds"] = Math.Round(lastEventAge, 2), + ["fallback_tools_used"] = toolsUsed, + ["fallback_wait_ms"] = toolsUsed + ? TurnEndIdleFallbackMs + TurnEndIdleToolFallbackAdditionalMs + : TurnEndIdleFallbackMs, + }, + ["concurrency"] = new Dictionary + { + ["total_sessions"] = totalSessions, + ["processing_sessions"] = processingSessions, + }, + ["events_jsonl_tail"] = recentEvents, + }; + + var json = System.Text.Json.JsonSerializer.Serialize(capture, + new System.Text.Json.JsonSerializerOptions { WriteIndented = true }); + var fileName = $"capture_{now:yyyy-MM-ddTHH-mm-ss}_{sessionId[..Math.Min(8, sessionId.Length)]}.json"; + File.WriteAllText(Path.Combine(captureDir, fileName), json); + + Debug($"[ZERO-IDLE] '{sessionName}' capture written: {fileName} " + + $"(events={state.EventCountThisTurn}, historySize={state.Info.History.Count}, " + + $"turnEndAge={turnEndAge:F1}s, tools={toolsUsed})"); + } + catch (Exception ex) + { + Console.Error.WriteLine($"[ZeroIdleCapture] Failed to write capture: {ex.Message}"); + } + } + + /// + /// Deletes old zero-idle capture files, keeping the most recent 100. + /// Called once on startup. Never throws. + /// + internal static void PurgeOldCaptures(int keepCount = 100) + { + try + { + var dir = ZeroIdleCaptureDir; + if (!Directory.Exists(dir)) return; + var files = Directory.GetFiles(dir, "capture_*.json") + .OrderByDescending(f => f) + .Skip(keepCount) + .ToList(); + foreach (var file in files) + { + try { File.Delete(file); } catch { } + } + } + catch (Exception ex) + { + Console.Error.WriteLine($"[ZeroIdleCapture] Failed to purge old captures: {ex.Message}"); + } + } } diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 16df52c2a9..16bcc6f284 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -148,6 +148,7 @@ internal static void SetBaseDirForTesting(string path) _copilotBaseDir = null; _sessionStatePath = null; _pendingOrchestrationFile = null; + _zeroIdleCaptureDir = null; } } @@ -457,6 +458,13 @@ private class SessionState /// Timestamp when the most recent tool started. Used by the tool health check to /// determine if a tool has been running too long without any events. public long ToolStartedAtTicks; + /// Count of SDK events received during the current processing turn. + /// Incremented in HandleSessionEvent, reset in SendPromptAsync and CompleteResponse. + /// Used by zero-idle capture to quantify how much activity occurred before silence. + public int EventCountThisTurn; + /// Timestamp (UTC ticks) when AssistantTurnEndEvent was received. + /// Used by zero-idle capture to measure fallback wait duration. + public long TurnEndReceivedAtTicks; } private void Debug(string message) @@ -471,7 +479,7 @@ private void Debug(string message) message.StartsWith("[COMPLETE") || message.StartsWith("[SEND") || message.StartsWith("[RECONNECT") || message.StartsWith("[UI-ERR") || message.StartsWith("[DISPATCH") || message.StartsWith("[WATCHDOG") || - message.StartsWith("[HEALTH") || + message.StartsWith("[HEALTH") || message.StartsWith("[ZERO-IDLE") || message.Contains("watchdog")) { try @@ -2428,6 +2436,8 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis state.HasUsedToolsThisTurn = false; // Reset stale tool flag from previous turn state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); + Interlocked.Exchange(ref state.EventCountThisTurn, 0); // Reset event counter for zero-idle capture + Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0); // Cancel any pending TurnEnd→Idle fallback from the previous turn CancelTurnEndFallback(state); state.IsMultiAgentSession = IsSessionInMultiAgentGroup(sessionName); // Cache for watchdog (UI thread safe)