From ad0088000d48932ab96a602bdf1c2f86031fbdc4 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Tue, 17 Mar 2026 12:03:51 -0500 Subject: [PATCH 1/2] Add keepalive ping to prevent server idle timeout (#396) The headless Copilot server kills sessions after ~35 minutes of inactivity. This causes multi-agent workers to lose their work mid-execution when tool runs take a long time. Add a background keepalive loop that pings the server every 15 minutes via CopilotClient.PingAsync(). This resets the server's connection-level idle timer and prevents session cleanup. The keepalive: - Starts after InitializeAsync and ReconnectAsync (non-demo/remote) - Stops on ReconnectAsync teardown and DisposeAsync - Skips pings in Demo/Remote mode (no headless server) - Logs [KEEPALIVE] to the diagnostic log for traceability - Catches all exceptions to avoid crashing the app Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.cs | 63 ++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 8b910031bf..0e75805c8f 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -34,6 +34,8 @@ public partial class CopilotService : IAsyncDisposable private Timer? _saveUiStateDebounce; private UiState? _pendingUiState; private readonly object _uiStateLock = new(); + // Keepalive ping to prevent the headless server from killing idle sessions (~35 min timeout) + private CancellationTokenSource? _keepaliveCts; private readonly IChatDatabase _chatDb; private readonly IServerManager _serverManager; private readonly IWsBridgeClient _bridgeClient; @@ -522,6 +524,56 @@ private static void DisposePrematureIdleSignal(SessionState? state) try { state?.PrematureIdleSignal?.Dispose(); } catch { } } + /// Ping interval to prevent the headless server from killing idle sessions. + /// The server has a ~35 minute idle timeout; pinging every 15 minutes keeps sessions alive. + internal const int KeepalivePingIntervalSeconds = 15 * 60; // 15 minutes + + private void StartKeepalivePing() + { + StopKeepalivePing(); + var cts = new CancellationTokenSource(); + _keepaliveCts = cts; + _ = RunKeepalivePingAsync(cts.Token); + } + + private void StopKeepalivePing() + { + var prev = Interlocked.Exchange(ref _keepaliveCts, null); + if (prev != null) + { + try { prev.Cancel(); } catch { } + prev.Dispose(); + } + } + + private async Task RunKeepalivePingAsync(CancellationToken ct) + { + try + { + while (!ct.IsCancellationRequested) + { + await Task.Delay(TimeSpan.FromSeconds(KeepalivePingIntervalSeconds), ct); + if (ct.IsCancellationRequested) break; + + var client = _client; + if (client == null || IsDemoMode || IsRemoteMode) continue; + + try + { + await client.PingAsync("keepalive", ct); + Debug($"[KEEPALIVE] Ping sent to headless server"); + } + catch (OperationCanceledException) { break; } + catch (Exception ex) + { + Debug($"[KEEPALIVE] Ping failed: {ex.Message}"); + } + } + } + catch (OperationCanceledException) { } + catch (Exception ex) { Debug($"[KEEPALIVE] Loop exited: {ex.Message}"); } + } + private void Debug(string message) { LastDebugMessage = message; @@ -536,6 +588,7 @@ private void Debug(string message) message.StartsWith("[DISPATCH") || message.StartsWith("[WATCHDOG") || message.StartsWith("[HEALTH") || message.StartsWith("[ZERO-IDLE") || message.StartsWith("[PERMISSION") || message.StartsWith("[RESUME-ABORT") || + message.StartsWith("[KEEPALIVE") || message.Contains("watchdog")) { try @@ -834,6 +887,10 @@ public async Task InitializeAsync(CancellationToken cancellationToken = default) // Initialize any registered providers (from DI / plugin loader) await InitializeProvidersAsync(cancellationToken); + + // Start keepalive pinging to prevent server idle timeout + if (!IsDemoMode && !IsRemoteMode && _client != null) + StartKeepalivePing(); } /// @@ -897,6 +954,7 @@ public async Task ReconnectAsync(ConnectionSettings settings, CancellationToken _currentSettings = settings; StopConnectivityMonitoring(); + StopKeepalivePing(); await StopCodespaceHealthCheckAsync(); // Dispose existing sessions and client @@ -996,6 +1054,10 @@ public async Task ReconnectAsync(ConnectionSettings settings, CancellationToken // Re-initialize providers after reconnect await InitializeProvidersAsync(cancellationToken); + + // Start keepalive pinging to prevent server idle timeout + if (!IsDemoMode && !IsRemoteMode && _client != null) + StartKeepalivePing(); } /// @@ -4013,6 +4075,7 @@ public void ClearHistory(string name) public async ValueTask DisposeAsync() { StopConnectivityMonitoring(); + StopKeepalivePing(); await StopCodespaceHealthCheckAsync(); // Flush any pending debounced writes immediately From 802fe99d166e0f287e2ee7b85bfb0dfebca17180 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Tue, 17 Mar 2026 12:21:45 -0500 Subject: [PATCH 2/2] Fix review: keepalive in restart/recovery paths, atomic StartKeepalivePing - TryRecoverPersistentServerAsync: stop before server kill, start after successful recovery - RestartServerAsync: stop at entry, start after sessions restored - StartKeepalivePing: use Interlocked.Exchange instead of plain assignment to prevent race with concurrent DisposeAsync Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.cs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 0e75805c8f..3b7bee8e4b 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -530,9 +530,13 @@ private static void DisposePrematureIdleSignal(SessionState? state) private void StartKeepalivePing() { - StopKeepalivePing(); var cts = new CancellationTokenSource(); - _keepaliveCts = cts; + var prev = Interlocked.Exchange(ref _keepaliveCts, cts); + if (prev != null) + { + try { prev.Cancel(); } catch { } + prev.Dispose(); + } _ = RunKeepalivePingAsync(cts.Token); } @@ -1092,6 +1096,7 @@ internal async Task TryRecoverPersistentServerAsync() Debug("[SERVER-RECOVERY] Attempting persistent server recovery (auth/connectivity failure suspected)..."); // Stop the old server — it's running but broken (e.g., expired auth token cached in-process) + StopKeepalivePing(); _serverManager.StopServer(); // Wait for the old server to fully release the port @@ -1125,6 +1130,7 @@ internal async Task TryRecoverPersistentServerAsync() FallbackNotice = "Persistent server was automatically restarted due to repeated failures. Your sessions should work again."; Interlocked.Exchange(ref _consecutiveWatchdogTimeouts, 0); _lastRecoveryCompletedAt = DateTime.UtcNow; + StartKeepalivePing(); InvokeOnUI(() => OnStateChanged?.Invoke()); return true; } @@ -1162,6 +1168,7 @@ public async Task RestartServerAsync(CancellationToken cancellationToken = defau { Debug("[SERVER-RESTART] Restarting headless server due to native module failure..."); ServerHealthNotice = null; + StopKeepalivePing(); // 1. Dispose all existing sessions (they hold broken connections) foreach (var state in _sessions.Values) @@ -1231,6 +1238,7 @@ public async Task RestartServerAsync(CancellationToken cancellationToken = defau await RestorePreviousSessionsAsync(cancellationToken); FlushSaveActiveSessionsToDisk(); ReconcileOrganization(); + StartKeepalivePing(); OnStateChanged?.Invoke(); Debug("[SERVER-RESTART] Server restart complete, all sessions restored");