diff --git a/.claude/skills/health-monitoring/SKILL.md b/.claude/skills/health-monitoring/SKILL.md new file mode 100644 index 000000000..39a0c0f14 --- /dev/null +++ b/.claude/skills/health-monitoring/SKILL.md @@ -0,0 +1,395 @@ +--- +name: health-monitoring +description: > + Real-time health monitoring for PolyPilot sessions. Use when the user asks to + "monitor sessions", "watch for issues", "check health", "make sure everything is + working", "keep an eye on things", or any request to continuously observe running + PolyPilot sessions for reliability issues. Also use when the user reports a session + is "stuck", "not responding", shows "Thinking..." forever, or threw a "Tool execution + stuck" error — diagnose the issue using the techniques below. Covers: single sessions, + multi-agent orchestration, sub-agent IDLE-DEFER, app restart recovery, and connection + health. +--- + +# PolyPilot Health Monitoring + +Continuously monitor all running PolyPilot sessions for reliability issues. Your job is +to be a vigilant observer — detect problems early, diagnose root causes, and either fix +them or report them clearly. + +## Quick Start + +### 1. Start the live event stream + +```bash +tail -n0 -F ~/.polypilot/event-diagnostics.log +``` + +Run this in an async shell and read it periodically (every 2-5 minutes). This is your +primary signal source — every important state change is logged here. + +### 2. Initial baseline check + +Before monitoring, establish what "healthy" looks like right now: + +```bash +# How many sessions exist? +python3 -c "import json; d=json.load(open('$HOME/.polypilot/active-sessions.json')); print(f'{len(d)} active sessions')" + +# Any recent errors? +grep -E '\[ERROR\]|\[WATCHDOG\].*timeout|\[TOOL-HEALTH\].*recovery|\[RECONNECT\].*replacing' \ + ~/.polypilot/event-diagnostics.log | tail -10 + +# Any phantom sessions? +python3 -c " +import json +d=json.load(open('$HOME/.polypilot/active-sessions.json')) +phantoms=[s['DisplayName'] for s in d if '(previous)' in s.get('DisplayName','') or '(resumed)' in s.get('DisplayName','')] +print(f'{len(phantoms)} phantom sessions' + (': '+', '.join(phantoms) if phantoms else '')) +" +``` + +### 3. Periodic health checks + +Every 2-5 minutes, read the event stream and look for problems. Filter for the +important events — skip the noisy `[EVT]` lines unless diagnosing a specific issue: + +```bash +tail -200 ~/.polypilot/event-diagnostics.log | grep -E \ + 'ERROR|WATCHDOG|RECONNECT|TOOL-HEALTH|COMPLETE|IDLE-DEFER|SEND|KEEPALIVE|HEALTH|DISPATCH|ABORT|INTERRUPTED' +``` + +## What Healthy Looks Like + +A healthy session lifecycle follows this pattern: + +``` +[SEND] → [EVT] TurnStart → [EVT] TurnEnd → [EVT] TurnStart → ... → [EVT] SessionIdle → [IDLE] → [COMPLETE] +``` + +Key indicators of health: +- **SEND → COMPLETE** cycle completes (check `flushedLen` is non-zero for real responses) +- **IDLE-DEFER** fires when sub-agents are active, then COMPLETE fires when they finish +- **KEEPALIVE** pings appear every 15 minutes +- **No ERROR, WATCHDOG timeout, or RECONNECT events** + +## What Problems Look Like + +### Problem: Session stuck at "Thinking..." + +**Symptom:** `[SEND]` logged but no `[COMPLETE]` after several minutes. + +**Diagnosis:** +```bash +# Check the last events for the stuck session +grep "SessionName" ~/.polypilot/event-diagnostics.log | tail -10 +``` + +**Possible causes:** +1. **No events flowing** → Connection may be dead. Look for `[WATCHDOG]` events — the + watchdog should detect and recover within 2 minutes (120s inactivity timeout). +2. **Events flowing but no IDLE** → Session is legitimately working (tool execution). + Check if `AssistantTurnStart/End` events are still arriving. +3. **IDLE-DEFER active** → Sub-agents are still running. Check `backgroundTasks` count + in the `[IDLE-DIAG]` line. This is normal for multi-agent and sub-agent sessions. + +### Problem: "Tool execution stuck" error message + +**Symptom:** User sees "Tool execution stuck (reason). Session recovered automatically." + +**Diagnosis:** +```bash +grep "TOOL-HEALTH" ~/.polypilot/event-diagnostics.log | tail -10 +``` + +This fires when the `ToolHealthCheck` timer detects no events for 30s after a tool +starts, and the server is either dead or has been stale for multiple checks. The session +is auto-recovered — check if `[COMPLETE]` follows. If the response was meaningful +(`flushedLen > 0`), the recovery worked. If `flushedLen=0`, the tool died before +producing output. + +### Problem: Client recreation / Blazor surface reset + +**Symptom:** All sessions briefly reset in the UI, user sees a flash. + +**Diagnosis:** +```bash +grep "RECONNECT.*replacing state\|Recreating client" ~/.polypilot/event-diagnostics.log | tail -5 +``` + +This happens when `SendAsync` throws a connection error. The app recreates the +`CopilotClient` and re-resumes all sessions. Check: +- **What session triggered it** — the `[SEND]` just before the first `[RECONNECT]` +- **Did all sessions recover?** — Look for `Failed to re-resume sibling` entries +- **Was the server alive?** — Check if `[KEEPALIVE]` pings succeeded before/after + +```bash +# Find sessions that failed to re-resume +grep "Failed to re-resume" ~/.polypilot/event-diagnostics.log | tail -10 +``` + +Sessions that fail to re-resume with "Session not found" had their server-side session +expired. This is normal for very old idle sessions. + +### Problem: Watchdog timeout + +**Symptom:** `[WATCHDOG]` line with `IsProcessing=false` and a system message about +"Session appears stuck". + +**Diagnosis:** +```bash +grep "WATCHDOG.*IsProcessing=false\|WATCHDOG.*timeout\|WATCHDOG.*stuck" \ + ~/.polypilot/event-diagnostics.log | tail -5 +``` + +Check which timeout tier fired: +- **30s** (resume quiescence) — Session was resumed after restart but never received + events. The turn likely completed before the restart. Normal. +- **120s** (inactivity) — No events for 2 minutes with no tool activity. Connection + may have dropped. +- **600s** (tool execution) — Tool was running for 10 minutes with no events. Rare + but can happen for very long builds. + +### Problem: Phantom `(previous)` sessions + +**Symptom:** Session list shows duplicates with `(previous)` suffix. + +**Diagnosis:** +```bash +python3 -c " +import json +d=json.load(open('$HOME/.polypilot/active-sessions.json')) +for s in d: + if '(previous)' in s.get('DisplayName',''): + print(f'{s[\"DisplayName\"]}: group={s.get(\"GroupId\",\"\")}, recovered={s.get(\"RecoveredFromSessionId\",\"\")}')" +``` + +These are caused by worker revival creating a new session while the old one lingers in +`active-sessions.json`. The fix in PR #531 added `_closedSessionIds` tracking to prevent +this. If you still see phantoms, check `RecoveredFromSessionId` — the old session should +have been excluded from the merge. + +### Problem: DISPATCH-RECOVER false positive + +**Symptom:** `[DISPATCH-RECOVER]` in the log when workers completed normally. + +**Diagnosis:** +```bash +grep "DISPATCH-RECOVER" ~/.polypilot/event-diagnostics.log | tail -5 +``` + +This should be rare after PR #531's two-phase mtime check. If it fires, check: +1. Was the worker's `events.jsonl` actually written to during the grace period? +2. Did a `session.resume` event cause a false write? + +## Multi-Agent Monitoring + +Multi-agent sessions have extra complexity. Monitor these patterns: + +### Healthy orchestrator flow +``` +[DISPATCH-ROUTE] → [SEND] orchestrator → [IDLE-DEFER] (agents=N) → ... → [COMPLETE] +``` + +Between IDLE-DEFER and COMPLETE, workers are running. Check worker events: +```bash +grep "worker" ~/.polypilot/event-diagnostics.log | tail -20 +``` + +### Worker completion timing +Workers should complete in 2-30s for simple tasks, 1-10 minutes for complex ones. +If a worker hasn't produced events in 5+ minutes, check its events.jsonl: + +```bash +# Find worker session ID +python3 -c " +import json +d=json.load(open('$HOME/.polypilot/active-sessions.json')) +for s in d: + if 'worker' in s.get('DisplayName','').lower(): + print(f'{s[\"DisplayName\"]}: {s[\"SessionId\"]}')" | head -10 + +# Check last event time for a specific worker +ls -la ~/.copilot/session-state/SESSION_ID/events.jsonl +``` + +### IDLE-DEFER-REARM +When `session.idle` arrives with active background tasks but `IsProcessing` was already +cleared (by watchdog or race), IDLE-DEFER-REARM re-arms processing. This is correct +behavior — it keeps the session alive until sub-agents finish: + +``` +[IDLE-DEFER-REARM] 'SessionName' re-arming IsProcessing — background tasks active but processing was cleared +``` + +## Post-Relaunch Recovery Check + +After `./relaunch.sh`, do an exhaustive recovery check: + +```bash +# 1. Wait for the app to restart (~15-20s) +sleep 20 + +# 2. Check the relaunch log +tail -5 ~/.polypilot/relaunch.log + +# 3. Check for RECONNECT events (the new app reconnects all sessions) +grep "RECONNECT" ~/.polypilot/event-diagnostics.log | tail -20 + +# 4. Check for failed re-resumes +grep "Failed to re-resume" ~/.polypilot/event-diagnostics.log | tail -10 + +# 5. Verify no sessions are stuck processing +grep "IsProcessing=True" ~/.polypilot/event-diagnostics.log | tail -5 + +# 6. Check for phantom sessions +python3 -c " +import json +d=json.load(open('$HOME/.polypilot/active-sessions.json')) +phantoms=[s['DisplayName'] for s in d if '(previous)' in s.get('DisplayName','')] +stuck=[s['DisplayName'] for s in d if s.get('LastPrompt')] +print(f'Phantoms: {len(phantoms)}, Sessions with pending prompts: {len(stuck)}') +if phantoms: print(' Phantom:', phantoms) +if stuck: print(' Pending:', stuck) +" + +# 7. If any sessions were actively processing during relaunch, verify they resumed +grep "SEND\|COMPLETE" ~/.polypilot/event-diagnostics.log | tail -10 +``` + +**What to look for:** +- All sessions should show `[RECONNECT] Re-resumed sibling session` (except expired ones) +- Sessions that were processing should either complete or be detected by the watchdog +- No `(previous)` phantom sessions should appear +- The `[HEALTH] Connection healthy after resume/wake` event should appear + +## Reporting + +When monitoring, provide periodic status updates: + +| Session | Status | Details | +|---------|--------|---------| +| Name | ✅/⚠️/❌ | Brief description | + +Use: +- ✅ for clean SEND→COMPLETE cycles +- ⚠️ for IDLE-DEFER (expected but worth noting), RECONNECT (recovered), long-running tools +- ❌ for WATCHDOG timeouts, TOOL-HEALTH recovery, ERROR events, stuck sessions + +If you detect an issue, diagnose it immediately using the techniques above. If it's a +code bug (not a transient connection issue), investigate the source code and propose a fix. + +## Mobile App Monitoring (WsBridge + DevTunnel) + +The mobile app connects to the desktop via WebSocket through a DevTunnel. Monitor this +entire chain for reliability issues. + +### Architecture + +``` +Mobile App (iOS/Android) + → DevTunnel (wss://TUNNEL_ID.usw3.devtunnels.ms) + → WsBridgeServer (localhost:4322) + → CopilotService (desktop) +``` + +### Quick Health Check + +```bash +# 1. Is the bridge server listening? +lsof -i :4322 + +# 2. Is DevTunnel running? +ps aux | grep "devtunnel host" | grep -v grep + +# 3. Is the tunnel reachable? +curl -s -o /dev/null -w "%{http_code}" https://TUNNEL_ID.usw3.devtunnels.ms/ + +# 4. Is the devtunnel login still valid? +devtunnel show TUNNEL_ID 2>&1 | head -5 +# If "Login token expired" → tunnel may reject new connections + +# 5. Check for connected mobile clients +# WsBridgeServer logs connect/disconnect to stdout (Console.WriteLine) +# Check app logs: +maui devflow MAUI logs --limit 30 --agent-port 9223 2>&1 | grep -i "client\|connect\|bridge" + +# 6. Bridge-related errors in crash log +grep -i "bridge\|WebSocket\|tunnel" ~/.polypilot/crash.log | tail -5 + +# 7. Bridge-related diagnostic events +grep -E "BRIDGE|SyncRemote|SmartURL" ~/.polypilot/event-diagnostics.log | tail -10 +``` + +### Common Mobile Issues + +#### Mobile shows "Connecting..." or blank session list +1. Check DevTunnel is running (`ps aux | grep devtunnel`) +2. Check bridge port is listening (`lsof -i :4322`) +3. Check tunnel is reachable (`curl` the tunnel URL — expect 404, not connection error) +4. If `devtunnel show` says "Login token expired" — the tunnel host process may still + work but new management operations fail. The existing tunnel should keep forwarding. + +#### Mobile sends message but never gets response +The bridge proxies prompts via `DispatchBridgePromptAsync`. Check: +```bash +# Stack traces from bridge prompt dispatch +grep "DispatchBridgePromptAsync" ~/.polypilot/event-diagnostics.log | tail -5 + +# Bridge completion events +grep "BRIDGE-COMPLETE\|BRIDGE-SESSION" ~/.polypilot/event-diagnostics.log | tail -10 +``` + +If `DispatchBridgePromptAsync` has stack traces, the desktop CopilotService failed to +process the prompt. The mobile will show "Working..." indefinitely. Check if the +triggering session had a connection error (look for RECONNECT events around the same time). + +#### Mobile shows session as "Working" when desktop shows idle +The bridge syncs `IsProcessing` state. If a reconnect force-completed a session on +desktop but the bridge state-sync hasn't fired yet, mobile shows stale state. The +next `SyncRemoteSessions` call (triggered by `OnStateChanged`) should fix it. If not: +```bash +grep "SyncRemoteSessions" ~/.polypilot/event-diagnostics.log | tail -5 +``` + +#### Mobile disconnects frequently +Check network quality. The WebSocket connection goes through DevTunnel → Azure → +mobile network. Each hop can drop. The `WsBridgeClient` on mobile has auto-reconnect +logic, but gaps cause temporary UI freezes. + +```bash +# Check for SmartURL network change events (WiFi↔Cellular transitions) +grep "SmartURL" ~/.polypilot/event-diagnostics.log | tail -10 + +# Check HEALTH events (Mac wake/sleep affects tunnel) +grep "HEALTH" ~/.polypilot/event-diagnostics.log | tail -10 +``` + +### Bridge-Specific Diagnostic Tags + +| Tag | Meaning | +|-----|---------| +| `[BRIDGE-COMPLETE]` | Bridge `OnTurnEnd` cleared IsProcessing for a remote session | +| `[BRIDGE-SESSION-COMPLETE]` | Stale IsProcessing cleared during bridge sync | +| `[SmartURL]` | Network change detected (WiFi gain/loss) — may trigger reconnect | +| `[HEALTH]` | Connection health check after Mac wake/sleep | + +### DevTunnel Management + +```bash +# Re-login if token expired (interactive — opens browser) +devtunnel login + +# Show tunnel details +devtunnel show TUNNEL_ID + +# Restart tunnel hosting (if process died) +devtunnel host TUNNEL_ID --allow-anonymous & + +# Check tunnel port forwarding +devtunnel list +``` + +Note: The DevTunnel host process runs independently from PolyPilot. If the Mac sleeps +and wakes, the tunnel process usually survives but the WebSocket connections through +it may need to re-establish. diff --git a/PolyPilot.Tests/SessionStabilityTests.cs b/PolyPilot.Tests/SessionStabilityTests.cs index a10b680c5..6d0efe182 100644 --- a/PolyPilot.Tests/SessionStabilityTests.cs +++ b/PolyPilot.Tests/SessionStabilityTests.cs @@ -79,27 +79,21 @@ public void ForceCompleteProcessing_ClearsAllInv1Fields() var source = File.ReadAllText(TestPaths.OrganizationCs); var method = ExtractMethod(source, "Task ForceCompleteProcessingAsync"); - // Every INV-1 field must be cleared - var requiredClears = new[] + // ForceCompleteProcessingAsync must call ClearProcessingState (which atomically + // clears all INV-1 fields) and the other required operations + var requiredPatterns = new[] { - "ActiveToolCallCount", // INV-1 field 3 - "HasUsedToolsThisTurn", // INV-1 field 2 - "SendingFlag", // INV-1 field 7 - "IsResumed", // INV-1 field 1 - "ProcessingStartedAt", // INV-1 field 4 - "ToolCallCount", // INV-1 field 5 - "ProcessingPhase", // INV-1 field 6 - "ClearPermissionDenials", // INV-1 field 8 - "FlushCurrentResponse", // INV-1 field 9 - "IsProcessing", // The flag itself - "OnSessionComplete", // INV-1 field 10 - "TrySetResult", // Resolves the worker TCS + "ClearProcessingState(state", // Atomic INV-1 field clearing + "FlushCurrentResponse", // INV-1 field 9 + "OnSessionComplete", // INV-1 field 10 + "TrySetResult", // Resolves the worker TCS + "AllowTurnStartRearm = false", // Explicit recovery terminal }; - foreach (var field in requiredClears) + foreach (var field in requiredPatterns) { Assert.True(method.Contains(field, StringComparison.Ordinal), - $"ForceCompleteProcessingAsync must clear '{field}' (INV-1 compliance)"); + $"ForceCompleteProcessingAsync must contain '{field}'"); } } @@ -141,12 +135,15 @@ public void ForceCompleteProcessing_BoundsAbortAsyncTimeout() } [Fact] - public void ForceCompleteProcessing_PreservesCarryOverDeferredIdleFingerprint() + public void ForceCompleteProcessing_UsesClearProcessingState() { + // ForceCompleteProcessingAsync must delegate to ClearProcessingState rather than + // manually clearing fields. ClearProcessingState calls ClearDeferredIdleTracking + // internally (without preserveCarryOver, which is only needed in SendPromptAsync). var source = File.ReadAllText(TestPaths.OrganizationCs); var method = ExtractMethod(source, "Task ForceCompleteProcessingAsync"); - Assert.Contains("ClearDeferredIdleTracking(state, preserveCarryOver: true)", method); + Assert.Contains("ClearProcessingState(state", method); } [Fact] diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs index b8c790b49..37d39ba15 100644 --- a/PolyPilot/Services/CopilotService.Organization.cs +++ b/PolyPilot/Services/CopilotService.Organization.cs @@ -2216,24 +2216,13 @@ private async Task ForceCompleteProcessingAsync(string sessionName, SessionState { if (!state.Info.IsProcessing) { tcs.TrySetResult(true); return; } - // Full cleanup mirroring CompleteResponse / unstartedWorkers recovery + // Full cleanup via ClearProcessingState FlushCurrentResponse(state); - Interlocked.Exchange(ref state.ActiveToolCallCount, 0); - Interlocked.Exchange(ref state.SendingFlag, 0); - Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); Interlocked.Exchange(ref state.WatchdogCaseBResets, 0); Interlocked.Exchange(ref state.WatchdogCaseBLastFileSize, 0); Interlocked.Exchange(ref state.WatchdogCaseBStaleCount, 0); - state.HasUsedToolsThisTurn = false; - ClearDeferredIdleTracking(state, preserveCarryOver: true); - state.AllowTurnStartRearm = false; // Force-complete is explicit recovery, not a speculative idle completion state.FallbackCanceledByTurnStart = false; - state.Info.IsResumed = false; - state.Info.ProcessingStartedAt = null; - state.Info.ToolCallCount = 0; - state.Info.ProcessingPhase = 0; - state.Info.ClearPermissionDenials(); var response = state.CurrentResponse.ToString(); var fullResponse = state.FlushedResponse.Length > 0 @@ -2242,11 +2231,11 @@ private async Task ForceCompleteProcessingAsync(string sessionName, SessionState : state.FlushedResponse + "\n\n" + response) : response; - state.CurrentResponse.Clear(); - state.FlushedResponse.Clear(); - ClearFlushedReplayDedup(state); - state.PendingReasoningMessages.Clear(); - state.Info.IsProcessing = false; + // Accumulate API time but don't count as premium request (forced recovery) + if (state.Info.ProcessingStartedAt is { } forceStarted) + state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - forceStarted).TotalSeconds; + ClearProcessingState(state, accumulateApiTime: false); + state.AllowTurnStartRearm = false; // Force-complete is explicit recovery, not a speculative idle completion state.ResponseCompletion?.TrySetResult(fullResponse); var summary = fullResponse.Length > 0 ? (fullResponse.Length > 100 ? fullResponse[..100] + "..." : fullResponse) : ""; diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 3d1de493c..7a98814f3 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -919,6 +919,28 @@ internal static bool ShouldPersistDiagnostic(string message) message.Contains("watchdog") || message.Contains("Failed to"); } + /// + /// Static logging entry point for WsBridgeServer diagnostics. + /// Writes directly to the event-diagnostics.log file without requiring a CopilotService instance. + /// + internal static void LogBridgeDiagnostic(string message) + { + Console.WriteLine($"[DEBUG] {message}"); + if (ShouldPersistDiagnostic(message)) + { + try + { + lock (_diagnosticLogLock) + { + var logPath = Path.Combine(PolyPilotBaseDir, "event-diagnostics.log"); + File.AppendAllText(logPath, + $"{DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} {message}{Environment.NewLine}"); + } + } + catch { } + } + } + private void Debug(string message) { LastDebugMessage = message; @@ -3789,6 +3811,10 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis { Debug($"[RECONNECT] Session ID changed on resume: '{state.Info.SessionId}' → '{actualId}' for '{sessionName}'"); CopyEventsToNewSession(state.Info.SessionId, actualId); + // Mark the old session ID as closed so the merge in SaveActiveSessionsToDisk + // doesn't re-add it — otherwise a stale entry with the old ID lingers and + // gets renamed to "(previous)" on the next save cycle. + _closedSessionIds[state.Info.SessionId] = 0; state.Info.SessionId = actualId; // Persist the new session ID so restarts don't revert to the old one FlushSaveActiveSessionsToDisk(); diff --git a/PolyPilot/Services/WsBridgeServer.cs b/PolyPilot/Services/WsBridgeServer.cs index b4df8e13c..01abbf302 100644 --- a/PolyPilot/Services/WsBridgeServer.cs +++ b/PolyPilot/Services/WsBridgeServer.cs @@ -43,6 +43,26 @@ private record PendingBridgePrompt(string SessionName, string Message, string? A public bool IsRunning => _listener?.IsListening == true && _proxyListener != null; public bool SupportsRemoteConnections { get; private set; } + /// + /// Logs a bridge diagnostic message to the event-diagnostics log via CopilotService. + /// Falls back to Console.WriteLine if CopilotService is not yet attached. + /// All messages are prefixed with [BRIDGE] for filtering. + /// + private void BridgeLog(string message) + { + var tagged = message.StartsWith("[") ? message : $"[BRIDGE] {message}"; + if (_copilot != null) + { + // Use reflection-free approach: CopilotService.LogExternal is a public static method + // that writes to the event-diagnostics log without requiring instance access. + CopilotService.LogBridgeDiagnostic(tagged); + } + else + { + Console.WriteLine(tagged); + } + } + /// /// Access token that clients must provide via X-Tunnel-Authorization header or query param. /// @@ -69,12 +89,12 @@ public void Start(int bridgePort, int targetPort) if (TryBindBridgePipeline(bridgePort)) { - Console.WriteLine($"[WsBridge] Listening on port {bridgePort} (state-sync mode, network via loopback proxy localhost:{_internalListenerPort})"); + BridgeLog($"[BRIDGE] Listening on port {bridgePort} (state-sync mode, network via loopback proxy localhost:{_internalListenerPort})"); OnStateChanged?.Invoke(); } else { - Console.WriteLine($"[WsBridge] Port {bridgePort} unavailable — will retry in accept loops"); + BridgeLog($"[BRIDGE] Port {bridgePort} unavailable — will retry in accept loops"); } _acceptTask = AcceptLoopAsync(_cts.Token); @@ -113,12 +133,12 @@ private bool TryBindInternalListener() listener.Start(); _listener = listener; _internalListenerPort = port; - Console.WriteLine($"[WsBridge] Internal listener on localhost:{port}"); + BridgeLog($"[BRIDGE] Internal listener on localhost:{port}"); return true; } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Internal bind on localhost:{port} failed: {ex.Message}"); + BridgeLog($"[BRIDGE] Internal bind on localhost:{port} failed: {ex.Message}"); } } @@ -133,12 +153,12 @@ private bool TryBindProxyListener(int port) proxy.Server.DualMode = true; proxy.Start(); _proxyListener = proxy; - Console.WriteLine($"[WsBridge] Public proxy listening on port {port} (dual-stack)"); + BridgeLog($"[BRIDGE] Public proxy listening on port {port} (dual-stack)"); return true; } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Public dual-stack bind on port {port} failed: {ex.Message}"); + BridgeLog($"[BRIDGE] Public dual-stack bind on port {port} failed: {ex.Message}"); } try @@ -146,12 +166,12 @@ private bool TryBindProxyListener(int port) var proxy = new TcpListener(IPAddress.Any, port); proxy.Start(); _proxyListener = proxy; - Console.WriteLine($"[WsBridge] Public proxy listening on port {port} (IPv4)"); + BridgeLog($"[BRIDGE] Public proxy listening on port {port} (IPv4)"); return true; } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Public bind on port {port} failed: {ex.Message}"); + BridgeLog($"[BRIDGE] Public bind on port {port} failed: {ex.Message}"); return false; } } @@ -296,14 +316,14 @@ public async Task DrainPendingPromptsAsync() { while (_pendingBridgePrompts.TryDequeue(out var pending)) { - Console.WriteLine($"[BRIDGE] Replaying queued prompt for '{pending.SessionName}'"); + BridgeLog($"[BRIDGE] Replaying queued prompt for '{pending.SessionName}'"); try { await DispatchBridgePromptAsync(pending.SessionName, pending.Message, pending.AgentMode); } catch (Exception ex) { - Console.WriteLine($"[BRIDGE] Failed to replay prompt for '{pending.SessionName}': {ex.Message}"); + BridgeLog($"[BRIDGE] Failed to replay prompt for '{pending.SessionName}': {ex.Message}"); } } } @@ -329,7 +349,7 @@ private async Task DispatchBridgePromptAsync(string sessionName, string message, var orchGroup = _copilot.Organization.Groups.FirstOrDefault(g => g.Id == orchGroupId); if (orchGroup?.OrchestratorMode == MultiAgentMode.OrchestratorReflect) _copilot.StartGroupReflection(orchGroupId, message, orchGroup.MaxReflectIterations ?? 5); - Console.WriteLine($"[WsBridge] Routing '{sessionName}' through orchestration pipeline (group={orchGroupId})"); + BridgeLog($"[BRIDGE] Routing '{sessionName}' through orchestration pipeline (group={orchGroupId})"); await _copilot.SendToMultiAgentGroupAsync(orchGroupId, message, ct); } else @@ -351,13 +371,13 @@ private async Task DispatchBridgePromptAsync(string sessionName, string message, { // Orchestrated sessions route through SendToMultiAgentGroupAsync which has // its own busy handling; blindly queuing would bypass the orchestration pipeline. - Console.WriteLine($"[WsBridge] Orchestrator '{sessionName}' busy, dropping mobile message (retry manually)"); + BridgeLog($"[BRIDGE] Orchestrator '{sessionName}' busy, dropping mobile message (retry manually)"); Broadcast(BridgeMessage.Create(BridgeMessageTypes.ErrorEvent, new ErrorPayload { SessionName = sessionName, Error = "Session is busy processing a request. Please retry when the current turn completes." })); } else { - Console.WriteLine($"[WsBridge] '{sessionName}' busy, queuing mobile message for next turn"); + BridgeLog($"[BRIDGE] '{sessionName}' busy, queuing mobile message for next turn"); _copilot.EnqueueMessage(sessionName, message, agentMode: agentMode); } return Task.CompletedTask; @@ -378,7 +398,7 @@ public void Stop() foreach (var kvp in _clientSendLocks) kvp.Value.Dispose(); _clientSendLocks.Clear(); StopListenersOnly(); - Console.WriteLine("[WsBridge] Stopped"); + BridgeLog("[BRIDGE] Stopped"); OnStateChanged?.Invoke(); } @@ -425,7 +445,7 @@ private async Task AcceptLoopAsync(CancellationToken ct) { context.Response.StatusCode = 429; context.Response.Close(); - Console.WriteLine("[WsBridge] Pair request rate-limited"); + BridgeLog("[BRIDGE] Pair request rate-limited"); continue; } _ = Task.Run(() => HandlePairHandshakeAsync(context, ct), ct); @@ -436,7 +456,7 @@ private async Task AcceptLoopAsync(CancellationToken ct) { context.Response.StatusCode = 401; context.Response.Close(); - Console.WriteLine("[WsBridge] Rejected unauthenticated WebSocket connection"); + BridgeLog("[BRIDGE] Rejected unauthenticated WebSocket connection"); continue; } _ = Task.Run(() => HandleClientAsync(context, ct), ct); @@ -485,13 +505,13 @@ private async Task AcceptLoopAsync(CancellationToken ct) catch (HttpListenerException ex) { if (ct.IsCancellationRequested) break; - Console.WriteLine($"[WsBridge] Listener error ({ex.ErrorCode}): {ex.Message} — will restart"); + BridgeLog($"[BRIDGE] Listener error ({ex.ErrorCode}): {ex.Message} — will restart"); await _restartLock.WaitAsync(CancellationToken.None).ConfigureAwait(false); try { StopListenersOnly(); } finally { _restartLock.Release(); } } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Accept error: {ex.Message}"); + BridgeLog($"[BRIDGE] Accept error: {ex.Message}"); } } } @@ -539,13 +559,13 @@ private async Task ProxyAcceptLoopAsync(CancellationToken ct) catch (SocketException ex) { if (ct.IsCancellationRequested) break; - Console.WriteLine($"[WsBridge] Proxy accept error: {ex.Message} — will restart"); + BridgeLog($"[BRIDGE] Proxy accept error: {ex.Message} — will restart"); await _restartLock.WaitAsync(CancellationToken.None).ConfigureAwait(false); try { StopListenersOnly(); } finally { _restartLock.Release(); } } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Proxy error: {ex.Message}"); + BridgeLog($"[BRIDGE] Proxy error: {ex.Message}"); } } } @@ -568,7 +588,7 @@ private async Task TryRestartListenerAsync(CancellationToken ct) if (TryBindBridgePipeline(_bridgePort)) { - Console.WriteLine($"[WsBridge] Restarted listening on port {_bridgePort}"); + BridgeLog($"[BRIDGE] Restarted listening on port {_bridgePort}"); OnStateChanged?.Invoke(); return true; } @@ -605,7 +625,7 @@ private async Task ProxyClientAsync(TcpClient client, CancellationToken ct) var request = await ReadProxyRequestAsync(downstreamStream, ct).ConfigureAwait(false); if (request == null) { - Console.WriteLine("[WsBridge] Proxy request dropped (incomplete headers or exceeded 64KB limit)"); + BridgeLog("[BRIDGE] Proxy request dropped (incomplete headers or exceeded 64KB limit)"); return; } @@ -627,7 +647,7 @@ private async Task ProxyClientAsync(TcpClient client, CancellationToken ct) catch (OperationCanceledException) { } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Proxy client error: {ex.Message}"); + BridgeLog($"[BRIDGE] Proxy client error: {ex.Message}"); } } @@ -799,7 +819,7 @@ private async Task HandleClientAsync(HttpListenerContext httpContext, Cancellati ws = wsContext.WebSocket; _clients[clientId] = ws; _clientSendLocks[clientId] = new SemaphoreSlim(1, 1); - Console.WriteLine($"[WsBridge] Client {clientId} connected ({_clients.Count} total)"); + BridgeLog($"[BRIDGE] Client {clientId} connected ({_clients.Count} total)"); // Send initial state — if the server is still restoring sessions, wait so the // client doesn't see sessions with MessageCount=0 (History hasn't loaded from @@ -808,14 +828,14 @@ private async Task HandleClientAsync(HttpListenerContext httpContext, Cancellati { if (_copilot.IsRestoring) { - Console.WriteLine($"[WsBridge] Client {clientId} connected while server is restoring — waiting for restore to complete"); + BridgeLog($"[BRIDGE] Client {clientId} connected while server is restoring — waiting for restore to complete"); var restoreDeadline = DateTime.UtcNow.AddSeconds(30); while (_copilot.IsRestoring && DateTime.UtcNow < restoreDeadline && !ct.IsCancellationRequested) await Task.Delay(200, ct); if (!_copilot.IsRestoring) - Console.WriteLine($"[WsBridge] Restore complete — sending session list to client {clientId}"); + BridgeLog($"[BRIDGE] Restore complete — sending session list to client {clientId}"); else - Console.WriteLine($"[WsBridge] Restore still running after 30s — sending partial session list to client {clientId}"); + BridgeLog($"[BRIDGE] Restore still running after 30s — sending partial session list to client {clientId}"); } await SendToClientAsync(clientId, ws, BridgeMessage.Create(BridgeMessageTypes.SessionsList, BuildSessionsListPayload()), ct); @@ -869,7 +889,7 @@ await SendToClientAsync(clientId, ws, catch (OperationCanceledException) { } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Client {clientId} error: {ex.Message}"); + BridgeLog($"[BRIDGE] Client {clientId} error: {ex.Message}"); } finally { @@ -881,7 +901,7 @@ await SendToClientAsync(clientId, ws, catch { } } ws?.Dispose(); - Console.WriteLine($"[WsBridge] Client {clientId} disconnected ({_clients.Count} remaining)"); + BridgeLog($"[BRIDGE] Client {clientId} disconnected ({_clients.Count} remaining)"); } } @@ -909,7 +929,7 @@ await SendToClientAsync(clientId, ws, var sendReq = msg.GetPayload(); if (sendReq != null && !string.IsNullOrWhiteSpace(sendReq.SessionName) && !string.IsNullOrWhiteSpace(sendReq.Message)) { - Console.WriteLine($"[WsBridge] Client sending message to '{sendReq.SessionName}'"); + BridgeLog($"[BRIDGE] Client sending message to '{sendReq.SessionName}'"); // Fire-and-forget: don't block the client message loop waiting for the full response. // SendPromptAsync awaits ResponseCompletion (minutes). Responses stream back via events. // Blocking here prevents the client from sending abort, switch, or other commands. @@ -921,7 +941,7 @@ await SendToClientAsync(clientId, ws, if (_copilot.IsRestoring) { _pendingBridgePrompts.Enqueue(new PendingBridgePrompt(sendSession, sendMessage, sendAgentMode)); - Console.WriteLine($"[BRIDGE] Queued prompt for '{sendSession}' during restore ({_pendingBridgePrompts.Count} pending)"); + BridgeLog($"[BRIDGE] Queued prompt for '{sendSession}' during restore ({_pendingBridgePrompts.Count} pending)"); break; } @@ -929,7 +949,7 @@ await SendToClientAsync(clientId, ws, _ = Task.Run(async () => { try { await DispatchBridgePromptAsync(sendSession, sendMessage, sendAgentMode, ct); } - catch (Exception ex) { Console.WriteLine($"[WsBridge] SendPromptAsync error for '{sendSession}': {ex.Message}"); } + catch (Exception ex) { BridgeLog($"[BRIDGE] SendPromptAsync error for '{sendSession}': {ex.Message}"); } }); } break; @@ -949,14 +969,14 @@ await SendToClientAsync(clientId, ws, createReq.WorkingDirectory.Contains("..") || !Directory.Exists(createReq.WorkingDirectory)) { - Console.WriteLine($"[WsBridge] Rejected invalid WorkingDirectory: {createReq.WorkingDirectory}"); + BridgeLog($"[BRIDGE] Rejected invalid WorkingDirectory: {createReq.WorkingDirectory}"); await SendToClientAsync(clientId, ws, BridgeMessage.Create(BridgeMessageTypes.ErrorEvent, new ErrorPayload { SessionName = createReq.Name, Error = $"Working directory not found on server: {createReq.WorkingDirectory}" }), ct); break; } } - Console.WriteLine($"[WsBridge] Client creating session '{createReq.Name}'"); + BridgeLog($"[BRIDGE] Client creating session '{createReq.Name}'"); await _copilot.CreateSessionAsync(createReq.Name, createReq.Model, createReq.WorkingDirectory, ct); BroadcastSessionsList(); BroadcastOrganizationState(); @@ -990,24 +1010,24 @@ await SendToClientAsync(clientId, ws, // Validate session ID is a valid GUID to prevent path traversal if (!Guid.TryParse(resumeReq.SessionId, out _)) { - Console.WriteLine($"[WsBridge] Rejected invalid session ID format: {resumeReq.SessionId}"); + BridgeLog($"[BRIDGE] Rejected invalid session ID format: {resumeReq.SessionId}"); await SendToClientAsync(clientId, ws, BridgeMessage.Create(BridgeMessageTypes.ErrorEvent, new ErrorPayload { SessionName = resumeReq.DisplayName ?? "Unknown", Error = "Invalid session ID format" }), ct); break; } - Console.WriteLine($"[WsBridge] Client resuming session '{resumeReq.SessionId}'"); + BridgeLog($"[BRIDGE] Client resuming session '{resumeReq.SessionId}'"); var displayName = resumeReq.DisplayName ?? "Resumed"; try { await _copilot.ResumeSessionAsync(resumeReq.SessionId, displayName, workingDirectory: null, model: null, cancellationToken: ct); - Console.WriteLine($"[WsBridge] Session resumed successfully, broadcasting updated list"); + BridgeLog($"[BRIDGE] Session resumed successfully, broadcasting updated list"); BroadcastSessionsList(); BroadcastOrganizationState(); } catch (Exception resumeEx) { - Console.WriteLine($"[WsBridge] Resume failed: {resumeEx.Message}"); + BridgeLog($"[BRIDGE] Resume failed: {resumeEx.Message}"); await SendToClientAsync(clientId, ws, BridgeMessage.Create(BridgeMessageTypes.ErrorEvent, new ErrorPayload { SessionName = displayName, Error = $"Resume failed: {resumeEx.Message}" }), ct); @@ -1019,7 +1039,7 @@ await SendToClientAsync(clientId, ws, var closeReq = msg.GetPayload(); if (closeReq != null && !string.IsNullOrWhiteSpace(closeReq.SessionName)) { - Console.WriteLine($"[WsBridge] Client closing session '{closeReq.SessionName}'"); + BridgeLog($"[BRIDGE] Client closing session '{closeReq.SessionName}'"); await _copilot.CloseSessionAsync(closeReq.SessionName); } break; @@ -1028,7 +1048,7 @@ await SendToClientAsync(clientId, ws, var abortReq = msg.GetPayload(); if (abortReq != null && !string.IsNullOrWhiteSpace(abortReq.SessionName)) { - Console.WriteLine($"[WsBridge] Client aborting session '{abortReq.SessionName}'"); + BridgeLog($"[BRIDGE] Client aborting session '{abortReq.SessionName}'"); // AbortSessionAsync mutates IsProcessing/History — must run on UI thread _copilot.InvokeOnUI(() => { @@ -1041,7 +1061,7 @@ await SendToClientAsync(clientId, ws, var changeModelReq = msg.GetPayload(); if (changeModelReq != null && !string.IsNullOrWhiteSpace(changeModelReq.SessionName)) { - Console.WriteLine($"[WsBridge] Client changing model for '{changeModelReq.SessionName}' to '{changeModelReq.NewModel}'"); + BridgeLog($"[BRIDGE] Client changing model for '{changeModelReq.SessionName}' to '{changeModelReq.NewModel}'"); var modelChanged = await _copilot.ChangeModelAsync(changeModelReq.SessionName, changeModelReq.NewModel, changeModelReq.ReasoningEffort); if (!modelChanged) { @@ -1058,7 +1078,7 @@ await SendToClientAsync(clientId, ws, var renameReq = msg.GetPayload(); if (renameReq != null && !string.IsNullOrWhiteSpace(renameReq.OldName) && !string.IsNullOrWhiteSpace(renameReq.NewName)) { - Console.WriteLine($"[WsBridge] Client renaming session '{renameReq.OldName}' to '{renameReq.NewName}'"); + BridgeLog($"[BRIDGE] Client renaming session '{renameReq.OldName}' to '{renameReq.NewName}'"); var renamed = _copilot.RenameSession(renameReq.OldName, renameReq.NewName); if (!renamed) { @@ -1139,7 +1159,7 @@ await _copilot.CreateGroupFromPresetAsync(preset, } catch (Exception ex) { - Console.WriteLine($"[WsBridge] CreateGroupFromPreset failed: {ex.Message}"); + BridgeLog($"[BRIDGE] CreateGroupFromPreset failed: {ex.Message}"); try { await SendToClientAsync(presetClientId, presetWs, @@ -1277,7 +1297,7 @@ await SendToClientAsync(clientId, ws, } catch (Exception ex) { - Console.WriteLine($"[WsBridgeServer] RemoveRepo error: {ex.Message}"); + BridgeLog($"[BRIDGE] RemoveRepo error: {ex.Message}"); } } break; @@ -1333,7 +1353,7 @@ await SendToClientAsync(clientId, ws, } catch (Exception ex) { - Console.WriteLine($"[WsBridgeServer] RemoveWorktree error: {ex.Message}"); + BridgeLog($"[BRIDGE] RemoveWorktree error: {ex.Message}"); await SendToClientAsync(clientId, ws, BridgeMessage.Create(BridgeMessageTypes.WorktreeError, new RepoErrorPayload { RequestId = rmWtReq.RequestId, Error = ex.Message }), ct); @@ -1350,7 +1370,7 @@ await SendToClientAsync(clientId, ws, { try { - Console.WriteLine($"[WsBridge] Client creating session+worktree for repo '{cswtReq.RepoId}'"); + BridgeLog($"[BRIDGE] Client creating session+worktree for repo '{cswtReq.RepoId}'"); // Run on UI thread — CreateSessionWithWorktreeAsync calls // ReconcileOrganization() which mutates Organization.Sessions await _copilot.InvokeOnUIAsync(async () => @@ -1370,7 +1390,7 @@ await _copilot.CreateSessionWithWorktreeAsync( } catch (Exception ex) { - Console.WriteLine($"[WsBridge] CreateSessionWithWorktree error: {ex.Message}"); + BridgeLog($"[BRIDGE] CreateSessionWithWorktree error: {ex.Message}"); await SendToClientAsync(clientId, ws, BridgeMessage.Create(BridgeMessageTypes.ErrorEvent, new ErrorPayload { SessionName = cswtReq.SessionName ?? "", Error = $"Create session+worktree failed: {ex.Message}" }), ct); @@ -1388,7 +1408,7 @@ await SendToClientAsync(clientId, ws, } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Error handling {msg.Type}: {ex.Message}"); + BridgeLog($"[BRIDGE] Error handling {msg.Type}: {ex.Message}"); } } @@ -1448,7 +1468,7 @@ await SendToClientAsync(clientId, ws, } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Failed to send directory list: {ex.Message}"); + BridgeLog($"[BRIDGE] Failed to send directory list: {ex.Message}"); } } @@ -1517,7 +1537,7 @@ private async Task SendSessionHistoryToClient(string clientId, WebSocket ws, str try { snapshot = session.History.ToArray(); } catch (Exception ex) { - Console.WriteLine($"[WsBridge] History snapshot failed for '{sessionName}': {ex.Message}"); + BridgeLog($"[BRIDGE] History snapshot failed for '{sessionName}': {ex.Message}"); return; } @@ -1649,7 +1669,7 @@ private void DebouncedBroadcastState() public void BroadcastStateToClients() { if (_clients.IsEmpty) return; - Console.WriteLine("[WsBridge] Broadcasting state to clients after unlock/wake"); + BridgeLog("[BRIDGE] Broadcasting state to clients after unlock/wake"); BroadcastSessionsList(); BroadcastOrganizationState(); } @@ -1741,7 +1761,7 @@ private async Task BroadcastSessionHistoryAsync(string sessionName) } catch (Exception ex) { - Console.WriteLine($"[WsBridge] BroadcastSessionHistory error for '{sessionName}': {ex.Message}"); + BridgeLog($"[BRIDGE] BroadcastSessionHistory error for '{sessionName}': {ex.Message}"); } } @@ -1953,14 +1973,14 @@ private async Task HandlePairHandshakeAsync(HttpListenerContext ctx, Cancellatio var wsCtx = await ctx.AcceptWebSocketAsync(null); ws = wsCtx.WebSocket; var remoteIp = GetClientAddress(ctx.Request)?.ToString() ?? "unknown"; - Console.WriteLine($"[WsBridge] Pair handshake from {remoteIp}"); + BridgeLog($"[BRIDGE] Pair handshake from {remoteIp}"); if (_fiestaService != null) await _fiestaService.HandleIncomingPairHandshakeAsync(ws, remoteIp, ct); } catch (Exception ex) { - Console.WriteLine($"[WsBridge] Pair handshake error: {ex.Message}"); + BridgeLog($"[BRIDGE] Pair handshake error: {ex.Message}"); } finally {