diff --git a/.claude/skills/health-monitoring/SKILL.md b/.claude/skills/health-monitoring/SKILL.md
new file mode 100644
index 000000000..39a0c0f14
--- /dev/null
+++ b/.claude/skills/health-monitoring/SKILL.md
@@ -0,0 +1,395 @@
+---
+name: health-monitoring
+description: >
+ Real-time health monitoring for PolyPilot sessions. Use when the user asks to
+ "monitor sessions", "watch for issues", "check health", "make sure everything is
+ working", "keep an eye on things", or any request to continuously observe running
+ PolyPilot sessions for reliability issues. Also use when the user reports a session
+ is "stuck", "not responding", shows "Thinking..." forever, or threw a "Tool execution
+ stuck" error — diagnose the issue using the techniques below. Covers: single sessions,
+ multi-agent orchestration, sub-agent IDLE-DEFER, app restart recovery, and connection
+ health.
+---
+
+# PolyPilot Health Monitoring
+
+Continuously monitor all running PolyPilot sessions for reliability issues. Your job is
+to be a vigilant observer — detect problems early, diagnose root causes, and either fix
+them or report them clearly.
+
+## Quick Start
+
+### 1. Start the live event stream
+
+```bash
+tail -n0 -F ~/.polypilot/event-diagnostics.log
+```
+
+Run this in an async shell and read it periodically (every 2-5 minutes). This is your
+primary signal source — every important state change is logged here.
+
+### 2. Initial baseline check
+
+Before monitoring, establish what "healthy" looks like right now:
+
+```bash
+# How many sessions exist?
+python3 -c "import json; d=json.load(open('$HOME/.polypilot/active-sessions.json')); print(f'{len(d)} active sessions')"
+
+# Any recent errors?
+grep -E '\[ERROR\]|\[WATCHDOG\].*timeout|\[TOOL-HEALTH\].*recovery|\[RECONNECT\].*replacing' \
+ ~/.polypilot/event-diagnostics.log | tail -10
+
+# Any phantom sessions?
+python3 -c "
+import json
+d=json.load(open('$HOME/.polypilot/active-sessions.json'))
+phantoms=[s['DisplayName'] for s in d if '(previous)' in s.get('DisplayName','') or '(resumed)' in s.get('DisplayName','')]
+print(f'{len(phantoms)} phantom sessions' + (': '+', '.join(phantoms) if phantoms else ''))
+"
+```
+
+### 3. Periodic health checks
+
+Every 2-5 minutes, read the event stream and look for problems. Filter for the
+important events — skip the noisy `[EVT]` lines unless diagnosing a specific issue:
+
+```bash
+tail -200 ~/.polypilot/event-diagnostics.log | grep -E \
+ 'ERROR|WATCHDOG|RECONNECT|TOOL-HEALTH|COMPLETE|IDLE-DEFER|SEND|KEEPALIVE|HEALTH|DISPATCH|ABORT|INTERRUPTED'
+```
+
+## What Healthy Looks Like
+
+A healthy session lifecycle follows this pattern:
+
+```
+[SEND] → [EVT] TurnStart → [EVT] TurnEnd → [EVT] TurnStart → ... → [EVT] SessionIdle → [IDLE] → [COMPLETE]
+```
+
+Key indicators of health:
+- **SEND → COMPLETE** cycle completes (check `flushedLen` is non-zero for real responses)
+- **IDLE-DEFER** fires when sub-agents are active, then COMPLETE fires when they finish
+- **KEEPALIVE** pings appear every 15 minutes
+- **No ERROR, WATCHDOG timeout, or RECONNECT events**
+
+## What Problems Look Like
+
+### Problem: Session stuck at "Thinking..."
+
+**Symptom:** `[SEND]` logged but no `[COMPLETE]` after several minutes.
+
+**Diagnosis:**
+```bash
+# Check the last events for the stuck session
+grep "SessionName" ~/.polypilot/event-diagnostics.log | tail -10
+```
+
+**Possible causes:**
+1. **No events flowing** → Connection may be dead. Look for `[WATCHDOG]` events — the
+ watchdog should detect and recover within 2 minutes (120s inactivity timeout).
+2. **Events flowing but no IDLE** → Session is legitimately working (tool execution).
+ Check if `AssistantTurnStart/End` events are still arriving.
+3. **IDLE-DEFER active** → Sub-agents are still running. Check `backgroundTasks` count
+ in the `[IDLE-DIAG]` line. This is normal for multi-agent and sub-agent sessions.
+
+### Problem: "Tool execution stuck" error message
+
+**Symptom:** User sees "Tool execution stuck (reason). Session recovered automatically."
+
+**Diagnosis:**
+```bash
+grep "TOOL-HEALTH" ~/.polypilot/event-diagnostics.log | tail -10
+```
+
+This fires when the `ToolHealthCheck` timer detects no events for 30s after a tool
+starts, and the server is either dead or has been stale for multiple checks. The session
+is auto-recovered — check if `[COMPLETE]` follows. If the response was meaningful
+(`flushedLen > 0`), the recovery worked. If `flushedLen=0`, the tool died before
+producing output.
+
+### Problem: Client recreation / Blazor surface reset
+
+**Symptom:** All sessions briefly reset in the UI, user sees a flash.
+
+**Diagnosis:**
+```bash
+grep "RECONNECT.*replacing state\|Recreating client" ~/.polypilot/event-diagnostics.log | tail -5
+```
+
+This happens when `SendAsync` throws a connection error. The app recreates the
+`CopilotClient` and re-resumes all sessions. Check:
+- **What session triggered it** — the `[SEND]` just before the first `[RECONNECT]`
+- **Did all sessions recover?** — Look for `Failed to re-resume sibling` entries
+- **Was the server alive?** — Check if `[KEEPALIVE]` pings succeeded before/after
+
+```bash
+# Find sessions that failed to re-resume
+grep "Failed to re-resume" ~/.polypilot/event-diagnostics.log | tail -10
+```
+
+Sessions that fail to re-resume with "Session not found" had their server-side session
+expired. This is normal for very old idle sessions.
+
+### Problem: Watchdog timeout
+
+**Symptom:** `[WATCHDOG]` line with `IsProcessing=false` and a system message about
+"Session appears stuck".
+
+**Diagnosis:**
+```bash
+grep "WATCHDOG.*IsProcessing=false\|WATCHDOG.*timeout\|WATCHDOG.*stuck" \
+ ~/.polypilot/event-diagnostics.log | tail -5
+```
+
+Check which timeout tier fired:
+- **30s** (resume quiescence) — Session was resumed after restart but never received
+ events. The turn likely completed before the restart. Normal.
+- **120s** (inactivity) — No events for 2 minutes with no tool activity. Connection
+ may have dropped.
+- **600s** (tool execution) — Tool was running for 10 minutes with no events. Rare
+ but can happen for very long builds.
+
+### Problem: Phantom `(previous)` sessions
+
+**Symptom:** Session list shows duplicates with `(previous)` suffix.
+
+**Diagnosis:**
+```bash
+python3 -c "
+import json
+d=json.load(open('$HOME/.polypilot/active-sessions.json'))
+for s in d:
+ if '(previous)' in s.get('DisplayName',''):
+ print(f'{s[\"DisplayName\"]}: group={s.get(\"GroupId\",\"\")}, recovered={s.get(\"RecoveredFromSessionId\",\"\")}')"
+```
+
+These are caused by worker revival creating a new session while the old one lingers in
+`active-sessions.json`. The fix in PR #531 added `_closedSessionIds` tracking to prevent
+this. If you still see phantoms, check `RecoveredFromSessionId` — the old session should
+have been excluded from the merge.
+
+### Problem: DISPATCH-RECOVER false positive
+
+**Symptom:** `[DISPATCH-RECOVER]` in the log when workers completed normally.
+
+**Diagnosis:**
+```bash
+grep "DISPATCH-RECOVER" ~/.polypilot/event-diagnostics.log | tail -5
+```
+
+This should be rare after PR #531's two-phase mtime check. If it fires, check:
+1. Was the worker's `events.jsonl` actually written to during the grace period?
+2. Did a `session.resume` event cause a false write?
+
+## Multi-Agent Monitoring
+
+Multi-agent sessions have extra complexity. Monitor these patterns:
+
+### Healthy orchestrator flow
+```
+[DISPATCH-ROUTE] → [SEND] orchestrator → [IDLE-DEFER] (agents=N) → ... → [COMPLETE]
+```
+
+Between IDLE-DEFER and COMPLETE, workers are running. Check worker events:
+```bash
+grep "worker" ~/.polypilot/event-diagnostics.log | tail -20
+```
+
+### Worker completion timing
+Workers should complete in 2-30s for simple tasks, 1-10 minutes for complex ones.
+If a worker hasn't produced events in 5+ minutes, check its events.jsonl:
+
+```bash
+# Find worker session ID
+python3 -c "
+import json
+d=json.load(open('$HOME/.polypilot/active-sessions.json'))
+for s in d:
+ if 'worker' in s.get('DisplayName','').lower():
+ print(f'{s[\"DisplayName\"]}: {s[\"SessionId\"]}')" | head -10
+
+# Check last event time for a specific worker
+ls -la ~/.copilot/session-state/SESSION_ID/events.jsonl
+```
+
+### IDLE-DEFER-REARM
+When `session.idle` arrives with active background tasks but `IsProcessing` was already
+cleared (by watchdog or race), IDLE-DEFER-REARM re-arms processing. This is correct
+behavior — it keeps the session alive until sub-agents finish:
+
+```
+[IDLE-DEFER-REARM] 'SessionName' re-arming IsProcessing — background tasks active but processing was cleared
+```
+
+## Post-Relaunch Recovery Check
+
+After `./relaunch.sh`, do an exhaustive recovery check:
+
+```bash
+# 1. Wait for the app to restart (~15-20s)
+sleep 20
+
+# 2. Check the relaunch log
+tail -5 ~/.polypilot/relaunch.log
+
+# 3. Check for RECONNECT events (the new app reconnects all sessions)
+grep "RECONNECT" ~/.polypilot/event-diagnostics.log | tail -20
+
+# 4. Check for failed re-resumes
+grep "Failed to re-resume" ~/.polypilot/event-diagnostics.log | tail -10
+
+# 5. Verify no sessions are stuck processing
+grep "IsProcessing=True" ~/.polypilot/event-diagnostics.log | tail -5
+
+# 6. Check for phantom sessions
+python3 -c "
+import json
+d=json.load(open('$HOME/.polypilot/active-sessions.json'))
+phantoms=[s['DisplayName'] for s in d if '(previous)' in s.get('DisplayName','')]
+stuck=[s['DisplayName'] for s in d if s.get('LastPrompt')]
+print(f'Phantoms: {len(phantoms)}, Sessions with pending prompts: {len(stuck)}')
+if phantoms: print(' Phantom:', phantoms)
+if stuck: print(' Pending:', stuck)
+"
+
+# 7. If any sessions were actively processing during relaunch, verify they resumed
+grep "SEND\|COMPLETE" ~/.polypilot/event-diagnostics.log | tail -10
+```
+
+**What to look for:**
+- All sessions should show `[RECONNECT] Re-resumed sibling session` (except expired ones)
+- Sessions that were processing should either complete or be detected by the watchdog
+- No `(previous)` phantom sessions should appear
+- The `[HEALTH] Connection healthy after resume/wake` event should appear
+
+## Reporting
+
+When monitoring, provide periodic status updates:
+
+| Session | Status | Details |
+|---------|--------|---------|
+| Name | ✅/⚠️/❌ | Brief description |
+
+Use:
+- ✅ for clean SEND→COMPLETE cycles
+- ⚠️ for IDLE-DEFER (expected but worth noting), RECONNECT (recovered), long-running tools
+- ❌ for WATCHDOG timeouts, TOOL-HEALTH recovery, ERROR events, stuck sessions
+
+If you detect an issue, diagnose it immediately using the techniques above. If it's a
+code bug (not a transient connection issue), investigate the source code and propose a fix.
+
+## Mobile App Monitoring (WsBridge + DevTunnel)
+
+The mobile app connects to the desktop via WebSocket through a DevTunnel. Monitor this
+entire chain for reliability issues.
+
+### Architecture
+
+```
+Mobile App (iOS/Android)
+ → DevTunnel (wss://TUNNEL_ID.usw3.devtunnels.ms)
+ → WsBridgeServer (localhost:4322)
+ → CopilotService (desktop)
+```
+
+### Quick Health Check
+
+```bash
+# 1. Is the bridge server listening?
+lsof -i :4322
+
+# 2. Is DevTunnel running?
+ps aux | grep "devtunnel host" | grep -v grep
+
+# 3. Is the tunnel reachable?
+curl -s -o /dev/null -w "%{http_code}" https://TUNNEL_ID.usw3.devtunnels.ms/
+
+# 4. Is the devtunnel login still valid?
+devtunnel show TUNNEL_ID 2>&1 | head -5
+# If "Login token expired" → tunnel may reject new connections
+
+# 5. Check for connected mobile clients
+# WsBridgeServer logs connect/disconnect to stdout (Console.WriteLine)
+# Check app logs:
+maui devflow MAUI logs --limit 30 --agent-port 9223 2>&1 | grep -i "client\|connect\|bridge"
+
+# 6. Bridge-related errors in crash log
+grep -i "bridge\|WebSocket\|tunnel" ~/.polypilot/crash.log | tail -5
+
+# 7. Bridge-related diagnostic events
+grep -E "BRIDGE|SyncRemote|SmartURL" ~/.polypilot/event-diagnostics.log | tail -10
+```
+
+### Common Mobile Issues
+
+#### Mobile shows "Connecting..." or blank session list
+1. Check DevTunnel is running (`ps aux | grep devtunnel`)
+2. Check bridge port is listening (`lsof -i :4322`)
+3. Check tunnel is reachable (`curl` the tunnel URL — expect 404, not connection error)
+4. If `devtunnel show` says "Login token expired" — the tunnel host process may still
+ work but new management operations fail. The existing tunnel should keep forwarding.
+
+#### Mobile sends message but never gets response
+The bridge proxies prompts via `DispatchBridgePromptAsync`. Check:
+```bash
+# Stack traces from bridge prompt dispatch
+grep "DispatchBridgePromptAsync" ~/.polypilot/event-diagnostics.log | tail -5
+
+# Bridge completion events
+grep "BRIDGE-COMPLETE\|BRIDGE-SESSION" ~/.polypilot/event-diagnostics.log | tail -10
+```
+
+If `DispatchBridgePromptAsync` has stack traces, the desktop CopilotService failed to
+process the prompt. The mobile will show "Working..." indefinitely. Check if the
+triggering session had a connection error (look for RECONNECT events around the same time).
+
+#### Mobile shows session as "Working" when desktop shows idle
+The bridge syncs `IsProcessing` state. If a reconnect force-completed a session on
+desktop but the bridge state-sync hasn't fired yet, mobile shows stale state. The
+next `SyncRemoteSessions` call (triggered by `OnStateChanged`) should fix it. If not:
+```bash
+grep "SyncRemoteSessions" ~/.polypilot/event-diagnostics.log | tail -5
+```
+
+#### Mobile disconnects frequently
+Check network quality. The WebSocket connection goes through DevTunnel → Azure →
+mobile network. Each hop can drop. The `WsBridgeClient` on mobile has auto-reconnect
+logic, but gaps cause temporary UI freezes.
+
+```bash
+# Check for SmartURL network change events (WiFi↔Cellular transitions)
+grep "SmartURL" ~/.polypilot/event-diagnostics.log | tail -10
+
+# Check HEALTH events (Mac wake/sleep affects tunnel)
+grep "HEALTH" ~/.polypilot/event-diagnostics.log | tail -10
+```
+
+### Bridge-Specific Diagnostic Tags
+
+| Tag | Meaning |
+|-----|---------|
+| `[BRIDGE-COMPLETE]` | Bridge `OnTurnEnd` cleared IsProcessing for a remote session |
+| `[BRIDGE-SESSION-COMPLETE]` | Stale IsProcessing cleared during bridge sync |
+| `[SmartURL]` | Network change detected (WiFi gain/loss) — may trigger reconnect |
+| `[HEALTH]` | Connection health check after Mac wake/sleep |
+
+### DevTunnel Management
+
+```bash
+# Re-login if token expired (interactive — opens browser)
+devtunnel login
+
+# Show tunnel details
+devtunnel show TUNNEL_ID
+
+# Restart tunnel hosting (if process died)
+devtunnel host TUNNEL_ID --allow-anonymous &
+
+# Check tunnel port forwarding
+devtunnel list
+```
+
+Note: The DevTunnel host process runs independently from PolyPilot. If the Mac sleeps
+and wakes, the tunnel process usually survives but the WebSocket connections through
+it may need to re-establish.
diff --git a/PolyPilot.Tests/SessionStabilityTests.cs b/PolyPilot.Tests/SessionStabilityTests.cs
index a10b680c5..6d0efe182 100644
--- a/PolyPilot.Tests/SessionStabilityTests.cs
+++ b/PolyPilot.Tests/SessionStabilityTests.cs
@@ -79,27 +79,21 @@ public void ForceCompleteProcessing_ClearsAllInv1Fields()
var source = File.ReadAllText(TestPaths.OrganizationCs);
var method = ExtractMethod(source, "Task ForceCompleteProcessingAsync");
- // Every INV-1 field must be cleared
- var requiredClears = new[]
+ // ForceCompleteProcessingAsync must call ClearProcessingState (which atomically
+ // clears all INV-1 fields) and the other required operations
+ var requiredPatterns = new[]
{
- "ActiveToolCallCount", // INV-1 field 3
- "HasUsedToolsThisTurn", // INV-1 field 2
- "SendingFlag", // INV-1 field 7
- "IsResumed", // INV-1 field 1
- "ProcessingStartedAt", // INV-1 field 4
- "ToolCallCount", // INV-1 field 5
- "ProcessingPhase", // INV-1 field 6
- "ClearPermissionDenials", // INV-1 field 8
- "FlushCurrentResponse", // INV-1 field 9
- "IsProcessing", // The flag itself
- "OnSessionComplete", // INV-1 field 10
- "TrySetResult", // Resolves the worker TCS
+ "ClearProcessingState(state", // Atomic INV-1 field clearing
+ "FlushCurrentResponse", // INV-1 field 9
+ "OnSessionComplete", // INV-1 field 10
+ "TrySetResult", // Resolves the worker TCS
+ "AllowTurnStartRearm = false", // Explicit recovery terminal
};
- foreach (var field in requiredClears)
+ foreach (var field in requiredPatterns)
{
Assert.True(method.Contains(field, StringComparison.Ordinal),
- $"ForceCompleteProcessingAsync must clear '{field}' (INV-1 compliance)");
+ $"ForceCompleteProcessingAsync must contain '{field}'");
}
}
@@ -141,12 +135,15 @@ public void ForceCompleteProcessing_BoundsAbortAsyncTimeout()
}
[Fact]
- public void ForceCompleteProcessing_PreservesCarryOverDeferredIdleFingerprint()
+ public void ForceCompleteProcessing_UsesClearProcessingState()
{
+ // ForceCompleteProcessingAsync must delegate to ClearProcessingState rather than
+ // manually clearing fields. ClearProcessingState calls ClearDeferredIdleTracking
+ // internally (without preserveCarryOver, which is only needed in SendPromptAsync).
var source = File.ReadAllText(TestPaths.OrganizationCs);
var method = ExtractMethod(source, "Task ForceCompleteProcessingAsync");
- Assert.Contains("ClearDeferredIdleTracking(state, preserveCarryOver: true)", method);
+ Assert.Contains("ClearProcessingState(state", method);
}
[Fact]
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index b8c790b49..37d39ba15 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -2216,24 +2216,13 @@ private async Task ForceCompleteProcessingAsync(string sessionName, SessionState
{
if (!state.Info.IsProcessing) { tcs.TrySetResult(true); return; }
- // Full cleanup mirroring CompleteResponse / unstartedWorkers recovery
+ // Full cleanup via ClearProcessingState
FlushCurrentResponse(state);
- Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
- Interlocked.Exchange(ref state.SendingFlag, 0);
- Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
Interlocked.Exchange(ref state.WatchdogCaseBResets, 0);
Interlocked.Exchange(ref state.WatchdogCaseBLastFileSize, 0);
Interlocked.Exchange(ref state.WatchdogCaseBStaleCount, 0);
- state.HasUsedToolsThisTurn = false;
- ClearDeferredIdleTracking(state, preserveCarryOver: true);
- state.AllowTurnStartRearm = false; // Force-complete is explicit recovery, not a speculative idle completion
state.FallbackCanceledByTurnStart = false;
- state.Info.IsResumed = false;
- state.Info.ProcessingStartedAt = null;
- state.Info.ToolCallCount = 0;
- state.Info.ProcessingPhase = 0;
- state.Info.ClearPermissionDenials();
var response = state.CurrentResponse.ToString();
var fullResponse = state.FlushedResponse.Length > 0
@@ -2242,11 +2231,11 @@ private async Task ForceCompleteProcessingAsync(string sessionName, SessionState
: state.FlushedResponse + "\n\n" + response)
: response;
- state.CurrentResponse.Clear();
- state.FlushedResponse.Clear();
- ClearFlushedReplayDedup(state);
- state.PendingReasoningMessages.Clear();
- state.Info.IsProcessing = false;
+ // Accumulate API time but don't count as premium request (forced recovery)
+ if (state.Info.ProcessingStartedAt is { } forceStarted)
+ state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - forceStarted).TotalSeconds;
+ ClearProcessingState(state, accumulateApiTime: false);
+ state.AllowTurnStartRearm = false; // Force-complete is explicit recovery, not a speculative idle completion
state.ResponseCompletion?.TrySetResult(fullResponse);
var summary = fullResponse.Length > 0 ? (fullResponse.Length > 100 ? fullResponse[..100] + "..." : fullResponse) : "";
diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs
index 3d1de493c..7a98814f3 100644
--- a/PolyPilot/Services/CopilotService.cs
+++ b/PolyPilot/Services/CopilotService.cs
@@ -919,6 +919,28 @@ internal static bool ShouldPersistDiagnostic(string message)
message.Contains("watchdog") || message.Contains("Failed to");
}
+ ///
+ /// Static logging entry point for WsBridgeServer diagnostics.
+ /// Writes directly to the event-diagnostics.log file without requiring a CopilotService instance.
+ ///
+ internal static void LogBridgeDiagnostic(string message)
+ {
+ Console.WriteLine($"[DEBUG] {message}");
+ if (ShouldPersistDiagnostic(message))
+ {
+ try
+ {
+ lock (_diagnosticLogLock)
+ {
+ var logPath = Path.Combine(PolyPilotBaseDir, "event-diagnostics.log");
+ File.AppendAllText(logPath,
+ $"{DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} {message}{Environment.NewLine}");
+ }
+ }
+ catch { }
+ }
+ }
+
private void Debug(string message)
{
LastDebugMessage = message;
@@ -3789,6 +3811,10 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
{
Debug($"[RECONNECT] Session ID changed on resume: '{state.Info.SessionId}' → '{actualId}' for '{sessionName}'");
CopyEventsToNewSession(state.Info.SessionId, actualId);
+ // Mark the old session ID as closed so the merge in SaveActiveSessionsToDisk
+ // doesn't re-add it — otherwise a stale entry with the old ID lingers and
+ // gets renamed to "(previous)" on the next save cycle.
+ _closedSessionIds[state.Info.SessionId] = 0;
state.Info.SessionId = actualId;
// Persist the new session ID so restarts don't revert to the old one
FlushSaveActiveSessionsToDisk();
diff --git a/PolyPilot/Services/WsBridgeServer.cs b/PolyPilot/Services/WsBridgeServer.cs
index b4df8e13c..01abbf302 100644
--- a/PolyPilot/Services/WsBridgeServer.cs
+++ b/PolyPilot/Services/WsBridgeServer.cs
@@ -43,6 +43,26 @@ private record PendingBridgePrompt(string SessionName, string Message, string? A
public bool IsRunning => _listener?.IsListening == true && _proxyListener != null;
public bool SupportsRemoteConnections { get; private set; }
+ ///
+ /// Logs a bridge diagnostic message to the event-diagnostics log via CopilotService.
+ /// Falls back to Console.WriteLine if CopilotService is not yet attached.
+ /// All messages are prefixed with [BRIDGE] for filtering.
+ ///
+ private void BridgeLog(string message)
+ {
+ var tagged = message.StartsWith("[") ? message : $"[BRIDGE] {message}";
+ if (_copilot != null)
+ {
+ // Use reflection-free approach: CopilotService.LogExternal is a public static method
+ // that writes to the event-diagnostics log without requiring instance access.
+ CopilotService.LogBridgeDiagnostic(tagged);
+ }
+ else
+ {
+ Console.WriteLine(tagged);
+ }
+ }
+
///
/// Access token that clients must provide via X-Tunnel-Authorization header or query param.
///
@@ -69,12 +89,12 @@ public void Start(int bridgePort, int targetPort)
if (TryBindBridgePipeline(bridgePort))
{
- Console.WriteLine($"[WsBridge] Listening on port {bridgePort} (state-sync mode, network via loopback proxy localhost:{_internalListenerPort})");
+ BridgeLog($"[BRIDGE] Listening on port {bridgePort} (state-sync mode, network via loopback proxy localhost:{_internalListenerPort})");
OnStateChanged?.Invoke();
}
else
{
- Console.WriteLine($"[WsBridge] Port {bridgePort} unavailable — will retry in accept loops");
+ BridgeLog($"[BRIDGE] Port {bridgePort} unavailable — will retry in accept loops");
}
_acceptTask = AcceptLoopAsync(_cts.Token);
@@ -113,12 +133,12 @@ private bool TryBindInternalListener()
listener.Start();
_listener = listener;
_internalListenerPort = port;
- Console.WriteLine($"[WsBridge] Internal listener on localhost:{port}");
+ BridgeLog($"[BRIDGE] Internal listener on localhost:{port}");
return true;
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Internal bind on localhost:{port} failed: {ex.Message}");
+ BridgeLog($"[BRIDGE] Internal bind on localhost:{port} failed: {ex.Message}");
}
}
@@ -133,12 +153,12 @@ private bool TryBindProxyListener(int port)
proxy.Server.DualMode = true;
proxy.Start();
_proxyListener = proxy;
- Console.WriteLine($"[WsBridge] Public proxy listening on port {port} (dual-stack)");
+ BridgeLog($"[BRIDGE] Public proxy listening on port {port} (dual-stack)");
return true;
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Public dual-stack bind on port {port} failed: {ex.Message}");
+ BridgeLog($"[BRIDGE] Public dual-stack bind on port {port} failed: {ex.Message}");
}
try
@@ -146,12 +166,12 @@ private bool TryBindProxyListener(int port)
var proxy = new TcpListener(IPAddress.Any, port);
proxy.Start();
_proxyListener = proxy;
- Console.WriteLine($"[WsBridge] Public proxy listening on port {port} (IPv4)");
+ BridgeLog($"[BRIDGE] Public proxy listening on port {port} (IPv4)");
return true;
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Public bind on port {port} failed: {ex.Message}");
+ BridgeLog($"[BRIDGE] Public bind on port {port} failed: {ex.Message}");
return false;
}
}
@@ -296,14 +316,14 @@ public async Task DrainPendingPromptsAsync()
{
while (_pendingBridgePrompts.TryDequeue(out var pending))
{
- Console.WriteLine($"[BRIDGE] Replaying queued prompt for '{pending.SessionName}'");
+ BridgeLog($"[BRIDGE] Replaying queued prompt for '{pending.SessionName}'");
try
{
await DispatchBridgePromptAsync(pending.SessionName, pending.Message, pending.AgentMode);
}
catch (Exception ex)
{
- Console.WriteLine($"[BRIDGE] Failed to replay prompt for '{pending.SessionName}': {ex.Message}");
+ BridgeLog($"[BRIDGE] Failed to replay prompt for '{pending.SessionName}': {ex.Message}");
}
}
}
@@ -329,7 +349,7 @@ private async Task DispatchBridgePromptAsync(string sessionName, string message,
var orchGroup = _copilot.Organization.Groups.FirstOrDefault(g => g.Id == orchGroupId);
if (orchGroup?.OrchestratorMode == MultiAgentMode.OrchestratorReflect)
_copilot.StartGroupReflection(orchGroupId, message, orchGroup.MaxReflectIterations ?? 5);
- Console.WriteLine($"[WsBridge] Routing '{sessionName}' through orchestration pipeline (group={orchGroupId})");
+ BridgeLog($"[BRIDGE] Routing '{sessionName}' through orchestration pipeline (group={orchGroupId})");
await _copilot.SendToMultiAgentGroupAsync(orchGroupId, message, ct);
}
else
@@ -351,13 +371,13 @@ private async Task DispatchBridgePromptAsync(string sessionName, string message,
{
// Orchestrated sessions route through SendToMultiAgentGroupAsync which has
// its own busy handling; blindly queuing would bypass the orchestration pipeline.
- Console.WriteLine($"[WsBridge] Orchestrator '{sessionName}' busy, dropping mobile message (retry manually)");
+ BridgeLog($"[BRIDGE] Orchestrator '{sessionName}' busy, dropping mobile message (retry manually)");
Broadcast(BridgeMessage.Create(BridgeMessageTypes.ErrorEvent,
new ErrorPayload { SessionName = sessionName, Error = "Session is busy processing a request. Please retry when the current turn completes." }));
}
else
{
- Console.WriteLine($"[WsBridge] '{sessionName}' busy, queuing mobile message for next turn");
+ BridgeLog($"[BRIDGE] '{sessionName}' busy, queuing mobile message for next turn");
_copilot.EnqueueMessage(sessionName, message, agentMode: agentMode);
}
return Task.CompletedTask;
@@ -378,7 +398,7 @@ public void Stop()
foreach (var kvp in _clientSendLocks) kvp.Value.Dispose();
_clientSendLocks.Clear();
StopListenersOnly();
- Console.WriteLine("[WsBridge] Stopped");
+ BridgeLog("[BRIDGE] Stopped");
OnStateChanged?.Invoke();
}
@@ -425,7 +445,7 @@ private async Task AcceptLoopAsync(CancellationToken ct)
{
context.Response.StatusCode = 429;
context.Response.Close();
- Console.WriteLine("[WsBridge] Pair request rate-limited");
+ BridgeLog("[BRIDGE] Pair request rate-limited");
continue;
}
_ = Task.Run(() => HandlePairHandshakeAsync(context, ct), ct);
@@ -436,7 +456,7 @@ private async Task AcceptLoopAsync(CancellationToken ct)
{
context.Response.StatusCode = 401;
context.Response.Close();
- Console.WriteLine("[WsBridge] Rejected unauthenticated WebSocket connection");
+ BridgeLog("[BRIDGE] Rejected unauthenticated WebSocket connection");
continue;
}
_ = Task.Run(() => HandleClientAsync(context, ct), ct);
@@ -485,13 +505,13 @@ private async Task AcceptLoopAsync(CancellationToken ct)
catch (HttpListenerException ex)
{
if (ct.IsCancellationRequested) break;
- Console.WriteLine($"[WsBridge] Listener error ({ex.ErrorCode}): {ex.Message} — will restart");
+ BridgeLog($"[BRIDGE] Listener error ({ex.ErrorCode}): {ex.Message} — will restart");
await _restartLock.WaitAsync(CancellationToken.None).ConfigureAwait(false);
try { StopListenersOnly(); } finally { _restartLock.Release(); }
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Accept error: {ex.Message}");
+ BridgeLog($"[BRIDGE] Accept error: {ex.Message}");
}
}
}
@@ -539,13 +559,13 @@ private async Task ProxyAcceptLoopAsync(CancellationToken ct)
catch (SocketException ex)
{
if (ct.IsCancellationRequested) break;
- Console.WriteLine($"[WsBridge] Proxy accept error: {ex.Message} — will restart");
+ BridgeLog($"[BRIDGE] Proxy accept error: {ex.Message} — will restart");
await _restartLock.WaitAsync(CancellationToken.None).ConfigureAwait(false);
try { StopListenersOnly(); } finally { _restartLock.Release(); }
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Proxy error: {ex.Message}");
+ BridgeLog($"[BRIDGE] Proxy error: {ex.Message}");
}
}
}
@@ -568,7 +588,7 @@ private async Task TryRestartListenerAsync(CancellationToken ct)
if (TryBindBridgePipeline(_bridgePort))
{
- Console.WriteLine($"[WsBridge] Restarted listening on port {_bridgePort}");
+ BridgeLog($"[BRIDGE] Restarted listening on port {_bridgePort}");
OnStateChanged?.Invoke();
return true;
}
@@ -605,7 +625,7 @@ private async Task ProxyClientAsync(TcpClient client, CancellationToken ct)
var request = await ReadProxyRequestAsync(downstreamStream, ct).ConfigureAwait(false);
if (request == null)
{
- Console.WriteLine("[WsBridge] Proxy request dropped (incomplete headers or exceeded 64KB limit)");
+ BridgeLog("[BRIDGE] Proxy request dropped (incomplete headers or exceeded 64KB limit)");
return;
}
@@ -627,7 +647,7 @@ private async Task ProxyClientAsync(TcpClient client, CancellationToken ct)
catch (OperationCanceledException) { }
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Proxy client error: {ex.Message}");
+ BridgeLog($"[BRIDGE] Proxy client error: {ex.Message}");
}
}
@@ -799,7 +819,7 @@ private async Task HandleClientAsync(HttpListenerContext httpContext, Cancellati
ws = wsContext.WebSocket;
_clients[clientId] = ws;
_clientSendLocks[clientId] = new SemaphoreSlim(1, 1);
- Console.WriteLine($"[WsBridge] Client {clientId} connected ({_clients.Count} total)");
+ BridgeLog($"[BRIDGE] Client {clientId} connected ({_clients.Count} total)");
// Send initial state — if the server is still restoring sessions, wait so the
// client doesn't see sessions with MessageCount=0 (History hasn't loaded from
@@ -808,14 +828,14 @@ private async Task HandleClientAsync(HttpListenerContext httpContext, Cancellati
{
if (_copilot.IsRestoring)
{
- Console.WriteLine($"[WsBridge] Client {clientId} connected while server is restoring — waiting for restore to complete");
+ BridgeLog($"[BRIDGE] Client {clientId} connected while server is restoring — waiting for restore to complete");
var restoreDeadline = DateTime.UtcNow.AddSeconds(30);
while (_copilot.IsRestoring && DateTime.UtcNow < restoreDeadline && !ct.IsCancellationRequested)
await Task.Delay(200, ct);
if (!_copilot.IsRestoring)
- Console.WriteLine($"[WsBridge] Restore complete — sending session list to client {clientId}");
+ BridgeLog($"[BRIDGE] Restore complete — sending session list to client {clientId}");
else
- Console.WriteLine($"[WsBridge] Restore still running after 30s — sending partial session list to client {clientId}");
+ BridgeLog($"[BRIDGE] Restore still running after 30s — sending partial session list to client {clientId}");
}
await SendToClientAsync(clientId, ws,
BridgeMessage.Create(BridgeMessageTypes.SessionsList, BuildSessionsListPayload()), ct);
@@ -869,7 +889,7 @@ await SendToClientAsync(clientId, ws,
catch (OperationCanceledException) { }
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Client {clientId} error: {ex.Message}");
+ BridgeLog($"[BRIDGE] Client {clientId} error: {ex.Message}");
}
finally
{
@@ -881,7 +901,7 @@ await SendToClientAsync(clientId, ws,
catch { }
}
ws?.Dispose();
- Console.WriteLine($"[WsBridge] Client {clientId} disconnected ({_clients.Count} remaining)");
+ BridgeLog($"[BRIDGE] Client {clientId} disconnected ({_clients.Count} remaining)");
}
}
@@ -909,7 +929,7 @@ await SendToClientAsync(clientId, ws,
var sendReq = msg.GetPayload();
if (sendReq != null && !string.IsNullOrWhiteSpace(sendReq.SessionName) && !string.IsNullOrWhiteSpace(sendReq.Message))
{
- Console.WriteLine($"[WsBridge] Client sending message to '{sendReq.SessionName}'");
+ BridgeLog($"[BRIDGE] Client sending message to '{sendReq.SessionName}'");
// Fire-and-forget: don't block the client message loop waiting for the full response.
// SendPromptAsync awaits ResponseCompletion (minutes). Responses stream back via events.
// Blocking here prevents the client from sending abort, switch, or other commands.
@@ -921,7 +941,7 @@ await SendToClientAsync(clientId, ws,
if (_copilot.IsRestoring)
{
_pendingBridgePrompts.Enqueue(new PendingBridgePrompt(sendSession, sendMessage, sendAgentMode));
- Console.WriteLine($"[BRIDGE] Queued prompt for '{sendSession}' during restore ({_pendingBridgePrompts.Count} pending)");
+ BridgeLog($"[BRIDGE] Queued prompt for '{sendSession}' during restore ({_pendingBridgePrompts.Count} pending)");
break;
}
@@ -929,7 +949,7 @@ await SendToClientAsync(clientId, ws,
_ = Task.Run(async () =>
{
try { await DispatchBridgePromptAsync(sendSession, sendMessage, sendAgentMode, ct); }
- catch (Exception ex) { Console.WriteLine($"[WsBridge] SendPromptAsync error for '{sendSession}': {ex.Message}"); }
+ catch (Exception ex) { BridgeLog($"[BRIDGE] SendPromptAsync error for '{sendSession}': {ex.Message}"); }
});
}
break;
@@ -949,14 +969,14 @@ await SendToClientAsync(clientId, ws,
createReq.WorkingDirectory.Contains("..") ||
!Directory.Exists(createReq.WorkingDirectory))
{
- Console.WriteLine($"[WsBridge] Rejected invalid WorkingDirectory: {createReq.WorkingDirectory}");
+ BridgeLog($"[BRIDGE] Rejected invalid WorkingDirectory: {createReq.WorkingDirectory}");
await SendToClientAsync(clientId, ws,
BridgeMessage.Create(BridgeMessageTypes.ErrorEvent,
new ErrorPayload { SessionName = createReq.Name, Error = $"Working directory not found on server: {createReq.WorkingDirectory}" }), ct);
break;
}
}
- Console.WriteLine($"[WsBridge] Client creating session '{createReq.Name}'");
+ BridgeLog($"[BRIDGE] Client creating session '{createReq.Name}'");
await _copilot.CreateSessionAsync(createReq.Name, createReq.Model, createReq.WorkingDirectory, ct);
BroadcastSessionsList();
BroadcastOrganizationState();
@@ -990,24 +1010,24 @@ await SendToClientAsync(clientId, ws,
// Validate session ID is a valid GUID to prevent path traversal
if (!Guid.TryParse(resumeReq.SessionId, out _))
{
- Console.WriteLine($"[WsBridge] Rejected invalid session ID format: {resumeReq.SessionId}");
+ BridgeLog($"[BRIDGE] Rejected invalid session ID format: {resumeReq.SessionId}");
await SendToClientAsync(clientId, ws,
BridgeMessage.Create(BridgeMessageTypes.ErrorEvent,
new ErrorPayload { SessionName = resumeReq.DisplayName ?? "Unknown", Error = "Invalid session ID format" }), ct);
break;
}
- Console.WriteLine($"[WsBridge] Client resuming session '{resumeReq.SessionId}'");
+ BridgeLog($"[BRIDGE] Client resuming session '{resumeReq.SessionId}'");
var displayName = resumeReq.DisplayName ?? "Resumed";
try
{
await _copilot.ResumeSessionAsync(resumeReq.SessionId, displayName, workingDirectory: null, model: null, cancellationToken: ct);
- Console.WriteLine($"[WsBridge] Session resumed successfully, broadcasting updated list");
+ BridgeLog($"[BRIDGE] Session resumed successfully, broadcasting updated list");
BroadcastSessionsList();
BroadcastOrganizationState();
}
catch (Exception resumeEx)
{
- Console.WriteLine($"[WsBridge] Resume failed: {resumeEx.Message}");
+ BridgeLog($"[BRIDGE] Resume failed: {resumeEx.Message}");
await SendToClientAsync(clientId, ws,
BridgeMessage.Create(BridgeMessageTypes.ErrorEvent,
new ErrorPayload { SessionName = displayName, Error = $"Resume failed: {resumeEx.Message}" }), ct);
@@ -1019,7 +1039,7 @@ await SendToClientAsync(clientId, ws,
var closeReq = msg.GetPayload();
if (closeReq != null && !string.IsNullOrWhiteSpace(closeReq.SessionName))
{
- Console.WriteLine($"[WsBridge] Client closing session '{closeReq.SessionName}'");
+ BridgeLog($"[BRIDGE] Client closing session '{closeReq.SessionName}'");
await _copilot.CloseSessionAsync(closeReq.SessionName);
}
break;
@@ -1028,7 +1048,7 @@ await SendToClientAsync(clientId, ws,
var abortReq = msg.GetPayload();
if (abortReq != null && !string.IsNullOrWhiteSpace(abortReq.SessionName))
{
- Console.WriteLine($"[WsBridge] Client aborting session '{abortReq.SessionName}'");
+ BridgeLog($"[BRIDGE] Client aborting session '{abortReq.SessionName}'");
// AbortSessionAsync mutates IsProcessing/History — must run on UI thread
_copilot.InvokeOnUI(() =>
{
@@ -1041,7 +1061,7 @@ await SendToClientAsync(clientId, ws,
var changeModelReq = msg.GetPayload();
if (changeModelReq != null && !string.IsNullOrWhiteSpace(changeModelReq.SessionName))
{
- Console.WriteLine($"[WsBridge] Client changing model for '{changeModelReq.SessionName}' to '{changeModelReq.NewModel}'");
+ BridgeLog($"[BRIDGE] Client changing model for '{changeModelReq.SessionName}' to '{changeModelReq.NewModel}'");
var modelChanged = await _copilot.ChangeModelAsync(changeModelReq.SessionName, changeModelReq.NewModel, changeModelReq.ReasoningEffort);
if (!modelChanged)
{
@@ -1058,7 +1078,7 @@ await SendToClientAsync(clientId, ws,
var renameReq = msg.GetPayload();
if (renameReq != null && !string.IsNullOrWhiteSpace(renameReq.OldName) && !string.IsNullOrWhiteSpace(renameReq.NewName))
{
- Console.WriteLine($"[WsBridge] Client renaming session '{renameReq.OldName}' to '{renameReq.NewName}'");
+ BridgeLog($"[BRIDGE] Client renaming session '{renameReq.OldName}' to '{renameReq.NewName}'");
var renamed = _copilot.RenameSession(renameReq.OldName, renameReq.NewName);
if (!renamed)
{
@@ -1139,7 +1159,7 @@ await _copilot.CreateGroupFromPresetAsync(preset,
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] CreateGroupFromPreset failed: {ex.Message}");
+ BridgeLog($"[BRIDGE] CreateGroupFromPreset failed: {ex.Message}");
try
{
await SendToClientAsync(presetClientId, presetWs,
@@ -1277,7 +1297,7 @@ await SendToClientAsync(clientId, ws,
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridgeServer] RemoveRepo error: {ex.Message}");
+ BridgeLog($"[BRIDGE] RemoveRepo error: {ex.Message}");
}
}
break;
@@ -1333,7 +1353,7 @@ await SendToClientAsync(clientId, ws,
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridgeServer] RemoveWorktree error: {ex.Message}");
+ BridgeLog($"[BRIDGE] RemoveWorktree error: {ex.Message}");
await SendToClientAsync(clientId, ws,
BridgeMessage.Create(BridgeMessageTypes.WorktreeError,
new RepoErrorPayload { RequestId = rmWtReq.RequestId, Error = ex.Message }), ct);
@@ -1350,7 +1370,7 @@ await SendToClientAsync(clientId, ws,
{
try
{
- Console.WriteLine($"[WsBridge] Client creating session+worktree for repo '{cswtReq.RepoId}'");
+ BridgeLog($"[BRIDGE] Client creating session+worktree for repo '{cswtReq.RepoId}'");
// Run on UI thread — CreateSessionWithWorktreeAsync calls
// ReconcileOrganization() which mutates Organization.Sessions
await _copilot.InvokeOnUIAsync(async () =>
@@ -1370,7 +1390,7 @@ await _copilot.CreateSessionWithWorktreeAsync(
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] CreateSessionWithWorktree error: {ex.Message}");
+ BridgeLog($"[BRIDGE] CreateSessionWithWorktree error: {ex.Message}");
await SendToClientAsync(clientId, ws,
BridgeMessage.Create(BridgeMessageTypes.ErrorEvent,
new ErrorPayload { SessionName = cswtReq.SessionName ?? "", Error = $"Create session+worktree failed: {ex.Message}" }), ct);
@@ -1388,7 +1408,7 @@ await SendToClientAsync(clientId, ws,
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Error handling {msg.Type}: {ex.Message}");
+ BridgeLog($"[BRIDGE] Error handling {msg.Type}: {ex.Message}");
}
}
@@ -1448,7 +1468,7 @@ await SendToClientAsync(clientId, ws,
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Failed to send directory list: {ex.Message}");
+ BridgeLog($"[BRIDGE] Failed to send directory list: {ex.Message}");
}
}
@@ -1517,7 +1537,7 @@ private async Task SendSessionHistoryToClient(string clientId, WebSocket ws, str
try { snapshot = session.History.ToArray(); }
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] History snapshot failed for '{sessionName}': {ex.Message}");
+ BridgeLog($"[BRIDGE] History snapshot failed for '{sessionName}': {ex.Message}");
return;
}
@@ -1649,7 +1669,7 @@ private void DebouncedBroadcastState()
public void BroadcastStateToClients()
{
if (_clients.IsEmpty) return;
- Console.WriteLine("[WsBridge] Broadcasting state to clients after unlock/wake");
+ BridgeLog("[BRIDGE] Broadcasting state to clients after unlock/wake");
BroadcastSessionsList();
BroadcastOrganizationState();
}
@@ -1741,7 +1761,7 @@ private async Task BroadcastSessionHistoryAsync(string sessionName)
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] BroadcastSessionHistory error for '{sessionName}': {ex.Message}");
+ BridgeLog($"[BRIDGE] BroadcastSessionHistory error for '{sessionName}': {ex.Message}");
}
}
@@ -1953,14 +1973,14 @@ private async Task HandlePairHandshakeAsync(HttpListenerContext ctx, Cancellatio
var wsCtx = await ctx.AcceptWebSocketAsync(null);
ws = wsCtx.WebSocket;
var remoteIp = GetClientAddress(ctx.Request)?.ToString() ?? "unknown";
- Console.WriteLine($"[WsBridge] Pair handshake from {remoteIp}");
+ BridgeLog($"[BRIDGE] Pair handshake from {remoteIp}");
if (_fiestaService != null)
await _fiestaService.HandleIncomingPairHandshakeAsync(ws, remoteIp, ct);
}
catch (Exception ex)
{
- Console.WriteLine($"[WsBridge] Pair handshake error: {ex.Message}");
+ BridgeLog($"[BRIDGE] Pair handshake error: {ex.Message}");
}
finally
{