{
"assertAllSessionsReceived", "assertAllSessionsResponded", "assertAllWorkers",
- "assertDirectoryExists", "assertEqual", "assertEvaluatorWasUsed", "assertFileContains",
- "assertFileExists", "assertGroupMembership", "assertNoDirectoryContains",
+ "assertDirectoryExists", "assertEqual", "assertEvaluatorWasUsed", "assertEventLog",
+ "assertFileContains", "assertFileExists", "assertGroupExists", "assertGroupMembership",
+ "assertInputAvailable", "assertNoDirectoryContains", "assertNoNewEvents",
"assertNoOverlap", "assertNoPresetInSection", "assertNoReflectionLoop",
"assertNoSessionsInDefault", "assertNoSessionsWithGroupId", "assertOrchestratorReceivedRoutingContext",
"assertOrchestratorReceivedWorkerDescriptions", "assertOrchestratorSynthesized",
- "assertOrgJson", "assertPresetInSection", "assertPresetVisible", "assertReflectionPaused",
- "assertReflectionState", "assertSessionMeta", "assertWorkerPromptContains",
- "captureGroupState", "click", "createGroup", "createGroupFromPreset", "createSquadDir",
- "deleteGroup", "evaluate", "navigate", "note", "pauseReflection", "readOrgJson",
- "restartApp", "resumeReflection", "saveGroupAsPreset", "selectWorktree", "sendPrompt",
- "setEvaluator", "setMode", "shell", "type", "wait", "waitForAgent",
- "waitForAllResponses", "waitForAllSessions", "waitForCompletion", "waitForPhase",
+ "assertOrgJson", "assertPresetInSection", "assertPresetVisible", "assertProcessing",
+ "assertReflectionPaused", "assertReflectionState", "assertResponseNotEmpty",
+ "assertSessionExists", "assertSessionMeta", "assertSessionNotExists",
+ "assertWorkerPromptContains",
+ "captureEventLogPosition", "captureGroupState", "captureSessionList",
+ "click", "clickAbort", "clickCreate", "closeSession",
+ "createGroup", "createGroupFromPreset", "createSession", "createSquadDir",
+ "deleteGroup", "evaluate", "navigate", "note",
+ "openCreateMenu", "pauseReflection", "readOrgJson", "relaunchApp",
+ "restartApp", "resumeReflection", "saveGroupAsPreset",
+ "selectOption", "selectPreset", "selectRepo", "selectSession", "selectWorktree",
+ "sendMessage", "sendPrompt", "sendToGroup",
+ "setEvaluator", "setMode", "shell", "shellCheck", "switchModel",
+ "type", "wait", "waitForAgent", "waitForAllResponses", "waitForAllSessions",
+ "waitForCompletion", "waitForEventPattern", "waitForIdle", "waitForPhase",
"screenshot"
};
foreach (var file in Directory.GetFiles(ScenariosDir, "*.json"))
diff --git a/PolyPilot.Tests/Scenarios/multi-agent-reliability-scenarios.json b/PolyPilot.Tests/Scenarios/multi-agent-reliability-scenarios.json
new file mode 100644
index 000000000..4183b793f
--- /dev/null
+++ b/PolyPilot.Tests/Scenarios/multi-agent-reliability-scenarios.json
@@ -0,0 +1,530 @@
+{
+ "description": "Multi-agent reliability scenarios for PolyPilot. Tests cover every orchestration lifecycle path: group creation, dispatch, worker execution, synthesis, subsequent orchestrations, follow-up handling, restart recovery, and state hygiene. Complements multi-agent-scenarios.json which covers organization/presets/squad. Execute against a running PolyPilot instance using MauiDevFlow CDP commands.",
+ "prerequisites": {
+ "build": "cd PolyPilot && ./relaunch.sh",
+ "waitForAgent": "maui devflow MAUI status --agent-port 9223",
+ "initialMode": "Persistent",
+ "notes": "All test sessions MUST point to the PolyPilot repository. Use 'Implement & Challenge' preset (ignore PR Reviewer). Never commit, push, or create GitHub issues/PRs. DevFlow agent port is 9223. CDP Blazor clicks require PointerEvent dispatch."
+ },
+ "scenarios": [
+ {
+ "id": "create-implement-challenge-group",
+ "name": "Create Implement & Challenge multi-agent group",
+ "description": "Create a multi-agent group using the Implement & Challenge preset. Verify orchestrator and workers are created with correct roles.",
+ "category": "group-creation",
+ "invariants": [
+ "Group appears in sidebar",
+ "Orchestrator session exists with Role=Orchestrator",
+ "Worker sessions exist (at least 2)",
+ "All sessions point to PolyPilot repo"
+ ],
+ "steps": [
+ { "action": "openCreateMenu" },
+ { "action": "selectOption", "text": "Multi-Agent Team" },
+ { "action": "selectPreset", "name": "Implement & Challenge" },
+ { "action": "selectRepo", "repo": "PureWeen-PolyPilot" },
+ { "action": "clickCreate" },
+ { "action": "wait", "ms": 5000 },
+ { "action": "assertGroupExists", "nameContains": "Implement" },
+ { "action": "assertSessionExists", "nameContains": "orchestrator" },
+ { "action": "assertSessionExists", "nameContains": "worker" }
+ ]
+ },
+ {
+ "id": "group-shows-all-sessions-in-sidebar",
+ "name": "Multi-agent group shows all sessions in sidebar",
+ "description": "Verify all group sessions (orchestrator + workers) are visible in the sidebar under the group.",
+ "category": "group-creation",
+ "invariants": [
+ "All sessions visible under group header",
+ "Session count matches expected (orchestrator + N workers)"
+ ],
+ "steps": [
+ { "action": "evaluate", "script": "document.querySelectorAll('[class*=session-list-item]').length", "expect": { "greaterThan": 2 }, "note": "At least orchestrator + 2 workers visible" }
+ ]
+ },
+ {
+ "id": "workers-have-assigned-models",
+ "name": "Workers have model assignments from preset",
+ "description": "Verify each worker session has its configured model from the preset.",
+ "category": "group-creation",
+ "invariants": [
+ "Each worker has a non-empty model assignment",
+ "Model names match preset configuration"
+ ],
+ "steps": [
+ { "action": "note", "text": "Verify by expanding each worker card and checking model dropdown, or via event log on first dispatch" }
+ ]
+ },
+ {
+ "id": "orchestrator-role-correct",
+ "name": "Orchestrator session has correct metadata",
+ "description": "Verify the orchestrator session is marked with the correct role and group membership.",
+ "category": "group-creation",
+ "invariants": [
+ "Orchestrator session exists",
+ "Session is part of the multi-agent group",
+ "IsMultiAgentSession = true"
+ ],
+ "steps": [
+ { "action": "assertSessionExists", "nameContains": "orchestrator" },
+ { "action": "assertEventLog", "pattern": "DISPATCH-ROUTE.*orchestrator.*isOrch=True", "expected": true, "note": "Will be verified when first message is sent" }
+ ]
+ },
+ {
+ "id": "dispatch-sends-to-all-workers",
+ "name": "Orchestration dispatches to all workers",
+ "description": "Send a prompt to the group. Verify the orchestrator dispatches tasks to all workers via event log.",
+ "category": "dispatch",
+ "invariants": [
+ "Event log shows [DISPATCH-ROUTE] with correct mode",
+ "Event log shows [SEND] for each worker",
+ "Event log shows [DISPATCH] with worker names"
+ ],
+ "steps": [
+ { "action": "sendToGroup", "text": "Add a one-line code comment to the top of README.md explaining what PolyPilot is. Keep it simple." },
+ { "action": "wait", "ms": 10000, "note": "Wait for orchestrator to plan and dispatch" },
+ { "action": "assertEventLog", "pattern": "\\[DISPATCH-ROUTE\\].*Orchestrator", "expected": true },
+ { "action": "assertEventLog", "pattern": "\\[DISPATCH\\].*Dispatching", "expected": true },
+ { "action": "assertEventLog", "pattern": "\\[SEND\\].*worker", "expected": true }
+ ]
+ },
+ {
+ "id": "dispatch-stagger-delay",
+ "name": "Workers dispatched with stagger delay",
+ "description": "Verify workers are dispatched approximately 1 second apart, not all at once.",
+ "category": "dispatch",
+ "invariants": [
+ "Worker [SEND] timestamps are ~1s apart"
+ ],
+ "steps": [
+ { "action": "shellCheck", "command": "grep '\\[SEND\\].*worker' ~/.polypilot/event-diagnostics.log | tail -3", "note": "Verify timestamps show staggered dispatch" }
+ ]
+ },
+ {
+ "id": "pending-orchestration-saved-during-dispatch",
+ "name": "PendingOrchestration file created during dispatch",
+ "description": "During worker execution, verify pending-orchestration.json exists with correct data.",
+ "category": "dispatch",
+ "invariants": [
+ "File exists at ~/.polypilot/pending-orchestration.json",
+ "Contains correct group ID and worker names"
+ ],
+ "steps": [
+ { "action": "shellCheck", "command": "cat ~/.polypilot/pending-orchestration.json 2>/dev/null | head -5", "expect": { "contains": "worker" } }
+ ]
+ },
+ {
+ "id": "orchestrator-plans-with-worker-blocks",
+ "name": "Orchestrator planning includes @worker blocks",
+ "description": "Verify the orchestrator's response contains @worker task assignments.",
+ "category": "dispatch",
+ "invariants": [
+ "Event log shows 'Early dispatch: @worker blocks detected'",
+ "Assignments parsed from orchestrator response"
+ ],
+ "steps": [
+ { "action": "assertEventLog", "pattern": "Early dispatch.*@worker", "expected": true }
+ ]
+ },
+ {
+ "id": "workers-process-independently",
+ "name": "Workers process their tasks independently",
+ "description": "Monitor workers during execution. Each should cycle through TurnStart/TurnEnd events independently.",
+ "category": "worker-execution",
+ "invariants": [
+ "Each worker has AssistantTurnStartEvent and AssistantTurnEndEvent in event log",
+ "Workers process concurrently"
+ ],
+ "steps": [
+ { "action": "waitForEventPattern", "pattern": "\\[EVT\\].*worker.*TurnStart", "timeout": 30, "note": "Wait for workers to start processing" },
+ { "action": "assertEventLog", "pattern": "\\[EVT\\].*worker.*TurnEndEvent", "expected": true, "note": "Workers cycling through events" }
+ ]
+ },
+ {
+ "id": "worker-tool-use-completes",
+ "name": "Workers using tools complete without getting stuck",
+ "description": "Workers often use tools (grep, file reads). Verify tool execution completes.",
+ "category": "worker-execution",
+ "invariants": [
+ "Workers complete (SessionIdleEvent or CompleteResponse)",
+ "No workers stuck with HasUsedToolsThisTurn=true indefinitely"
+ ],
+ "steps": [
+ { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*worker", "timeout": 600, "note": "Wait for at least one worker to complete" }
+ ]
+ },
+ {
+ "id": "worker-subagent-idle-defer",
+ "name": "Worker IDLE-DEFER fires and resolves for subagents",
+ "description": "If a worker dispatches sub-agents (task tool), IDLE-DEFER should fire and eventually resolve.",
+ "category": "worker-execution",
+ "invariants": [
+ "If [IDLE-DEFER] appears, it's eventually followed by [COMPLETE]",
+ "Worker doesn't stay stuck in IDLE-DEFER forever"
+ ],
+ "steps": [
+ { "action": "note", "text": "This scenario is best verified by checking event log after orchestration completes" },
+ { "action": "shellCheck", "command": "defer_count=$(grep -c '\\[IDLE-DEFER\\]' ~/.polypilot/event-diagnostics.log); complete_count=$(grep -c '\\[COMPLETE\\]' ~/.polypilot/event-diagnostics.log); echo \"defers=$defer_count completes=$complete_count\"" }
+ ]
+ },
+ {
+ "id": "worker-timeout-handled-gracefully",
+ "name": "Worker timeout produces error result, doesn't block orchestration",
+ "description": "If a worker times out (>10 min), the orchestration should still continue with an error result.",
+ "category": "worker-execution",
+ "invariants": [
+ "Orchestration doesn't hang forever waiting for a dead worker",
+ "Synthesis includes failure message for timed-out worker"
+ ],
+ "steps": [
+ { "action": "note", "text": "Difficult to force a timeout in live testing. Verify by checking event log for any [WATCHDOG] entries on workers and confirming orchestration still completed." }
+ ]
+ },
+ {
+ "id": "worker-error-doesnt-block-orchestration",
+ "name": "One worker error doesn't block the whole orchestration",
+ "description": "If one worker fails, the other worker should still complete and synthesis should include the failure.",
+ "category": "worker-execution",
+ "invariants": [
+ "Orchestration completes even if one worker errored",
+ "Synthesis prompt includes error message for failed worker"
+ ],
+ "steps": [
+ { "action": "note", "text": "Verify from event log: if any [ERROR] entries exist for workers, check that [DISPATCH] Collected results still fires" }
+ ]
+ },
+ {
+ "id": "synthesis-collects-all-worker-results",
+ "name": "Synthesis includes all worker responses",
+ "description": "After workers complete, verify the synthesis prompt is sent to the orchestrator with all results.",
+ "category": "synthesis",
+ "invariants": [
+ "Event log shows 'Collected N/N worker results for synthesis'",
+ "All workers accounted for"
+ ],
+ "steps": [
+ { "action": "waitForEventPattern", "pattern": "Collected.*worker results for synthesis", "timeout": 600 },
+ { "action": "assertEventLog", "pattern": "Collected.*worker results", "expected": true }
+ ]
+ },
+ {
+ "id": "synthesis-completes-orchestrator",
+ "name": "Orchestrator completes after synthesis",
+ "description": "After synthesis prompt is sent, the orchestrator should process it and complete.",
+ "category": "synthesis",
+ "invariants": [
+ "Orchestrator shows [SEND] for synthesis prompt",
+ "Orchestrator shows [COMPLETE] after synthesis",
+ "IsProcessing = false on orchestrator"
+ ],
+ "steps": [
+ { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 120, "note": "Wait for synthesis completion" },
+ { "action": "assertProcessing", "sessionContains": "orchestrator", "expected": false }
+ ]
+ },
+ {
+ "id": "pending-orchestration-cleared-after-synthesis",
+ "name": "PendingOrchestration file cleared after synthesis",
+ "description": "After orchestration completes, pending-orchestration.json should be empty or deleted.",
+ "category": "synthesis",
+ "invariants": [
+ "File is empty, contains '{}' or 'null', or doesn't exist"
+ ],
+ "steps": [
+ { "action": "shellCheck", "command": "cat ~/.polypilot/pending-orchestration.json 2>/dev/null || echo 'file not found'", "expect": { "oneOf": ["file not found", "{}", "null", ""] } }
+ ]
+ },
+ {
+ "id": "all-workers-idle-after-orchestration",
+ "name": "All workers have IsProcessing=false after orchestration",
+ "description": "After orchestration completes, verify every worker session is idle.",
+ "category": "synthesis",
+ "invariants": [
+ "Every worker session has IsProcessing = false",
+ "No stop buttons visible for any worker"
+ ],
+ "steps": [
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "No sessions processing" }
+ ]
+ },
+ {
+ "id": "no-orphaned-workers-after-orchestration",
+ "name": "No orphaned worker state after orchestration",
+ "description": "After orchestration, no workers should be stuck in a partial processing state.",
+ "category": "synthesis",
+ "invariants": [
+ "No worker [SEND] without corresponding [COMPLETE]",
+ "No stuck TCS (TaskCompletionSource)"
+ ],
+ "steps": [
+ { "action": "shellCheck", "command": "for w in 1 2; do sends=$(grep -c \"\\[SEND\\].*worker-$w\" ~/.polypilot/event-diagnostics.log); completes=$(grep -c \"\\[COMPLETE\\].*worker-$w\" ~/.polypilot/event-diagnostics.log); echo \"worker-$w: sends=$sends completes=$completes\"; done" }
+ ]
+ },
+ {
+ "id": "second-orchestration-works",
+ "name": "Second orchestration to same group completes",
+ "description": "After one orchestration completes, send a new prompt to the same group. Verify full lifecycle works again.",
+ "category": "subsequent-orchestrations",
+ "invariants": [
+ "Second dispatch fires",
+ "Workers process again",
+ "Synthesis completes again",
+ "IsProcessing = false on all sessions"
+ ],
+ "steps": [
+ { "action": "sendToGroup", "text": "Now add a comment to the bottom of README.md saying 'Built with GitHub Copilot'. Keep it simple." },
+ { "action": "waitForEventPattern", "pattern": "\\[DISPATCH\\].*Dispatching", "timeout": 60, "note": "Wait for second dispatch" },
+ { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600, "note": "Wait for second orchestration to complete" },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 }
+ ]
+ },
+ {
+ "id": "third-orchestration-no-degradation",
+ "name": "Third orchestration shows no accumulated state issues",
+ "description": "A third consecutive orchestration should work as cleanly as the first.",
+ "category": "subsequent-orchestrations",
+ "invariants": [
+ "No accumulated stuck state from previous orchestrations",
+ "Event log clean for this orchestration"
+ ],
+ "steps": [
+ { "action": "sendToGroup", "text": "Check if README.md has any typos and fix them. If none, just say so." },
+ { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600 },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 }
+ ]
+ },
+ {
+ "id": "orchestration-after-worker-error-works",
+ "name": "Orchestration works after a previous worker error",
+ "description": "If a previous orchestration had a worker error, the next one should still work.",
+ "category": "subsequent-orchestrations",
+ "invariants": [
+ "All workers dispatch correctly",
+ "No lingering error state from previous run"
+ ],
+ "steps": [
+ { "action": "note", "text": "This is verified by the second/third orchestration tests above. If any had errors, this confirms recovery." }
+ ]
+ },
+ {
+ "id": "follow-up-during-dispatch-queued",
+ "name": "Follow-up message during orchestration is queued, not steered",
+ "description": "Send a follow-up while workers are busy. It should be queued (not steer the orchestrator). PR #375 fix.",
+ "category": "follow-up-steering",
+ "invariants": [
+ "Event log shows QUEUED, not STEER",
+ "Orchestration not disrupted by follow-up",
+ "Follow-up processed after current orchestration completes"
+ ],
+ "steps": [
+ { "action": "sendToGroup", "text": "Review all the files in PolyPilot/Models/ directory." },
+ { "action": "wait", "ms": 5000, "note": "Workers start processing" },
+ { "action": "sendToGroup", "text": "Also check the Services directory." },
+ { "action": "assertEventLog", "pattern": "QUEUED", "expected": true, "note": "Follow-up queued" },
+ { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600 }
+ ]
+ },
+ {
+ "id": "follow-up-after-completion-works",
+ "name": "Follow-up after orchestration completes starts new orchestration",
+ "description": "Send a message after orchestration is fully complete. It should start a new orchestration.",
+ "category": "follow-up-steering",
+ "invariants": [
+ "New [DISPATCH-ROUTE] appears",
+ "New orchestration lifecycle begins"
+ ],
+ "steps": [
+ { "action": "assertProcessing", "sessionContains": "orchestrator", "expected": false, "note": "Verify idle first" },
+ { "action": "sendToGroup", "text": "Summarize what changes were made across all previous tasks." },
+ { "action": "waitForEventPattern", "pattern": "\\[DISPATCH-ROUTE\\]", "timeout": 60 },
+ { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600 }
+ ]
+ },
+ {
+ "id": "abort-orchestration-cleans-up",
+ "name": "Aborting orchestration cleans up all sessions",
+ "description": "Abort the orchestrator while workers are running. Verify clean state for all sessions.",
+ "category": "follow-up-steering",
+ "invariants": [
+ "All sessions reach IsProcessing = false",
+ "No stuck workers",
+ "PendingOrchestration cleared"
+ ],
+ "steps": [
+ { "action": "sendToGroup", "text": "Do a comprehensive review of every single file in the repository." },
+ { "action": "wait", "ms": 10000, "note": "Workers start processing" },
+ { "action": "clickAbort", "sessionContains": "orchestrator" },
+ { "action": "wait", "ms": 10000 },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "All sessions idle after abort" }
+ ]
+ },
+ {
+ "id": "group-survives-relaunch",
+ "name": "Multi-agent group persists across app relaunch",
+ "description": "Relaunch the app and verify the multi-agent group is restored with all sessions.",
+ "category": "restart-recovery",
+ "invariants": [
+ "Group appears in sidebar after relaunch",
+ "Orchestrator session restored",
+ "Worker sessions restored",
+ "All sessions idle (not stuck processing)"
+ ],
+ "steps": [
+ { "action": "captureGroupState", "capture": "beforeRelaunch" },
+ { "action": "relaunchApp" },
+ { "action": "waitForAgent", "timeout": 120 },
+ { "action": "assertSessionExists", "nameContains": "orchestrator" },
+ { "action": "assertSessionExists", "nameContains": "worker" },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 }
+ ]
+ },
+ {
+ "id": "no-phantom-worker-sessions",
+ "name": "No phantom or duplicate worker sessions after relaunch",
+ "description": "After relaunch, verify no duplicate worker sessions or phantom '(resumed)' entries.",
+ "category": "restart-recovery",
+ "invariants": [
+ "No duplicate session names",
+ "No '(resumed)' in any session name"
+ ],
+ "steps": [
+ { "action": "evaluate", "script": "Array.from(document.querySelectorAll('.session-name, .session-title')).filter(e => e.textContent.includes('resumed')).length", "expect": 0 }
+ ]
+ },
+ {
+ "id": "pending-orchestration-recovery",
+ "name": "PendingOrchestration recovers after relaunch during dispatch",
+ "description": "If the app relaunches while workers are processing, PendingOrchestration should enable recovery.",
+ "category": "restart-recovery",
+ "invariants": [
+ "If pending-orchestration.json existed, recovery was attempted",
+ "Workers either completed or were collected with partial results"
+ ],
+ "steps": [
+ { "action": "note", "text": "Best tested by: 1) Start orchestration, 2) Immediately relaunch, 3) Check event log for 'Resuming pending orchestration'" },
+ { "action": "shellCheck", "command": "{ grep 'Resuming pending orchestration\\|pending orchestration' ~/.polypilot/event-diagnostics.log || echo 'no pending recovery'; } | tail -3" }
+ ]
+ },
+ {
+ "id": "post-relaunch-new-orchestration-works",
+ "name": "New orchestration works after relaunch",
+ "description": "After relaunching and restoring the group, send a new prompt. Verify full lifecycle works.",
+ "category": "restart-recovery",
+ "invariants": [
+ "New dispatch fires",
+ "Workers process and complete",
+ "Synthesis completes"
+ ],
+ "steps": [
+ { "action": "sendToGroup", "text": "What is in the README.md file? Summarize it briefly." },
+ { "action": "waitForEventPattern", "pattern": "\\[DISPATCH\\]", "timeout": 60 },
+ { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600 },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 }
+ ]
+ },
+ {
+ "id": "workers-not-stuck-after-relaunch",
+ "name": "No workers stuck in IsProcessing after relaunch",
+ "description": "After relaunch, verify no worker sessions are stuck processing.",
+ "category": "restart-recovery",
+ "invariants": [
+ "All workers have IsProcessing = false within 30s of relaunch",
+ "No [WATCHDOG] kill entries needed"
+ ],
+ "steps": [
+ { "action": "wait", "ms": 30000, "note": "Wait for any watchdog cleanup" },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 }
+ ]
+ },
+ {
+ "id": "event-log-dispatch-complete-pairs",
+ "name": "Every [DISPATCH] has completion in event log",
+ "description": "Audit the event log: every worker dispatch should have a completion.",
+ "category": "state-hygiene",
+ "invariants": [
+ "Number of worker [SEND] ≤ worker [COMPLETE] + [ABORT]",
+ "No orphaned dispatches"
+ ],
+ "steps": [
+ { "action": "shellCheck", "command": "sends=$(grep -c '\\[SEND\\].*worker' ~/.polypilot/event-diagnostics.log); completes=$(grep -c '\\[COMPLETE\\].*worker\\|\\[ABORT\\].*worker' ~/.polypilot/event-diagnostics.log); echo \"worker sends=$sends completions=$completes\"" }
+ ]
+ },
+ {
+ "id": "no-stuck-tcs-after-orchestration",
+ "name": "No unresolved TaskCompletionSources after orchestration",
+ "description": "After orchestration, verify no TCS objects are waiting for completion.",
+ "category": "state-hygiene",
+ "invariants": [
+ "All sessions idle",
+ "No pending awaits blocking orchestrator"
+ ],
+ "steps": [
+ { "action": "assertProcessing", "sessionContains": "orchestrator", "expected": false },
+ { "action": "note", "text": "TCS state is internal — best verified by confirming orchestrator can accept new messages" },
+ { "action": "sendToGroup", "text": "Say 'TCS test OK'." },
+ { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 120 }
+ ]
+ },
+ {
+ "id": "watchdog-doesnt-kill-active-workers",
+ "name": "Watchdog doesn't kill workers that are actively processing",
+ "description": "During a long orchestration, verify the watchdog doesn't prematurely kill active workers.",
+ "category": "state-hygiene",
+ "invariants": [
+ "No [WATCHDOG].*kill entries for workers with recent events",
+ "Watchdog Case B defers correctly when events.jsonl is fresh"
+ ],
+ "steps": [
+ { "action": "shellCheck", "command": "grep '\\[WATCHDOG\\].*worker.*kill\\|\\[WATCHDOG\\].*worker.*stuck' ~/.polypilot/event-diagnostics.log | wc -l | tr -d ' '", "expect": "0" }
+ ]
+ },
+ {
+ "id": "idle-defer-resolves-correctly",
+ "name": "IDLE-DEFER fires and resolves without leaving sessions stuck",
+ "description": "If IDLE-DEFER fires during worker execution, it should eventually resolve to completion.",
+ "category": "state-hygiene",
+ "invariants": [
+ "Every [IDLE-DEFER] is eventually followed by [COMPLETE] for the same session",
+ "No sessions left in permanent IDLE-DEFER state"
+ ],
+ "steps": [
+ { "action": "shellCheck", "command": "grep '\\[IDLE-DEFER\\]' ~/.polypilot/event-diagnostics.log | wc -l | tr -d ' '", "note": "Count IDLE-DEFER entries" },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "No sessions stuck" }
+ ]
+ },
+ {
+ "id": "reconnect-during-orchestration-no-deadlock",
+ "name": "Reconnect during orchestration doesn't cause deadlock",
+ "description": "If a reconnect happens mid-orchestration, TCS should be cancelled and orchestration should not deadlock.",
+ "category": "state-hygiene",
+ "invariants": [
+ "No permanent deadlock after reconnect",
+ "Orchestration either recovers or fails cleanly"
+ ],
+ "steps": [
+ { "action": "note", "text": "Best tested by triggering a reconnect mid-orchestration (e.g., server restart). In live testing, verify no sessions are stuck after any reconnect events in the log." },
+ { "action": "shellCheck", "command": "{ grep '\\[RECONNECT\\]' ~/.polypilot/event-diagnostics.log || echo 'no reconnects'; } | tail -3" }
+ ]
+ },
+ {
+ "id": "delete-group-cleans-everything",
+ "name": "Deleting multi-agent group removes all sessions cleanly",
+ "description": "Delete the test multi-agent group. Verify all sessions are removed with no orphans.",
+ "category": "state-hygiene",
+ "invariants": [
+ "All group sessions removed from sidebar",
+ "No orphaned worker sessions",
+ "PendingOrchestration cleared",
+ "Organization state updated"
+ ],
+ "steps": [
+ { "action": "deleteGroup", "nameContains": "Implement" },
+ { "action": "wait", "ms": 3000 },
+ { "action": "assertSessionNotExists", "nameContains": "orchestrator" },
+ { "action": "assertSessionNotExists", "nameContains": "worker" },
+ { "action": "shellCheck", "command": "cat ~/.polypilot/pending-orchestration.json 2>/dev/null || echo 'clean'", "expect": { "oneOf": ["clean", "{}", "null", ""] } }
+ ]
+ }
+ ]
+}
diff --git a/PolyPilot.Tests/Scenarios/single-session-scenarios.json b/PolyPilot.Tests/Scenarios/single-session-scenarios.json
new file mode 100644
index 000000000..da5567f72
--- /dev/null
+++ b/PolyPilot.Tests/Scenarios/single-session-scenarios.json
@@ -0,0 +1,633 @@
+{
+ "description": "Single-session reliability scenarios for PolyPilot. Tests cover every way to interact with a single session: sending messages, tool use, abort, rapid input, session lifecycle, restart recovery, state hygiene, and edge cases. Execute against a running PolyPilot instance using MauiDevFlow CDP commands.",
+ "prerequisites": {
+ "build": "cd PolyPilot && ./relaunch.sh",
+ "waitForAgent": "maui devflow MAUI status --agent-port 9223",
+ "initialMode": "Persistent",
+ "notes": "All test sessions MUST point to the PolyPilot repository (PureWeen-PolyPilot). Never use other repos. Never commit, push, or create GitHub issues/PRs during testing. The DevFlow agent port is 9223 (bypass broker with --agent-port 9223). CDP click events require PointerEvent dispatch for Blazor compatibility."
+ },
+ "cdpHelpers": {
+ "clickBlazorButton": "btn.dispatchEvent(new PointerEvent('pointerdown', {bubbles: true, clientX: x, clientY: y})); btn.dispatchEvent(new PointerEvent('pointerup', {bubbles: true, clientX: x, clientY: y})); btn.dispatchEvent(new MouseEvent('click', {bubbles: true, clientX: x, clientY: y}));",
+ "setInputValue": "const el = document.getElementById(inputId); el.value = text; el.dispatchEvent(new Event('input', {bubbles: true}));",
+ "sendMessage": "1) Set textarea value via getElementValue-compatible path, 2) PointerEvent click on send-btn[title='Send message']",
+ "checkProcessing": "document.querySelectorAll('.send-btn.stop-btn').length > 0 means session is processing",
+ "checkEventLog": "grep 'SESSION_NAME' ~/.polypilot/event-diagnostics.log | tail -N"
+ },
+ "scenarios": [
+ {
+ "id": "send-simple-message",
+ "name": "Simple message sends and receives a response",
+ "description": "Send 'What is 2+2? Reply with just the number.' and verify the response arrives, IsProcessing clears, and event log shows clean [SEND]→[COMPLETE].",
+ "category": "basic-message-flow",
+ "invariants": [
+ "Response text appears in chat history",
+ "IsProcessing = false after completion",
+ "Event log shows [SEND] followed by [COMPLETE]",
+ "No [ERROR] or [WATCHDOG] entries for this session",
+ "Response length > 0"
+ ],
+ "steps": [
+ { "action": "createSession", "name": "TestSimpleSend", "repo": "PureWeen-PolyPilot" },
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "What is 2+2? Reply with just the number." },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[SEND\\].*\\[COMPLETE\\]", "expected": true },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[ERROR\\]|\\[WATCHDOG\\]", "expected": false },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertResponseNotEmpty", "session": "TestSimpleSend" }
+ ]
+ },
+ {
+ "id": "send-long-message",
+ "name": "Long multi-paragraph prompt processes correctly",
+ "description": "Send a multi-paragraph prompt to verify large inputs don't break processing.",
+ "category": "basic-message-flow",
+ "invariants": [
+ "Response arrives for large input",
+ "IsProcessing = false after completion",
+ "No truncation or corruption of input"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Please analyze the following requirements and provide your thoughts:\n\n1. The system must handle concurrent requests\n2. It must maintain session state across restarts\n3. Error recovery should be automatic\n4. All state transitions must be logged\n\nWhat design patterns would you recommend?" },
+ { "action": "waitForIdle", "timeout": 120 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true }
+ ]
+ },
+ {
+ "id": "response-renders-markdown",
+ "name": "Response renders markdown as formatted HTML",
+ "description": "Send a prompt requesting markdown output and verify it renders as HTML.",
+ "category": "basic-message-flow",
+ "invariants": [
+ "Response contains HTML elements (not raw markdown)",
+ "Code blocks, headers, or lists render correctly"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Write a short bullet list of 3 items about testing." },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "evaluate", "script": "document.querySelectorAll('.message-content li, .message-content ul').length > 0", "expect": true, "note": "Markdown list rendered as HTML" }
+ ]
+ },
+ {
+ "id": "response-contains-code-block",
+ "name": "Code block renders with syntax highlighting",
+ "description": "Request code output and verify it renders in a code block.",
+ "category": "basic-message-flow",
+ "invariants": [
+ "Response contains a <pre> or <code> element",
+ "Code content is present"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Write a one-line Python hello world program. Just the code, no explanation." },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "evaluate", "script": "document.querySelectorAll('.message-content pre, .message-content code').length > 0", "expect": true, "note": "Code block rendered" }
+ ]
+ },
+ {
+ "id": "send-empty-validation",
+ "name": "Empty send is prevented",
+ "description": "Attempt to send an empty message and verify it's blocked.",
+ "category": "basic-message-flow",
+ "invariants": [
+ "No [SEND] entry in event log for empty message",
+ "Session remains idle"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "captureEventLogPosition", "capture": "beforeEmpty" },
+ { "action": "sendMessage", "text": "" },
+ { "action": "wait", "ms": 2000 },
+ { "action": "assertNoNewEvents", "session": "TestSimpleSend", "since": "beforeEmpty", "pattern": "\\[SEND\\]" }
+ ]
+ },
+ {
+ "id": "tool-file-listing",
+ "name": "Tool execution: file listing completes",
+ "description": "Send a file listing request that requires tool use. Verify tools execute and response contains expected files.",
+ "category": "tool-execution",
+ "invariants": [
+ "Tool execution events appear in event log",
+ "Response mentions expected files (README.md, PolyPilot.slnx)",
+ "IsProcessing = false after completion",
+ "No [WATCHDOG] entries"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "List just the file names in the root directory of this repository. Be brief." },
+ { "action": "waitForIdle", "timeout": 120 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[WATCHDOG\\]", "expected": false }
+ ]
+ },
+ {
+ "id": "tool-file-read",
+ "name": "Tool execution: file read completes",
+ "description": "Request reading a specific file to verify file read tools work.",
+ "category": "tool-execution",
+ "invariants": [
+ "Response contains content from the file",
+ "IsProcessing = false after completion"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Read the first line of README.md in this repo and tell me what it says. Be brief." },
+ { "action": "waitForIdle", "timeout": 120 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true }
+ ]
+ },
+ {
+ "id": "tool-multiple-rounds",
+ "name": "Multi-round tool use completes without getting stuck",
+ "description": "Send a request requiring multiple tool calls in sequence. Verify all rounds complete.",
+ "category": "tool-execution",
+ "invariants": [
+ "Multiple TurnStart/TurnEnd cycles in event log",
+ "Session completes with [COMPLETE]",
+ "IsProcessing = false after all rounds"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Find the PolyPilot.slnx file and tell me how many project references it contains. Be brief." },
+ { "action": "waitForIdle", "timeout": 180 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[WATCHDOG\\]", "expected": false }
+ ]
+ },
+ {
+ "id": "tool-execution-shows-progress",
+ "name": "Processing indicator shows during tool use",
+ "description": "During tool execution, verify the UI shows working/tool call indicators.",
+ "category": "tool-execution",
+ "invariants": [
+ "Stop button visible during processing",
+ "Processing indicator updates"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Search the codebase for all files containing 'IsProcessing' and count them. Be brief." },
+ { "action": "wait", "ms": 5000, "note": "Allow time for tool execution to start" },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn').length", "expect": { "greaterThan": 0 }, "note": "Stop button visible means processing" },
+ { "action": "waitForIdle", "timeout": 180 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+ ]
+ },
+ {
+ "id": "tool-long-running",
+ "name": "Long tool execution doesn't trigger false watchdog",
+ "description": "A prompt requiring extended tool use completes without watchdog interference.",
+ "category": "tool-execution",
+ "invariants": [
+ "No [WATCHDOG] kill entries in event log",
+ "Session completes naturally with [COMPLETE]"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Count the total number of lines across all .cs files in the PolyPilot/Services/ directory. Be brief, just give me the count." },
+ { "action": "waitForIdle", "timeout": 300 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[WATCHDOG\\].*kill|\\[WATCHDOG\\].*stuck", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true }
+ ]
+ },
+ {
+ "id": "abort-during-thinking",
+ "name": "Abort during 'Thinking...' phase cleans up state",
+ "description": "Send a prompt, abort during the thinking phase. Verify IsProcessing clears and all companion fields reset.",
+ "category": "abort-cancel",
+ "invariants": [
+ "IsProcessing = false within 5 seconds of abort",
+ "No lingering spinner or 'Thinking...' indicator",
+ "Event log shows [ABORT]",
+ "Session accepts new input after abort"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Write a very detailed 5000-word essay about software testing methodologies." },
+ { "action": "wait", "ms": 3000, "note": "Wait for thinking phase" },
+ { "action": "clickAbort", "session": "TestSimpleSend" },
+ { "action": "wait", "ms": 5000 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[ABORT\\]", "expected": true }
+ ]
+ },
+ {
+ "id": "abort-during-tool-use",
+ "name": "Abort during tool execution cleans up state",
+ "description": "Send a tool-using prompt, abort during tool execution. Verify clean state.",
+ "category": "abort-cancel",
+ "invariants": [
+ "IsProcessing = false after abort",
+ "ActiveToolCallCount = 0",
+ "Event log shows [ABORT]"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Read every single .cs file in the repository and summarize each one in detail." },
+ { "action": "wait", "ms": 10000, "note": "Wait for tool execution to start" },
+ { "action": "clickAbort", "session": "TestSimpleSend" },
+ { "action": "wait", "ms": 5000 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[ABORT\\]", "expected": true }
+ ]
+ },
+ {
+ "id": "abort-then-send-new",
+ "name": "Session accepts new message after abort",
+ "description": "Abort a session, then immediately send a new prompt. Verify no deadlock.",
+ "category": "abort-cancel",
+ "invariants": [
+ "New message processes successfully after abort",
+ "IsProcessing = false after new message completes",
+ "No deadlock or stuck state"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Write a very long detailed analysis of every design pattern ever created." },
+ { "action": "wait", "ms": 5000 },
+ { "action": "clickAbort", "session": "TestSimpleSend" },
+ { "action": "wait", "ms": 3000 },
+ { "action": "sendMessage", "text": "Say 'hello'. Just that one word." },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true }
+ ]
+ },
+ {
+ "id": "abort-when-idle-no-crash",
+ "name": "Abort on idle session doesn't crash",
+ "description": "Try to abort a session that's already idle. Verify no crash or error.",
+ "category": "abort-cancel",
+ "invariants": [
+ "No crash or error",
+ "Session remains idle and functional"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "clickAbort", "session": "TestSimpleSend", "note": "Abort when already idle" },
+ { "action": "wait", "ms": 2000 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "sendMessage", "text": "Say 'still working'. Just those two words." },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+ ]
+ },
+ {
+ "id": "back-to-back-simple",
+ "name": "Back-to-back messages both complete",
+ "description": "Send message 1, wait for completion, immediately send message 2. Both should complete.",
+ "category": "rapid-input",
+ "invariants": [
+ "Both messages get [COMPLETE] in event log",
+ "IsProcessing = false after both complete",
+ "No deadlock"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Say 'first'. Just that word." },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "sendMessage", "text": "Say 'second'. Just that word." },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "count": { "atLeast": 2 } }
+ ]
+ },
+ {
+ "id": "send-while-processing",
+ "name": "Send during processing is handled correctly",
+ "description": "Send a second message while the first is still processing. Verify no crash, second message eventually processes.",
+ "category": "rapid-input",
+ "invariants": [
+ "No crash or exception",
+ "Both messages eventually complete or second is queued",
+ "Session reaches idle state"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "List every .cs file in the PolyPilot/Services directory." },
+ { "action": "wait", "ms": 2000, "note": "First message starts processing" },
+ { "action": "sendMessage", "text": "Say 'queued message received'." },
+ { "action": "waitForIdle", "timeout": 180 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+ ]
+ },
+ {
+ "id": "rapid-three-messages",
+ "name": "Three rapid messages don't cause permanent stuck state",
+ "description": "Send 3 messages in quick succession. Verify session eventually reaches idle.",
+ "category": "rapid-input",
+ "invariants": [
+ "Session eventually reaches idle (IsProcessing = false)",
+ "No permanent stuck state",
+ "Event log has at least one [COMPLETE]"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Say 'one'." },
+ { "action": "wait", "ms": 500 },
+ { "action": "sendMessage", "text": "Say 'two'." },
+ { "action": "wait", "ms": 500 },
+ { "action": "sendMessage", "text": "Say 'three'." },
+ { "action": "waitForIdle", "timeout": 180 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+ ]
+ },
+ {
+ "id": "create-new-session",
+ "name": "Create new session appears in sidebar",
+ "description": "Create a new session and verify it appears in the session list.",
+ "category": "session-lifecycle",
+ "invariants": [
+ "New session appears in sidebar",
+ "Session is empty (no messages)",
+ "Session accepts input"
+ ],
+ "steps": [
+ { "action": "createSession", "name": "TestLifecycle", "repo": "PureWeen-PolyPilot" },
+ { "action": "assertSessionExists", "name": "TestLifecycle" },
+ { "action": "selectSession", "name": "TestLifecycle" },
+ { "action": "assertInputAvailable", "session": "TestLifecycle" }
+ ]
+ },
+ {
+ "id": "create-session-with-repo-context",
+ "name": "Session with repo context has tool access",
+ "description": "Create a session pointed at PolyPilot repo, verify it can use tools.",
+ "category": "session-lifecycle",
+ "invariants": [
+ "Session has repo context",
+ "Tool calls work (file listing succeeds)"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestLifecycle" },
+ { "action": "sendMessage", "text": "What repository am I in? Be brief." },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "assertProcessing", "session": "TestLifecycle", "expected": false }
+ ]
+ },
+ {
+ "id": "switch-between-sessions",
+ "name": "Switching sessions preserves each one's history",
+ "description": "Switch between two sessions and verify each retains its own chat history.",
+ "category": "session-lifecycle",
+ "invariants": [
+ "Each session shows its own history after switching",
+ "No cross-contamination between sessions"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "assertResponseNotEmpty", "session": "TestSimpleSend", "note": "First session has history" },
+ { "action": "selectSession", "name": "TestLifecycle" },
+ { "action": "wait", "ms": 1000 },
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "assertResponseNotEmpty", "session": "TestSimpleSend", "note": "History preserved after switch" }
+ ]
+ },
+ {
+ "id": "close-session-removed",
+ "name": "Closed session doesn't appear after removal",
+ "description": "Close a session and verify it's removed from the sidebar.",
+ "category": "session-lifecycle",
+ "invariants": [
+ "Session removed from sidebar",
+ "Not restored on next check"
+ ],
+ "steps": [
+ { "action": "createSession", "name": "TestClose", "repo": "PureWeen-PolyPilot" },
+ { "action": "assertSessionExists", "name": "TestClose" },
+ { "action": "closeSession", "name": "TestClose" },
+ { "action": "wait", "ms": 2000 },
+ { "action": "assertSessionNotExists", "name": "TestClose" }
+ ]
+ },
+ {
+ "id": "session-history-persists-to-disk",
+ "name": "Chat history persists across interactions",
+ "description": "Send a message, switch to another session and back, then verify the message is still present in the rendered chat history.",
+ "category": "session-lifecycle",
+ "invariants": [
+ "Message appears in chat history",
+ "History survives session switch and return"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Say 'persistence test 12345'. Just those exact words." },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "selectSession", "name": "TestLifecycle" },
+ { "action": "wait", "ms": 1000 },
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "evaluate", "script": "document.querySelector('.expanded-view-container')?.textContent?.includes('persistence test 12345') || document.querySelector('.chat-messages')?.textContent?.includes('persistence test 12345')", "expect": true }
+ ]
+ },
+ {
+ "id": "session-survives-relaunch",
+ "name": "Session restores correctly after app relaunch",
+ "description": "Send a message, relaunch the app, verify session is restored with history.",
+ "category": "connection-recovery",
+ "invariants": [
+ "Session appears in sidebar after relaunch",
+ "Chat history is preserved",
+ "Session is idle (not stuck in processing)"
+ ],
+ "steps": [
+ { "action": "captureSessionList", "capture": "beforeRelaunch" },
+ { "action": "relaunchApp" },
+ { "action": "waitForAgent", "timeout": 120 },
+ { "action": "assertSessionExists", "name": "TestSimpleSend" },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+ ]
+ },
+ {
+ "id": "idle-session-no-resumed-tag",
+ "name": "Idle session doesn't get '(resumed)' tag after relaunch",
+ "description": "After relaunch, idle sessions should not have phantom '(resumed)' markers.",
+ "category": "connection-recovery",
+ "invariants": [
+ "No session names contain '(resumed)' text",
+ "All sessions have their original names"
+ ],
+ "steps": [
+ { "action": "evaluate", "script": "Array.from(document.querySelectorAll('.session-name, .session-title')).filter(e => e.textContent.includes('resumed')).length", "expect": 0, "note": "No phantom (resumed) sessions" }
+ ]
+ },
+ {
+ "id": "no-phantom-sessions-after-relaunch",
+ "name": "No duplicate or phantom sessions after relaunch",
+ "description": "After relaunch, verify no duplicate sessions appeared.",
+ "category": "connection-recovery",
+ "invariants": [
+ "Session count matches pre-relaunch count (±1 for any cleaned up)",
+ "No duplicate session names"
+ ],
+ "steps": [
+ { "action": "evaluate", "script": "new Set(Array.from(document.querySelectorAll('.session-name')).map(e => e.textContent.trim())).size === document.querySelectorAll('.session-name').length", "expect": true, "note": "No duplicate session names" }
+ ]
+ },
+ {
+ "id": "post-relaunch-send-works",
+ "name": "Session is functional after relaunch",
+ "description": "After relaunch, send a message to a restored session and verify it completes.",
+ "category": "connection-recovery",
+ "invariants": [
+ "Message sends successfully",
+ "Response arrives",
+ "IsProcessing = false after completion"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Say 'post relaunch OK'. Just those exact words." },
+ { "action": "waitForIdle", "timeout": 120 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true }
+ ]
+ },
+ {
+ "id": "no-stuck-processing-after-relaunch",
+ "name": "No sessions stuck in IsProcessing after relaunch",
+ "description": "After relaunch, verify no sessions are stuck in processing state.",
+ "category": "connection-recovery",
+ "invariants": [
+ "Zero visible stop buttons (no sessions processing)",
+ "No 'Thinking...' indicators visible"
+ ],
+ "steps": [
+ { "action": "wait", "ms": 30000, "note": "Wait for watchdog to clear any stuck sessions" },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "No sessions stuck processing" }
+ ]
+ },
+ {
+ "id": "isprocessing-false-when-idle",
+ "name": "IsProcessing is false on all idle sessions",
+ "description": "Check that every session in the list has IsProcessing = false when no messages are being sent.",
+ "category": "state-hygiene",
+ "invariants": [
+ "No stop buttons visible",
+ "No thinking/working indicators"
+ ],
+ "steps": [
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 },
+ { "action": "evaluate", "script": "document.querySelectorAll('.thinking-indicator, .processing-indicator').length", "expect": 0 }
+ ]
+ },
+ {
+ "id": "event-log-send-complete-pairs",
+ "name": "Every [SEND] has a matching [COMPLETE] or [ABORT]",
+ "description": "Audit the event diagnostics log. Every [SEND] entry should have a corresponding completion.",
+ "category": "state-hygiene",
+ "invariants": [
+ "Number of [SEND] entries ≤ [COMPLETE] + [ABORT] entries (for test sessions)",
+ "No orphaned [SEND] without resolution"
+ ],
+ "steps": [
+ { "action": "shellCheck", "command": "sends=$(grep -c '\\[SEND\\].*TestSimpleSend' ~/.polypilot/event-diagnostics.log); completes=$(grep -c '\\[COMPLETE\\].*TestSimpleSend\\|\\[ABORT\\].*TestSimpleSend' ~/.polypilot/event-diagnostics.log); echo \"sends=$sends completes=$completes\"; [ $sends -le $completes ]" }
+ ]
+ },
+ {
+ "id": "no-watchdog-on-normal-sessions",
+ "name": "Watchdog doesn't fire for normally-completing sessions",
+ "description": "Verify that sessions completing normally don't trigger watchdog entries.",
+ "category": "state-hygiene",
+ "invariants": [
+ "No [WATCHDOG] entries for test sessions",
+ "All completions are via [COMPLETE] or [ABORT]"
+ ],
+ "steps": [
+ { "action": "shellCheck", "command": "grep '\\[WATCHDOG\\].*TestSimpleSend' ~/.polypilot/event-diagnostics.log | grep -v 'deferred' | wc -l | tr -d ' '", "expect": "0" }
+ ]
+ },
+ {
+ "id": "processing-indicator-shows-and-clears",
+ "name": "Processing indicator shows during work and clears after",
+ "description": "Send a message, verify processing indicator appears, then clears after completion.",
+ "category": "state-hygiene",
+ "invariants": [
+ "Processing indicator visible during processing",
+ "Processing indicator gone after completion"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "List the top-level directories in this repository." },
+ { "action": "wait", "ms": 3000 },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length > 0 || true", "expect": true, "note": "May or may not catch the processing window" },
+ { "action": "waitForIdle", "timeout": 120 },
+ { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "Processing indicator cleared" }
+ ]
+ },
+ {
+ "id": "session-not-stuck-after-idle",
+ "name": "No phantom 'Thinking...' after session completes",
+ "description": "After a message completes, verify the session is truly idle with no lingering indicators.",
+ "category": "state-hygiene",
+ "invariants": [
+ "No thinking/working text visible for session",
+ "Input is enabled and accepts typing",
+ "Send button is not a stop button"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+ { "action": "assertInputAvailable", "session": "TestSimpleSend" }
+ ]
+ },
+ {
+ "id": "send-special-characters",
+ "name": "Special characters in prompt don't cause issues",
+ "description": "Send a prompt with quotes, backticks, unicode, and special characters.",
+ "category": "edge-cases",
+ "invariants": [
+ "Message sends successfully",
+ "Response arrives",
+ "No crash or error"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "sendMessage", "text": "Echo this back: Hello 'world' \"test\" `code` & © ñ 🎉" },
+ { "action": "waitForIdle", "timeout": 60 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+ ]
+ },
+ {
+ "id": "model-switch-then-send",
+ "name": "Session works after model switch",
+ "description": "Switch the model for a session, then send a message. Verify it completes.",
+ "category": "edge-cases",
+ "invariants": [
+ "Model switch succeeds",
+ "Message sends and completes with new model",
+ "IsProcessing = false after completion"
+ ],
+ "steps": [
+ { "action": "selectSession", "name": "TestSimpleSend" },
+ { "action": "switchModel", "note": "Switch to a different available model via UI dropdown" },
+ { "action": "sendMessage", "text": "Say 'model switch OK'. Just those words." },
+ { "action": "waitForIdle", "timeout": 120 },
+ { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+ ]
+ },
+ {
+ "id": "stale-session-loads-ok",
+ "name": "Old sessions load without errors",
+ "description": "Verify that existing sessions from before testing load and don't cause errors.",
+ "category": "edge-cases",
+ "invariants": [
+ "Pre-existing sessions visible in sidebar",
+ "No error messages or crash indicators",
+ "App remains responsive"
+ ],
+ "steps": [
+ { "action": "evaluate", "script": "document.querySelectorAll('.session-list-item, .session-card').length > 0", "expect": true, "note": "Sessions loaded" },
+ { "action": "evaluate", "script": "document.querySelectorAll('.error-banner, .crash-indicator, .fatal-error').length", "expect": 0, "note": "No error banners" }
+ ]
+ }
+ ]
+}
diff --git a/PolyPilot.Tests/SessionPersistenceTests.cs b/PolyPilot.Tests/SessionPersistenceTests.cs
index d57c3e022..6dcbb9283 100644
--- a/PolyPilot.Tests/SessionPersistenceTests.cs
+++ b/PolyPilot.Tests/SessionPersistenceTests.cs
@@ -1721,10 +1721,11 @@ public void Merge_NameCollision_DifferentGroupId_WithoutRecoveryMarker_KeepsPrev
}
[Fact]
- public void Merge_NameCollision_SameGroupId_StillCreatesPrevious()
+ public void Merge_NameCollision_SameGroupId_WithoutRecoveryMarker_StillCreatesPrevious()
{
- // When the collision happens within the same group (e.g., reconnect replaced
- // the session), the old entry should still be preserved as "(previous)".
+ // When the collision happens within the same group but there is NO explicit
+ // RecoveredFromSessionId marker, we cannot be sure the new session intentionally
+ // replaced the old one — preserve the old entry as "(previous)" to avoid data loss.
var active = new List
{
new() { SessionId = "new-id", DisplayName = "MyWorker", Model = "m",
@@ -1743,6 +1744,31 @@ public void Merge_NameCollision_SameGroupId_StillCreatesPrevious()
Assert.Equal("MyWorker (previous)", result[1].DisplayName);
}
+ [Fact]
+ public void Merge_NameCollision_SameGroupId_WithExplicitRecovery_DropsPersistedEntry()
+ {
+ // Worker revival (empty response → fresh session) sets RecoveredFromSessionId on the
+ // new session to record that it explicitly replaced the old one. The merge must
+ // drop the old persisted entry so it never appears as a "(previous)" phantom.
+ var active = new List
+ {
+ new() { SessionId = "new-id", DisplayName = "MyWorker", Model = "m",
+ WorkingDirectory = "/w", GroupId = "same-group", RecoveredFromSessionId = "old-id" }
+ };
+ var persisted = new List
+ {
+ new() { SessionId = "old-id", DisplayName = "MyWorker", Model = "m",
+ WorkingDirectory = "/w", GroupId = "same-group" }
+ };
+
+ var result = CopilotService.MergeSessionEntries(active, persisted, new HashSet(), new HashSet(), _ => true);
+
+ // Only the active (revival) entry should remain — no "(previous)" phantom
+ Assert.Single(result);
+ Assert.Equal("new-id", result[0].SessionId);
+ Assert.Equal("MyWorker", result[0].DisplayName);
+ }
+
[Fact]
public void Merge_NameCollision_NullGroupIds_StillCreatesPrevious()
{
diff --git a/PolyPilot.Tests/SessionStabilityTests.cs b/PolyPilot.Tests/SessionStabilityTests.cs
index 434501b9e..a10b680c5 100644
--- a/PolyPilot.Tests/SessionStabilityTests.cs
+++ b/PolyPilot.Tests/SessionStabilityTests.cs
@@ -303,20 +303,13 @@ public void WatchdogCrashRecovery_ClearsAllCompanionFields()
var source = File.ReadAllText(TestPaths.EventsCs);
var watchdogMethod = ExtractMethod(source, "RunProcessingWatchdogAsync");
- // The crash recovery block (Case C kill) must clear companion fields
- var companionFields = new[]
- {
- "IsProcessing = false",
- "ProcessingPhase",
- "ProcessingStartedAt",
- "ToolCallCount",
- };
-
- foreach (var field in companionFields)
- {
- Assert.True(watchdogMethod.Contains(field, StringComparison.Ordinal),
- $"Watchdog crash recovery must clear '{field}'");
- }
+ // The crash recovery block must call ClearProcessingState (which atomically
+ // clears IsProcessing, ProcessingPhase, ProcessingStartedAt, ToolCallCount, etc.)
+ Assert.True(watchdogMethod.Contains("ClearProcessingState(state", StringComparison.Ordinal),
+ "Watchdog crash recovery must call ClearProcessingState to atomically clear all companion fields");
+ // Must also set AllowTurnStartRearm = false (terminal forced stop)
+ Assert.True(watchdogMethod.Contains("AllowTurnStartRearm = false", StringComparison.Ordinal),
+ "Watchdog crash recovery must set AllowTurnStartRearm = false");
}
// ─── Multi-Agent Fix Prompt Enhancement ───
diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs
index 72281e121..bb75f3108 100644
--- a/PolyPilot/Services/CopilotService.Events.cs
+++ b/PolyPilot/Services/CopilotService.Events.cs
@@ -1581,20 +1581,9 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
$"(responseLen={state.CurrentResponse.Length}, flushedLen={state.FlushedResponse.Length}, thread={Environment.CurrentManagedThreadId})");
CancelProcessingWatchdog(state);
- // Also cancel any pending TurnEnd→Idle fallback — CompleteResponse is now executing
- CancelTurnEndFallback(state);
- CancelToolHealthCheck(state);
- Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
- Interlocked.Exchange(ref state.SendingFlag, 0);
- state.HasUsedToolsThisTurn = false;
- ClearDeferredIdleTracking(state);
- state.IsReconnectedSend = false; // Clear reconnect flag on turn completion (defense-in-depth)
state.FallbackCanceledByTurnStart = false;
- Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
- Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
- Interlocked.Exchange(ref state.EventCountThisTurn, 0);
- Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
- state.Info.IsResumed = false; // Clear after first successful turn
+ // Per-turn tracking fields (ActiveToolCallCount, HasUsedToolsThisTurn, etc.)
+ // are cleared by ClearProcessingState below. No need to clear them early.
var response = state.CurrentResponse.ToString();
var responseAlreadyFlushedThisTurn = WasResponseAlreadyFlushedThisTurn(state, response);
if (!string.IsNullOrWhiteSpace(response))
@@ -1640,29 +1629,18 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
// Clear IsProcessing BEFORE completing the TCS — if the continuation runs
// synchronously (e.g., in orchestrator reflection loops), the next SendPromptAsync
// call must see IsProcessing=false or it throws "already processing".
- state.CurrentResponse.Clear();
- state.FlushedResponse.Clear();
- ClearFlushedReplayDedup(state);
- state.PendingReasoningMessages.Clear();
- // Accumulate API time before clearing ProcessingStartedAt
- if (state.Info.ProcessingStartedAt is { } started)
- {
- state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - started).TotalSeconds;
- state.Info.PremiumRequestsUsed++;
- }
- state.AllowTurnStartRearm = true; // session.idle/turn-end completion can be premature; allow one late TurnStart recovery
- state.Info.IsProcessing = false;
- state.Info.IsResumed = false;
- Interlocked.Exchange(ref state.SendingFlag, 0); // Release atomic send lock
- state.Info.ConsecutiveStuckCount = 0;
- // A successful completion proves the server is healthy — reset the
+ ClearProcessingState(state);
+ // Success-only: allow EVT-REARM to re-arm IsProcessing if a late TurnStart arrives
+ // (premature session.idle recovery). This must be set AFTER ClearProcessingState to
+ // avoid the race where a background TurnStart thread reads AllowTurnStartRearm=true
+ // before error/abort callers can override it back to false.
+ state.AllowTurnStartRearm = true;
+ // Success-only: a successful completion proves the server is healthy — reset the
// service-level watchdog timeout counter to prevent false recovery triggers.
Interlocked.Exchange(ref _consecutiveWatchdogTimeouts, 0);
- state.Info.ProcessingStartedAt = null;
- state.Info.ToolCallCount = 0;
- state.Info.ProcessingPhase = 0;
- state.Info.ClearPermissionDenials();
- state.Info.LastUpdatedAt = DateTime.Now;
+ // Success-only: a successful response proves the session is not stuck — reset the
+ // per-session consecutive stuck counter so the >= 3 threshold can re-accumulate.
+ state.Info.ConsecutiveStuckCount = 0;
state.ResponseCompletion?.TrySetResult(fullResponse);
// Fire completion notification BEFORE OnStateChanged — this ensures
@@ -2337,7 +2315,6 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s
if (state.IsOrphaned) return;
CancelToolHealthCheck(state);
CancelProcessingWatchdog(state);
- CancelTurnEndFallback(state);
var activeTools = Volatile.Read(ref state.ActiveToolCallCount);
var recoveryGeneration = Interlocked.Read(ref state.ProcessingGeneration);
@@ -2352,16 +2329,8 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s
OnError?.Invoke(sessionName, $"Tool execution stuck ({reason}). Session recovered automatically.");
- // Full cleanup mirroring CompleteResponse — missing fields here caused stuck sessions
- Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
- state.HasUsedToolsThisTurn = false;
- ClearDeferredIdleTracking(state);
state.FallbackCanceledByTurnStart = false;
- Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
- Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
- Interlocked.Exchange(ref state.EventCountThisTurn, 0);
- Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
// Build full response: flushed mid-turn text + remaining current text
var response = state.CurrentResponse.ToString();
@@ -2371,18 +2340,11 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s
: state.FlushedResponse + "\n\n" + response)
: response;
- state.CurrentResponse.Clear();
- state.FlushedResponse.Clear();
- state.PendingReasoningMessages.Clear();
-
- state.Info.IsProcessing = false;
+ // Accumulate API time but don't count as premium request (recovery, not success)
+ if (state.Info.ProcessingStartedAt is { } healthStarted)
+ state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - healthStarted).TotalSeconds;
+ ClearProcessingState(state, accumulateApiTime: false);
state.AllowTurnStartRearm = false; // Explicit tool-health recovery should stay completed
- state.Info.IsResumed = false;
- Interlocked.Exchange(ref state.SendingFlag, 0);
- state.Info.ProcessingStartedAt = null;
- state.Info.ToolCallCount = 0;
- state.Info.ProcessingPhase = 0;
- state.Info.ClearPermissionDenials();
state.ResponseCompletion?.TrySetResult(fullResponse);
@@ -2961,33 +2923,19 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
return;
}
CancelProcessingWatchdog(state);
- CancelToolHealthCheck(state);
- Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
- state.HasUsedToolsThisTurn = false;
- ClearDeferredIdleTracking(state);
- Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
- Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
- Interlocked.Exchange(ref state.EventCountThisTurn, 0);
- Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
- // Cancel any pending TurnEnd→Idle fallback
- CancelTurnEndFallback(state);
- state.Info.IsResumed = false;
- state.IsReconnectedSend = false; // INV-1: clear all per-turn flags on termination
// Flush any accumulated partial response before clearing processing state.
// Wrapped in try-catch: if flush fails, IsProcessing MUST still be cleared
// (otherwise the session is permanently stuck — the watchdog has already exited).
try { FlushCurrentResponse(state); }
catch (Exception flushEx) { Debug($"[WATCHDOG] '{sessionName}' flush failed during kill: {flushEx.Message}"); }
Debug($"[WATCHDOG] '{sessionName}' IsProcessing=false — watchdog timeout after {totalProcessingSeconds:F0}s total, elapsed={elapsed:F0}s, exceededMaxTime={exceededMaxTime}");
- state.Info.IsProcessing = false;
- state.AllowTurnStartRearm = false; // Watchdog timeout is an explicit forced stop
- Interlocked.Exchange(ref state.SendingFlag, 0);
+ // Capture flushed response BEFORE ClearProcessingState clears it
+ var watchdogResponse = state.FlushedResponse.ToString();
+ // Accumulate API time (request was in-flight) but don't count as premium request
if (state.Info.ProcessingStartedAt is { } wdStarted)
state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - wdStarted).TotalSeconds;
- state.Info.ProcessingStartedAt = null;
- state.Info.ToolCallCount = 0;
- state.Info.ProcessingPhase = 0;
- state.Info.ClearPermissionDenials(); // INV-1: clear on all termination paths
+ ClearProcessingState(state, accumulateApiTime: false);
+ state.AllowTurnStartRearm = false; // Watchdog timeout is an explicit forced stop
state.Info.ConsecutiveStuckCount++;
// Track service-level consecutive watchdog timeouts. When the
// persistent server's auth token expires, ALL sessions hang silently.
@@ -3024,9 +2972,6 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
// into a session that's in a repeated-stuck cycle.
state.Info.MessageQueue.Clear();
}
- var watchdogResponse = state.FlushedResponse.ToString();
- state.FlushedResponse.Clear();
- state.PendingReasoningMessages.Clear();
state.ResponseCompletion?.TrySetResult(watchdogResponse);
// Fire completion notification so orchestrator loops are unblocked (INV-O4)
OnSessionComplete?.Invoke(sessionName, "[Watchdog] timeout");
@@ -3058,28 +3003,15 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
// Best-effort flush before clearing processing state
try { FlushCurrentResponse(state); }
catch { /* Flush failure must not prevent IsProcessing cleanup */ }
- // INV-1: clear IsProcessing and all 9 companion fields
- state.Info.IsProcessing = false;
+ // Capture response BEFORE ClearProcessingState clears it
+ var crashResponse = state.FlushedResponse.ToString() + state.CurrentResponse.ToString();
+ // Accumulate API time but don't count as premium request
+ if (state.Info.ProcessingStartedAt is { } crashStarted)
+ state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - crashStarted).TotalSeconds;
+ ClearProcessingState(state, accumulateApiTime: false);
state.AllowTurnStartRearm = false; // Watchdog crash cleanup is terminal for this turn
- state.Info.IsResumed = false;
- Interlocked.Exchange(ref state.SendingFlag, 0);
- Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
- state.HasUsedToolsThisTurn = false;
- ClearDeferredIdleTracking(state);
- Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
- Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
- Interlocked.Exchange(ref state.EventCountThisTurn, 0);
- Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
- state.Info.ProcessingStartedAt = null;
- state.Info.ToolCallCount = 0;
- state.Info.ProcessingPhase = 0;
- state.Info.ClearPermissionDenials();
state.Info.ConsecutiveStuckCount++;
Interlocked.Increment(ref _consecutiveWatchdogTimeouts);
- var crashResponse = state.FlushedResponse.ToString() + state.CurrentResponse.ToString();
- state.FlushedResponse.Clear();
- state.CurrentResponse.Clear();
- state.PendingReasoningMessages.Clear();
state.ResponseCompletion?.TrySetResult(crashResponse);
OnSessionComplete?.Invoke(sessionName, "[Watchdog] crash recovery");
OnError?.Invoke(sessionName, "Internal error in session monitoring. Try sending your message again.");
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index 698ac072b..b8c790b49 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -42,20 +42,19 @@ public partial class CopilotService
/// Shorter than WorkerExecutionTimeout — if a worker is stuck, the orchestrator
/// proceeds with partial results rather than blocking the group forever.