diff --git a/PolyPilot.Tests/ChatExperienceSafetyTests.cs b/PolyPilot.Tests/ChatExperienceSafetyTests.cs index 91de0cc35..3c916ec82 100644 --- a/PolyPilot.Tests/ChatExperienceSafetyTests.cs +++ b/PolyPilot.Tests/ChatExperienceSafetyTests.cs @@ -1051,9 +1051,9 @@ public void CompleteResponse_Source_ClearsSendingFlag() var crIdx = source.IndexOf("private void CompleteResponse(", StringComparison.Ordinal); Assert.True(crIdx > 0); - // Within CompleteResponse, SendingFlag must be cleared (may be 80+ lines into method) - var afterCR = source.Substring(crIdx, Math.Min(6000, source.Length - crIdx)); - Assert.Contains("SendingFlag", afterCR); + // CompleteResponse must call ClearProcessingState (which clears SendingFlag along with all other fields) + var afterCR = source.Substring(crIdx, Math.Min(10000, source.Length - crIdx)); + Assert.Contains("ClearProcessingState", afterCR); } /// @@ -1246,4 +1246,113 @@ public async Task FlushCurrentResponse_WritesToChatDatabase() Assert.Equal("test-db-session-id", lastDbMsg.SessionId); Assert.Contains("Content for database", lastDbMsg.Message.Content); } + + /// + /// PR #531 review finding: ClearProcessingState must NOT set AllowTurnStartRearm=true. + /// Only CompleteResponse (normal-completion path) should allow EVT-REARM. + /// Error/abort paths must not have a race window where a background TurnStart event + /// reads AllowTurnStartRearm=true before the caller can set it back to false. + /// + [Fact] + public void ClearProcessingState_DoesNotSetAllowTurnStartRearm() + { + // Structural: ClearProcessingState method body must NOT contain AllowTurnStartRearm = true + var svcPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"); + var source = File.ReadAllText(svcPath); + + var methodIdx = source.IndexOf("private void ClearProcessingState(", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "ClearProcessingState must exist"); + var methodEnd = source.IndexOf("\n }", methodIdx + 1, StringComparison.Ordinal); + var methodBody = source.Substring(methodIdx, methodEnd - methodIdx); + + // Strip comment lines before checking — comments may document the invariant using + // the very string we're guarding against appearing in actual code. + var codeOnly = string.Join("\n", methodBody.Split('\n') + .Where(l => !l.TrimStart().StartsWith("//"))); + + Assert.False(codeOnly.Contains("AllowTurnStartRearm = true", StringComparison.Ordinal), + "ClearProcessingState must NOT set AllowTurnStartRearm=true — only CompleteResponse should, " + + "to avoid a race where error/abort paths get it briefly set before they can override to false."); + } + + /// + /// PR #531 review finding: _consecutiveWatchdogTimeouts reset must NOT be in + /// ClearProcessingState. It is a success-only signal (healthy server) — resetting it + /// on error/abort paths defeats the server-recovery detection threshold. + /// + [Fact] + public void ClearProcessingState_DoesNotResetConsecutiveWatchdogTimeouts() + { + var svcPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"); + var source = File.ReadAllText(svcPath); + + var methodIdx = source.IndexOf("private void ClearProcessingState(", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "ClearProcessingState must exist"); + var methodEnd = source.IndexOf("\n }", methodIdx + 1, StringComparison.Ordinal); + var methodBody = source.Substring(methodIdx, methodEnd - methodIdx); + + // Strip comment lines — comments may reference the counter by name while documenting why it's excluded. + var codeOnly = string.Join("\n", methodBody.Split('\n') + .Where(l => !l.TrimStart().StartsWith("//"))); + + Assert.False(codeOnly.Contains("_consecutiveWatchdogTimeouts", StringComparison.Ordinal), + "ClearProcessingState must NOT reset _consecutiveWatchdogTimeouts — only CompleteResponse " + + "(success path) should, since resetting on error/abort paths defeats server-recovery detection."); + + Assert.False(codeOnly.Contains("ConsecutiveStuckCount", StringComparison.Ordinal), + "ClearProcessingState must NOT reset ConsecutiveStuckCount — only CompleteResponse " + + "(success path) should, since resetting on watchdog/error paths breaks the >= 3 " + + "threshold that stops system message accumulation in repeatedly-stuck sessions."); + } + + /// + /// PR #531 review finding: CompleteResponse must set AllowTurnStartRearm=true, + /// reset _consecutiveWatchdogTimeouts, and reset ConsecutiveStuckCount after ClearProcessingState. + /// + [Fact] + public void CompleteResponse_SetsAllowTurnStartRearmAndResetsWatchdogCounter() + { + var eventsPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"); + var source = File.ReadAllText(eventsPath); + + var methodIdx = source.IndexOf("private void CompleteResponse(", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "CompleteResponse must exist"); + // Use 10000 chars — the method is ~112 lines before AllowTurnStartRearm, ~6400 chars in. + var methodBody = source.Substring(methodIdx, Math.Min(10000, source.Length - methodIdx)); + + Assert.True(methodBody.Contains("AllowTurnStartRearm = true", StringComparison.Ordinal), + "CompleteResponse must explicitly set AllowTurnStartRearm=true after ClearProcessingState"); + Assert.True(methodBody.Contains("_consecutiveWatchdogTimeouts", StringComparison.Ordinal), + "CompleteResponse must reset _consecutiveWatchdogTimeouts on successful completion"); + Assert.True(methodBody.Contains("ConsecutiveStuckCount = 0", StringComparison.Ordinal), + "CompleteResponse must reset ConsecutiveStuckCount on successful completion"); + } + + /// + /// PR #531 re-review finding: AbortSessionAsync must NOT increment PremiumRequestsUsed. + /// User-initiated aborts consumed server resources (API time) but shouldn't count against + /// the premium request budget. The call must use accumulateApiTime: false and manually + /// accumulate TotalApiTimeSeconds before ClearProcessingState. + /// + [Fact] + public void AbortSessionAsync_DoesNotIncrementPremiumRequestsViaAccumulateApiTime() + { + var svcPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"); + var source = File.ReadAllText(svcPath); + + var methodIdx = source.IndexOf("public async Task AbortSessionAsync(", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "AbortSessionAsync must exist"); + var methodEnd = source.IndexOf("\n }", methodIdx + 1, StringComparison.Ordinal); + var methodBody = source.Substring(methodIdx, methodEnd - methodIdx); + + // The abort path must call ClearProcessingState with accumulateApiTime: false + Assert.True(methodBody.Contains("accumulateApiTime: false", StringComparison.Ordinal), + "AbortSessionAsync must call ClearProcessingState with accumulateApiTime: false — " + + "user aborts should not increment PremiumRequestsUsed"); + + // It must also manually accumulate TotalApiTimeSeconds (request was consumed) + Assert.True(methodBody.Contains("TotalApiTimeSeconds", StringComparison.Ordinal), + "AbortSessionAsync must manually accumulate TotalApiTimeSeconds before ClearProcessingState — " + + "the request consumed server resources even though the user aborted"); + } } diff --git a/PolyPilot.Tests/ConsecutiveStuckSessionTests.cs b/PolyPilot.Tests/ConsecutiveStuckSessionTests.cs index bd9ea36ea..24a232f1f 100644 --- a/PolyPilot.Tests/ConsecutiveStuckSessionTests.cs +++ b/PolyPilot.Tests/ConsecutiveStuckSessionTests.cs @@ -187,10 +187,16 @@ public void WatchdogSource_ClearsMessageQueueOnRepeatedStucks() public void CompleteResponseSource_ResetsConsecutiveStuckCount() { // Verify CompleteResponse resets the stuck counter on success. + // ConsecutiveStuckCount = 0 must only appear in CompleteResponse (success path), + // NOT in ClearProcessingState (which would break accumulation on watchdog timeouts). var repoRoot = GetRepoRoot(); var eventsSource = File.ReadAllText(Path.Combine(repoRoot, "PolyPilot", "Services", "CopilotService.Events.cs")); - Assert.Contains("ConsecutiveStuckCount = 0", eventsSource); + // Must be in CompleteResponse + var crIdx = eventsSource.IndexOf("private void CompleteResponse(", StringComparison.Ordinal); + Assert.True(crIdx >= 0, "CompleteResponse must exist"); + var crBody = eventsSource.Substring(crIdx, Math.Min(10000, eventsSource.Length - crIdx)); + Assert.Contains("ConsecutiveStuckCount = 0", crBody); } // ========================================================================= diff --git a/PolyPilot.Tests/MultiAgentRegressionTests.cs b/PolyPilot.Tests/MultiAgentRegressionTests.cs index 30460286f..b0514d22b 100644 --- a/PolyPilot.Tests/MultiAgentRegressionTests.cs +++ b/PolyPilot.Tests/MultiAgentRegressionTests.cs @@ -2450,17 +2450,6 @@ public void LongRunningOrchestrator_UserFollowup_MustQueue() #region Premature Idle Recovery Tests (PR #375 — SDK bug #299) - [Fact] - public void PrematureIdleDetectionWindowMs_IsReasonable() - { - // The detection window must be long enough for EVT-REARM to fire on the UI thread - // after the premature idle, but short enough not to delay normal completions excessively. - Assert.True(CopilotService.PrematureIdleDetectionWindowMs >= 3000, - "Detection window must be >= 3s to allow UI thread EVT-REARM dispatch"); - Assert.True(CopilotService.PrematureIdleDetectionWindowMs <= 10_000, - "Detection window must be <= 10s to avoid excessive delay on normal completions"); - } - [Fact] public void PrematureIdleRecoveryTimeoutMs_IsReasonable() { @@ -2510,7 +2499,7 @@ public void PrematureIdleSignal_ResetInSendPromptAsync() var sendIdx = source.IndexOf("async Task SendPromptAsync(", StringComparison.Ordinal); Assert.True(sendIdx >= 0, "SendPromptAsync must exist in CopilotService.cs"); - var sendBlock = source.Substring(sendIdx, Math.Min(6000, source.Length - sendIdx)); + var sendBlock = source.Substring(sendIdx, Math.Min(8000, source.Length - sendIdx)); Assert.Contains("PrematureIdleSignal.Reset()", sendBlock); } @@ -2560,7 +2549,7 @@ public void RecoverFromPrematureIdleIfNeededAsync_SubscribesToOnSessionComplete( // Find the method definition (not a call site) var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync method definition must exist"); - var methodBlock = source.Substring(methodIdx, Math.Min(8000, source.Length - methodIdx)); + var methodBlock = source.Substring(methodIdx, Math.Min(12000, source.Length - methodIdx)); Assert.Contains("OnSessionComplete +=", methodBlock); Assert.Contains("OnSessionComplete -=", methodBlock); // Must unsubscribe in finally @@ -2619,6 +2608,68 @@ public void MutationBeforeCommit_SessionIdSetAfterTryUpdate() "SessionId must be assigned AFTER TryUpdate succeeds (mutation-after-commit pattern)"); } + [Fact] + public void Revival_SetsRecoveredFromSessionId_BeforeUpdatingSessionId() + { + // Structural: revival must set RecoveredFromSessionId on the Info object BEFORE + // updating SessionId. This ensures MergeSessionEntries can identify that the new + // session explicitly replaced the old one and drop the old "(previous)" phantom entry. + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + var tryUpdateIdx = source.IndexOf("TryUpdate(workerName, freshState, deadState)", StringComparison.Ordinal); + Assert.True(tryUpdateIdx >= 0, "TryUpdate call must exist"); + + var recoveredFromIdx = source.IndexOf("deadState.Info.RecoveredFromSessionId ??=", StringComparison.Ordinal); + Assert.True(recoveredFromIdx >= 0, "Revival must set RecoveredFromSessionId after TryUpdate"); + Assert.True(recoveredFromIdx > tryUpdateIdx, "RecoveredFromSessionId must be set after TryUpdate"); + + var sessionIdAssign = source.IndexOf("deadState.Info.SessionId = freshSession.SessionId", StringComparison.Ordinal); + Assert.True(recoveredFromIdx < sessionIdAssign, + "RecoveredFromSessionId must be set BEFORE SessionId is updated (so the old ID is correctly recorded)"); + } + + [Fact] + public void Revival_AddsOldSessionIdToClosedSet_PreventingPhantomPreviousEntry() + { + // Structural: when worker revival creates a fresh session, it must add the OLD + // session ID to _closedSessionIds. Without this, SaveActiveSessionsToDisk's merge + // sees the old ID in persisted active-sessions.json and creates a "(previous)" entry. + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + var tryUpdateIdx = source.IndexOf("TryUpdate(workerName, freshState, deadState)", StringComparison.Ordinal); + Assert.True(tryUpdateIdx >= 0, "TryUpdate call must exist"); + + // Find the else block after TryUpdate (the success path) + var successBlock = source.Substring(tryUpdateIdx, Math.Min(3000, source.Length - tryUpdateIdx)); + + Assert.Contains("_closedSessionIds[deadSessionId] = 0", + successBlock, + StringComparison.Ordinal); + } + + [Fact] + public void Revival_CapturesOldSessionIdBeforeOrphaning() + { + // Structural: the old session ID must be captured BEFORE marking the state as orphaned + // and disposing the session. After orphaning, Info.SessionId might be mutated by + // concurrent code. The captured ID is what gets added to _closedSessionIds. + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + // Find the revival section + var revivalStart = source.IndexOf("Worker revival: empty response", StringComparison.Ordinal); + Assert.True(revivalStart >= 0, "Revival comment must exist"); + var revivalBlock = source.Substring(revivalStart, Math.Min(2000, source.Length - revivalStart)); + + // Old session ID must be captured before IsOrphaned = true + var captureIdx = revivalBlock.IndexOf("var deadSessionId = deadState.Info.SessionId", StringComparison.Ordinal); + var orphanIdx = revivalBlock.IndexOf("deadState.IsOrphaned = true", StringComparison.Ordinal); + Assert.True(captureIdx >= 0, "Old session ID must be captured before orphaning"); + Assert.True(captureIdx < orphanIdx, "Session ID capture must come before IsOrphaned = true"); + } + [Fact] public void RecoverFromPrematureIdleIfNeededAsync_UsesEventsFileFreshness() { @@ -2630,7 +2681,7 @@ public void RecoverFromPrematureIdleIfNeededAsync_UsesEventsFileFreshness() var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync method definition must exist"); - var methodBlock = source.Substring(methodIdx, Math.Min(8000, source.Length - methodIdx)); + var methodBlock = source.Substring(methodIdx, Math.Min(12000, source.Length - methodIdx)); Assert.Contains("IsEventsFileActive", methodBlock); } @@ -2661,7 +2712,7 @@ public void RecoverFromPrematureIdleIfNeededAsync_LoopsOnRepeatedPrematureIdle() var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync method definition must exist"); - var methodBlock = source.Substring(methodIdx, Math.Min(8000, source.Length - methodIdx)); + var methodBlock = source.Substring(methodIdx, Math.Min(12000, source.Length - methodIdx)); // Must have a loop for repeated premature idle rounds Assert.Contains("while (", methodBlock); @@ -2684,6 +2735,18 @@ public void PrematureIdleEventsFileFreshnessSeconds_ConstantExists() Assert.True(constIdx >= 0, "Must be an internal const int"); } + [Fact] + public void PrematureIdleEventsSettleMs_ConstantExists() + { + // Settle period constant: time to wait before taking the stable baseline snapshot. + // Must be > 0 (need time for OS to flush mtime) and < GracePeriodMs (must leave + // observation window after settle). + Assert.True(CopilotService.PrematureIdleEventsSettleMs > 0, + "Settle must be > 0ms"); + Assert.True(CopilotService.PrematureIdleEventsSettleMs < CopilotService.PrematureIdleEventsGracePeriodMs, + "Settle must be < GracePeriodMs to leave an observation window"); + } + [Fact] public void PrematureIdleEventsGracePeriodMs_ConstantExists() { @@ -2719,34 +2782,60 @@ public void GetEventsFileMtime_HelperExists() [Fact] public void RecoverFromPrematureIdleIfNeededAsync_UsesMtimeComparisonForInitialDetection() { - // Structural: instead of raw IsEventsFileActive (which sees the idle event's own write - // as "fresh" and false-positives), the method must snapshot mtime, wait the grace period, - // then compare mtimes. Only a changed mtime proves the CLI is still writing. + // Structural: two-phase mtime check avoids OS mtime-flush false positives. + // Phase 1: settle (500ms) to let OS flush the completing write's mtime update + // Phase 2: observe (1500ms) to see if MORE writes happen (genuine premature idle) + // Only writes AFTER the settle baseline trigger detection. var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); var source = File.ReadAllText(orgPath); var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync must exist"); - var methodBlock = source.Substring(methodIdx, Math.Min(4000, source.Length - methodIdx)); + var methodBlock = source.Substring(methodIdx, Math.Min(5000, source.Length - methodIdx)); // Must use mtime comparison in the detection phase Assert.Contains("GetEventsFileMtime", methodBlock); - Assert.Contains("PrematureIdleEventsGracePeriodMs", methodBlock); Assert.Contains("stableMtime", methodBlock); + Assert.Contains("endMtime", methodBlock); - // The grace period delay must appear before the stableMtime assignment - var delayIdx = methodBlock.IndexOf("PrematureIdleEventsGracePeriodMs", StringComparison.Ordinal); + // Two-phase: settle delay must precede the stableMtime (baseline) assignment + var settleIdx = methodBlock.IndexOf("PrematureIdleEventsSettleMs", StringComparison.Ordinal); var assignIdx = methodBlock.IndexOf("stableMtime = GetEventsFileMtime", StringComparison.Ordinal); - Assert.True(assignIdx >= 0, "stableMtime must be assigned from GetEventsFileMtime after the delay"); - Assert.True(delayIdx < assignIdx, "Grace period delay must precede stable-mtime assignment"); + Assert.True(settleIdx >= 0, "PrematureIdleEventsSettleMs constant must appear in the method"); + Assert.True(assignIdx >= 0, "stableMtime must be assigned from GetEventsFileMtime after the settle delay"); + Assert.True(settleIdx < assignIdx, "Settle delay must precede stable-mtime baseline assignment"); + + // The observation comparison must use endMtime > stableMtime + Assert.Contains("endMtime.HasValue && endMtime.Value > (stableMtime", methodBlock); + } + + [Fact] + public void RecoverFromPrematureIdleIfNeededAsync_GracePeriodSkipsReconnectWrites() + { + // Structural: the grace period mtime check must guard against reconnect writes. + // When the app restarts mid-orchestration, EnsureSessionConnectedAsync writes + // session.resume to events.jsonl — this advances mtime but is NOT premature idle. + // The fix: after detecting mtime changed, check last event type and skip if it's + // session.resume or session.shutdown (terminal/reconnect events). + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync method definition must exist"); + var methodBlock = source.Substring(methodIdx, Math.Min(6000, source.Length - methodIdx)); + + // Must use GetLastEventType to check the content, not just raw mtime delta + Assert.Contains("GetLastEventType", methodBlock); + // Must skip session.resume writes (reconnect scenario) + Assert.Contains("session.resume", methodBlock); } [Fact] - public void RecoverFromPrematureIdleIfNeededAsync_PollingLoopUsesMtimeComparison() + public void RecoverFromPrematureIdleIfNeededAsync_UsesTwoPhaseMtimeCheck() { - // Structural: the secondary polling loop must also use mtime comparison (not raw - // IsEventsFileActive) so that a stale-but-fresh file doesn't trigger false detection - // in subsequent poll cycles. + // Structural: the method must use the two-phase approach — settle period then + // snapshot stableMtime, then observe — so OS-flush false positives are avoided. + // After the two-phase check returns not-detected, it returns immediately (no polling loop). var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); var source = File.ReadAllText(orgPath); @@ -2754,11 +2843,11 @@ public void RecoverFromPrematureIdleIfNeededAsync_PollingLoopUsesMtimeComparison Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync must exist"); var methodBlock = source.Substring(methodIdx, Math.Min(5000, source.Length - methodIdx)); - // The polling loop must compare currentMtime against stableMtime - Assert.Contains("currentMtime", methodBlock); + // Must use stableMtime (post-settle baseline) and endMtime (post-observe) Assert.Contains("stableMtime", methodBlock); + Assert.Contains("endMtime", methodBlock); - // Both GetEventsFileMtime calls should appear in the method + // Both GetEventsFileMtime calls should appear (settle snapshot + observe snapshot) var calls = 0; var searchFrom = 0; while (true) @@ -2768,7 +2857,11 @@ public void RecoverFromPrematureIdleIfNeededAsync_PollingLoopUsesMtimeComparison calls++; searchFrom = idx + 1; } - Assert.True(calls >= 2, $"GetEventsFileMtime must be called at least twice (grace + polling), found {calls}"); + Assert.True(calls >= 2, $"GetEventsFileMtime must be called at least twice (settle + observe), found {calls}"); + + // Must NOT have a secondary polling loop (the while loop that kept looping for 10s) + // because it added 10s of delay to every normal worker completion. + Assert.DoesNotContain("detectStart", methodBlock); } #endregion diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index 0985417ae..6f730752f 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -2735,20 +2735,16 @@ public void WatchdogCatchBlock_ClearsIsProcessing_InSource() Assert.True(watchdogBody.Contains("clearing IsProcessing after watchdog crash"), "Watchdog catch block must log that it is clearing IsProcessing on crash"); - // The crash recovery must clear all INV-1 companion fields - // Search the full catch block (after the [WATCHDOG-CRASH] tag) for required patterns + // The crash recovery must call ClearProcessingState (which atomically clears all + // companion fields: IsProcessing, SendingFlag, ProcessingStartedAt, etc.) var crashIdx = watchdogBody.IndexOf("[WATCHDOG-CRASH]"); var crashBlock = watchdogBody.Substring(crashIdx); - Assert.True(crashBlock.Contains("IsProcessing = false"), - "Watchdog crash recovery must set IsProcessing = false"); - Assert.True(crashBlock.Contains("SendingFlag"), - "Watchdog crash recovery must clear SendingFlag (INV-1)"); - Assert.True(crashBlock.Contains("ProcessingStartedAt = null"), - "Watchdog crash recovery must clear ProcessingStartedAt (INV-1)"); - Assert.True(crashBlock.Contains("ProcessingPhase = 0"), - "Watchdog crash recovery must clear ProcessingPhase (INV-1)"); - Assert.True(crashBlock.Contains("ClearPermissionDenials"), - "Watchdog crash recovery must clear permission denials (INV-1)"); + Assert.True(crashBlock.Contains("ClearProcessingState(state"), + "Watchdog crash recovery must call ClearProcessingState to atomically clear all companion fields"); + Assert.True(crashBlock.Contains("AllowTurnStartRearm = false"), + "Watchdog crash recovery must set AllowTurnStartRearm = false (terminal forced stop)"); + Assert.True(crashBlock.Contains("ConsecutiveStuckCount++"), + "Watchdog crash recovery must increment ConsecutiveStuckCount"); } [Fact] diff --git a/PolyPilot.Tests/RenderThrottleTests.cs b/PolyPilot.Tests/RenderThrottleTests.cs index eae7d1f4c..1f6ac33e8 100644 --- a/PolyPilot.Tests/RenderThrottleTests.cs +++ b/PolyPilot.Tests/RenderThrottleTests.cs @@ -214,18 +214,18 @@ public void CompleteResponse_OnSessionComplete_FiresBeforeOnStateChanged() var methodStart = source.IndexOf("private void CompleteResponse(", StringComparison.Ordinal); Assert.True(methodStart >= 0, "CompleteResponse method not found"); - // The critical ordering is AFTER IsProcessing = false (the main completion path). + // The critical ordering is AFTER ClearProcessingState (which sets IsProcessing=false). // Use the preceding comment as anchor to find the right occurrence. var anchor = source.IndexOf("// Clear IsProcessing BEFORE completing the TCS", methodStart, StringComparison.Ordinal); Assert.True(anchor >= 0, "CompleteResponse TCS comment not found"); - var isProcessingFalse = source.IndexOf("state.Info.IsProcessing = false;", anchor, StringComparison.Ordinal); - Assert.True(isProcessingFalse >= 0, "IsProcessing = false not found in CompleteResponse main path"); + var clearProcessing = source.IndexOf("ClearProcessingState(state)", anchor, StringComparison.Ordinal); + Assert.True(clearProcessing >= 0, "ClearProcessingState(state) not found in CompleteResponse main path"); - var afterProcessing = source.Substring(isProcessingFalse, 1200); + var afterProcessing = source.Substring(clearProcessing, 2000); var completeIdx = afterProcessing.IndexOf("OnSessionComplete?", StringComparison.Ordinal); var stateIdx = afterProcessing.IndexOf("OnStateChanged?", StringComparison.Ordinal); - Assert.True(completeIdx >= 0, "OnSessionComplete not found after IsProcessing=false in CompleteResponse"); - Assert.True(stateIdx >= 0, "OnStateChanged not found after IsProcessing=false in CompleteResponse"); + Assert.True(completeIdx >= 0, "OnSessionComplete not found after ClearProcessingState in CompleteResponse"); + Assert.True(stateIdx >= 0, "OnStateChanged not found after ClearProcessingState in CompleteResponse"); Assert.True(completeIdx < stateIdx, "OnSessionComplete must fire BEFORE OnStateChanged in CompleteResponse. " + "HandleComplete populates completedSessions which RefreshState checks to bypass throttle."); diff --git a/PolyPilot.Tests/ScenarioReferenceTests.cs b/PolyPilot.Tests/ScenarioReferenceTests.cs index c14c1abb9..cf596df40 100644 --- a/PolyPilot.Tests/ScenarioReferenceTests.cs +++ b/PolyPilot.Tests/ScenarioReferenceTests.cs @@ -59,18 +59,27 @@ public void ScenarioFiles_StepsHaveValidActions() var validActions = new HashSet { "assertAllSessionsReceived", "assertAllSessionsResponded", "assertAllWorkers", - "assertDirectoryExists", "assertEqual", "assertEvaluatorWasUsed", "assertFileContains", - "assertFileExists", "assertGroupMembership", "assertNoDirectoryContains", + "assertDirectoryExists", "assertEqual", "assertEvaluatorWasUsed", "assertEventLog", + "assertFileContains", "assertFileExists", "assertGroupExists", "assertGroupMembership", + "assertInputAvailable", "assertNoDirectoryContains", "assertNoNewEvents", "assertNoOverlap", "assertNoPresetInSection", "assertNoReflectionLoop", "assertNoSessionsInDefault", "assertNoSessionsWithGroupId", "assertOrchestratorReceivedRoutingContext", "assertOrchestratorReceivedWorkerDescriptions", "assertOrchestratorSynthesized", - "assertOrgJson", "assertPresetInSection", "assertPresetVisible", "assertReflectionPaused", - "assertReflectionState", "assertSessionMeta", "assertWorkerPromptContains", - "captureGroupState", "click", "createGroup", "createGroupFromPreset", "createSquadDir", - "deleteGroup", "evaluate", "navigate", "note", "pauseReflection", "readOrgJson", - "restartApp", "resumeReflection", "saveGroupAsPreset", "selectWorktree", "sendPrompt", - "setEvaluator", "setMode", "shell", "type", "wait", "waitForAgent", - "waitForAllResponses", "waitForAllSessions", "waitForCompletion", "waitForPhase", + "assertOrgJson", "assertPresetInSection", "assertPresetVisible", "assertProcessing", + "assertReflectionPaused", "assertReflectionState", "assertResponseNotEmpty", + "assertSessionExists", "assertSessionMeta", "assertSessionNotExists", + "assertWorkerPromptContains", + "captureEventLogPosition", "captureGroupState", "captureSessionList", + "click", "clickAbort", "clickCreate", "closeSession", + "createGroup", "createGroupFromPreset", "createSession", "createSquadDir", + "deleteGroup", "evaluate", "navigate", "note", + "openCreateMenu", "pauseReflection", "readOrgJson", "relaunchApp", + "restartApp", "resumeReflection", "saveGroupAsPreset", + "selectOption", "selectPreset", "selectRepo", "selectSession", "selectWorktree", + "sendMessage", "sendPrompt", "sendToGroup", + "setEvaluator", "setMode", "shell", "shellCheck", "switchModel", + "type", "wait", "waitForAgent", "waitForAllResponses", "waitForAllSessions", + "waitForCompletion", "waitForEventPattern", "waitForIdle", "waitForPhase", "screenshot" }; foreach (var file in Directory.GetFiles(ScenariosDir, "*.json")) diff --git a/PolyPilot.Tests/Scenarios/multi-agent-reliability-scenarios.json b/PolyPilot.Tests/Scenarios/multi-agent-reliability-scenarios.json new file mode 100644 index 000000000..4183b793f --- /dev/null +++ b/PolyPilot.Tests/Scenarios/multi-agent-reliability-scenarios.json @@ -0,0 +1,530 @@ +{ + "description": "Multi-agent reliability scenarios for PolyPilot. Tests cover every orchestration lifecycle path: group creation, dispatch, worker execution, synthesis, subsequent orchestrations, follow-up handling, restart recovery, and state hygiene. Complements multi-agent-scenarios.json which covers organization/presets/squad. Execute against a running PolyPilot instance using MauiDevFlow CDP commands.", + "prerequisites": { + "build": "cd PolyPilot && ./relaunch.sh", + "waitForAgent": "maui devflow MAUI status --agent-port 9223", + "initialMode": "Persistent", + "notes": "All test sessions MUST point to the PolyPilot repository. Use 'Implement & Challenge' preset (ignore PR Reviewer). Never commit, push, or create GitHub issues/PRs. DevFlow agent port is 9223. CDP Blazor clicks require PointerEvent dispatch." + }, + "scenarios": [ + { + "id": "create-implement-challenge-group", + "name": "Create Implement & Challenge multi-agent group", + "description": "Create a multi-agent group using the Implement & Challenge preset. Verify orchestrator and workers are created with correct roles.", + "category": "group-creation", + "invariants": [ + "Group appears in sidebar", + "Orchestrator session exists with Role=Orchestrator", + "Worker sessions exist (at least 2)", + "All sessions point to PolyPilot repo" + ], + "steps": [ + { "action": "openCreateMenu" }, + { "action": "selectOption", "text": "Multi-Agent Team" }, + { "action": "selectPreset", "name": "Implement & Challenge" }, + { "action": "selectRepo", "repo": "PureWeen-PolyPilot" }, + { "action": "clickCreate" }, + { "action": "wait", "ms": 5000 }, + { "action": "assertGroupExists", "nameContains": "Implement" }, + { "action": "assertSessionExists", "nameContains": "orchestrator" }, + { "action": "assertSessionExists", "nameContains": "worker" } + ] + }, + { + "id": "group-shows-all-sessions-in-sidebar", + "name": "Multi-agent group shows all sessions in sidebar", + "description": "Verify all group sessions (orchestrator + workers) are visible in the sidebar under the group.", + "category": "group-creation", + "invariants": [ + "All sessions visible under group header", + "Session count matches expected (orchestrator + N workers)" + ], + "steps": [ + { "action": "evaluate", "script": "document.querySelectorAll('[class*=session-list-item]').length", "expect": { "greaterThan": 2 }, "note": "At least orchestrator + 2 workers visible" } + ] + }, + { + "id": "workers-have-assigned-models", + "name": "Workers have model assignments from preset", + "description": "Verify each worker session has its configured model from the preset.", + "category": "group-creation", + "invariants": [ + "Each worker has a non-empty model assignment", + "Model names match preset configuration" + ], + "steps": [ + { "action": "note", "text": "Verify by expanding each worker card and checking model dropdown, or via event log on first dispatch" } + ] + }, + { + "id": "orchestrator-role-correct", + "name": "Orchestrator session has correct metadata", + "description": "Verify the orchestrator session is marked with the correct role and group membership.", + "category": "group-creation", + "invariants": [ + "Orchestrator session exists", + "Session is part of the multi-agent group", + "IsMultiAgentSession = true" + ], + "steps": [ + { "action": "assertSessionExists", "nameContains": "orchestrator" }, + { "action": "assertEventLog", "pattern": "DISPATCH-ROUTE.*orchestrator.*isOrch=True", "expected": true, "note": "Will be verified when first message is sent" } + ] + }, + { + "id": "dispatch-sends-to-all-workers", + "name": "Orchestration dispatches to all workers", + "description": "Send a prompt to the group. Verify the orchestrator dispatches tasks to all workers via event log.", + "category": "dispatch", + "invariants": [ + "Event log shows [DISPATCH-ROUTE] with correct mode", + "Event log shows [SEND] for each worker", + "Event log shows [DISPATCH] with worker names" + ], + "steps": [ + { "action": "sendToGroup", "text": "Add a one-line code comment to the top of README.md explaining what PolyPilot is. Keep it simple." }, + { "action": "wait", "ms": 10000, "note": "Wait for orchestrator to plan and dispatch" }, + { "action": "assertEventLog", "pattern": "\\[DISPATCH-ROUTE\\].*Orchestrator", "expected": true }, + { "action": "assertEventLog", "pattern": "\\[DISPATCH\\].*Dispatching", "expected": true }, + { "action": "assertEventLog", "pattern": "\\[SEND\\].*worker", "expected": true } + ] + }, + { + "id": "dispatch-stagger-delay", + "name": "Workers dispatched with stagger delay", + "description": "Verify workers are dispatched approximately 1 second apart, not all at once.", + "category": "dispatch", + "invariants": [ + "Worker [SEND] timestamps are ~1s apart" + ], + "steps": [ + { "action": "shellCheck", "command": "grep '\\[SEND\\].*worker' ~/.polypilot/event-diagnostics.log | tail -3", "note": "Verify timestamps show staggered dispatch" } + ] + }, + { + "id": "pending-orchestration-saved-during-dispatch", + "name": "PendingOrchestration file created during dispatch", + "description": "During worker execution, verify pending-orchestration.json exists with correct data.", + "category": "dispatch", + "invariants": [ + "File exists at ~/.polypilot/pending-orchestration.json", + "Contains correct group ID and worker names" + ], + "steps": [ + { "action": "shellCheck", "command": "cat ~/.polypilot/pending-orchestration.json 2>/dev/null | head -5", "expect": { "contains": "worker" } } + ] + }, + { + "id": "orchestrator-plans-with-worker-blocks", + "name": "Orchestrator planning includes @worker blocks", + "description": "Verify the orchestrator's response contains @worker task assignments.", + "category": "dispatch", + "invariants": [ + "Event log shows 'Early dispatch: @worker blocks detected'", + "Assignments parsed from orchestrator response" + ], + "steps": [ + { "action": "assertEventLog", "pattern": "Early dispatch.*@worker", "expected": true } + ] + }, + { + "id": "workers-process-independently", + "name": "Workers process their tasks independently", + "description": "Monitor workers during execution. Each should cycle through TurnStart/TurnEnd events independently.", + "category": "worker-execution", + "invariants": [ + "Each worker has AssistantTurnStartEvent and AssistantTurnEndEvent in event log", + "Workers process concurrently" + ], + "steps": [ + { "action": "waitForEventPattern", "pattern": "\\[EVT\\].*worker.*TurnStart", "timeout": 30, "note": "Wait for workers to start processing" }, + { "action": "assertEventLog", "pattern": "\\[EVT\\].*worker.*TurnEndEvent", "expected": true, "note": "Workers cycling through events" } + ] + }, + { + "id": "worker-tool-use-completes", + "name": "Workers using tools complete without getting stuck", + "description": "Workers often use tools (grep, file reads). Verify tool execution completes.", + "category": "worker-execution", + "invariants": [ + "Workers complete (SessionIdleEvent or CompleteResponse)", + "No workers stuck with HasUsedToolsThisTurn=true indefinitely" + ], + "steps": [ + { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*worker", "timeout": 600, "note": "Wait for at least one worker to complete" } + ] + }, + { + "id": "worker-subagent-idle-defer", + "name": "Worker IDLE-DEFER fires and resolves for subagents", + "description": "If a worker dispatches sub-agents (task tool), IDLE-DEFER should fire and eventually resolve.", + "category": "worker-execution", + "invariants": [ + "If [IDLE-DEFER] appears, it's eventually followed by [COMPLETE]", + "Worker doesn't stay stuck in IDLE-DEFER forever" + ], + "steps": [ + { "action": "note", "text": "This scenario is best verified by checking event log after orchestration completes" }, + { "action": "shellCheck", "command": "defer_count=$(grep -c '\\[IDLE-DEFER\\]' ~/.polypilot/event-diagnostics.log); complete_count=$(grep -c '\\[COMPLETE\\]' ~/.polypilot/event-diagnostics.log); echo \"defers=$defer_count completes=$complete_count\"" } + ] + }, + { + "id": "worker-timeout-handled-gracefully", + "name": "Worker timeout produces error result, doesn't block orchestration", + "description": "If a worker times out (>10 min), the orchestration should still continue with an error result.", + "category": "worker-execution", + "invariants": [ + "Orchestration doesn't hang forever waiting for a dead worker", + "Synthesis includes failure message for timed-out worker" + ], + "steps": [ + { "action": "note", "text": "Difficult to force a timeout in live testing. Verify by checking event log for any [WATCHDOG] entries on workers and confirming orchestration still completed." } + ] + }, + { + "id": "worker-error-doesnt-block-orchestration", + "name": "One worker error doesn't block the whole orchestration", + "description": "If one worker fails, the other worker should still complete and synthesis should include the failure.", + "category": "worker-execution", + "invariants": [ + "Orchestration completes even if one worker errored", + "Synthesis prompt includes error message for failed worker" + ], + "steps": [ + { "action": "note", "text": "Verify from event log: if any [ERROR] entries exist for workers, check that [DISPATCH] Collected results still fires" } + ] + }, + { + "id": "synthesis-collects-all-worker-results", + "name": "Synthesis includes all worker responses", + "description": "After workers complete, verify the synthesis prompt is sent to the orchestrator with all results.", + "category": "synthesis", + "invariants": [ + "Event log shows 'Collected N/N worker results for synthesis'", + "All workers accounted for" + ], + "steps": [ + { "action": "waitForEventPattern", "pattern": "Collected.*worker results for synthesis", "timeout": 600 }, + { "action": "assertEventLog", "pattern": "Collected.*worker results", "expected": true } + ] + }, + { + "id": "synthesis-completes-orchestrator", + "name": "Orchestrator completes after synthesis", + "description": "After synthesis prompt is sent, the orchestrator should process it and complete.", + "category": "synthesis", + "invariants": [ + "Orchestrator shows [SEND] for synthesis prompt", + "Orchestrator shows [COMPLETE] after synthesis", + "IsProcessing = false on orchestrator" + ], + "steps": [ + { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 120, "note": "Wait for synthesis completion" }, + { "action": "assertProcessing", "sessionContains": "orchestrator", "expected": false } + ] + }, + { + "id": "pending-orchestration-cleared-after-synthesis", + "name": "PendingOrchestration file cleared after synthesis", + "description": "After orchestration completes, pending-orchestration.json should be empty or deleted.", + "category": "synthesis", + "invariants": [ + "File is empty, contains '{}', or doesn't exist" + ], + "steps": [ + { "action": "shellCheck", "command": "cat ~/.polypilot/pending-orchestration.json 2>/dev/null || echo 'file not found'", "expect": { "oneOf": ["file not found", "{}", "null", ""] } } + ] + }, + { + "id": "all-workers-idle-after-orchestration", + "name": "All workers have IsProcessing=false after orchestration", + "description": "After orchestration completes, verify every worker session is idle.", + "category": "synthesis", + "invariants": [ + "Every worker session has IsProcessing = false", + "No stop buttons visible for any worker" + ], + "steps": [ + { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "No sessions processing" } + ] + }, + { + "id": "no-orphaned-workers-after-orchestration", + "name": "No orphaned worker state after orchestration", + "description": "After orchestration, no workers should be stuck in a partial processing state.", + "category": "synthesis", + "invariants": [ + "No worker [SEND] without corresponding [COMPLETE]", + "No stuck TCS (TaskCompletionSource)" + ], + "steps": [ + { "action": "shellCheck", "command": "for w in 1 2; do sends=$(grep -c \"\\[SEND\\].*worker-$w\" ~/.polypilot/event-diagnostics.log); completes=$(grep -c \"\\[COMPLETE\\].*worker-$w\" ~/.polypilot/event-diagnostics.log); echo \"worker-$w: sends=$sends completes=$completes\"; done" } + ] + }, + { + "id": "second-orchestration-works", + "name": "Second orchestration to same group completes", + "description": "After one orchestration completes, send a new prompt to the same group. Verify full lifecycle works again.", + "category": "subsequent-orchestrations", + "invariants": [ + "Second dispatch fires", + "Workers process again", + "Synthesis completes again", + "IsProcessing = false on all sessions" + ], + "steps": [ + { "action": "sendToGroup", "text": "Now add a comment to the bottom of README.md saying 'Built with GitHub Copilot'. Keep it simple." }, + { "action": "waitForEventPattern", "pattern": "\\[DISPATCH\\].*Dispatching", "timeout": 60, "note": "Wait for second dispatch" }, + { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600, "note": "Wait for second orchestration to complete" }, + { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 } + ] + }, + { + "id": "third-orchestration-no-degradation", + "name": "Third orchestration shows no accumulated state issues", + "description": "A third consecutive orchestration should work as cleanly as the first.", + "category": "subsequent-orchestrations", + "invariants": [ + "No accumulated stuck state from previous orchestrations", + "Event log clean for this orchestration" + ], + "steps": [ + { "action": "sendToGroup", "text": "Check if README.md has any typos and fix them. If none, just say so." }, + { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600 }, + { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 } + ] + }, + { + "id": "orchestration-after-worker-error-works", + "name": "Orchestration works after a previous worker error", + "description": "If a previous orchestration had a worker error, the next one should still work.", + "category": "subsequent-orchestrations", + "invariants": [ + "All workers dispatch correctly", + "No lingering error state from previous run" + ], + "steps": [ + { "action": "note", "text": "This is verified by the second/third orchestration tests above. If any had errors, this confirms recovery." } + ] + }, + { + "id": "follow-up-during-dispatch-queued", + "name": "Follow-up message during orchestration is queued, not steered", + "description": "Send a follow-up while workers are busy. It should be queued (not steer the orchestrator). PR #375 fix.", + "category": "follow-up-steering", + "invariants": [ + "Event log shows QUEUED, not STEER", + "Orchestration not disrupted by follow-up", + "Follow-up processed after current orchestration completes" + ], + "steps": [ + { "action": "sendToGroup", "text": "Review all the files in PolyPilot/Models/ directory." }, + { "action": "wait", "ms": 5000, "note": "Workers start processing" }, + { "action": "sendToGroup", "text": "Also check the Services directory." }, + { "action": "assertEventLog", "pattern": "QUEUED", "expected": true, "note": "Follow-up queued" }, + { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600 } + ] + }, + { + "id": "follow-up-after-completion-works", + "name": "Follow-up after orchestration completes starts new orchestration", + "description": "Send a message after orchestration is fully complete. It should start a new orchestration.", + "category": "follow-up-steering", + "invariants": [ + "New [DISPATCH-ROUTE] appears", + "New orchestration lifecycle begins" + ], + "steps": [ + { "action": "assertProcessing", "sessionContains": "orchestrator", "expected": false, "note": "Verify idle first" }, + { "action": "sendToGroup", "text": "Summarize what changes were made across all previous tasks." }, + { "action": "waitForEventPattern", "pattern": "\\[DISPATCH-ROUTE\\]", "timeout": 60 }, + { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600 } + ] + }, + { + "id": "abort-orchestration-cleans-up", + "name": "Aborting orchestration cleans up all sessions", + "description": "Abort the orchestrator while workers are running. Verify clean state for all sessions.", + "category": "follow-up-steering", + "invariants": [ + "All sessions reach IsProcessing = false", + "No stuck workers", + "PendingOrchestration cleared" + ], + "steps": [ + { "action": "sendToGroup", "text": "Do a comprehensive review of every single file in the repository." }, + { "action": "wait", "ms": 10000, "note": "Workers start processing" }, + { "action": "clickAbort", "sessionContains": "orchestrator" }, + { "action": "wait", "ms": 10000 }, + { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "All sessions idle after abort" } + ] + }, + { + "id": "group-survives-relaunch", + "name": "Multi-agent group persists across app relaunch", + "description": "Relaunch the app and verify the multi-agent group is restored with all sessions.", + "category": "restart-recovery", + "invariants": [ + "Group appears in sidebar after relaunch", + "Orchestrator session restored", + "Worker sessions restored", + "All sessions idle (not stuck processing)" + ], + "steps": [ + { "action": "captureGroupState", "capture": "beforeRelaunch" }, + { "action": "relaunchApp" }, + { "action": "waitForAgent", "timeout": 120 }, + { "action": "assertSessionExists", "nameContains": "orchestrator" }, + { "action": "assertSessionExists", "nameContains": "worker" }, + { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 } + ] + }, + { + "id": "no-phantom-worker-sessions", + "name": "No phantom or duplicate worker sessions after relaunch", + "description": "After relaunch, verify no duplicate worker sessions or phantom '(resumed)' entries.", + "category": "restart-recovery", + "invariants": [ + "No duplicate session names", + "No '(resumed)' in any session name" + ], + "steps": [ + { "action": "evaluate", "script": "Array.from(document.querySelectorAll('.session-name, .session-title')).filter(e => e.textContent.includes('resumed')).length", "expect": 0 } + ] + }, + { + "id": "pending-orchestration-recovery", + "name": "PendingOrchestration recovers after relaunch during dispatch", + "description": "If the app relaunches while workers are processing, PendingOrchestration should enable recovery.", + "category": "restart-recovery", + "invariants": [ + "If pending-orchestration.json existed, recovery was attempted", + "Workers either completed or were collected with partial results" + ], + "steps": [ + { "action": "note", "text": "Best tested by: 1) Start orchestration, 2) Immediately relaunch, 3) Check event log for 'Resuming pending orchestration'" }, + { "action": "shellCheck", "command": "grep 'Resuming pending orchestration\\|pending orchestration' ~/.polypilot/event-diagnostics.log | tail -3 || echo 'no pending recovery'" } + ] + }, + { + "id": "post-relaunch-new-orchestration-works", + "name": "New orchestration works after relaunch", + "description": "After relaunching and restoring the group, send a new prompt. Verify full lifecycle works.", + "category": "restart-recovery", + "invariants": [ + "New dispatch fires", + "Workers process and complete", + "Synthesis completes" + ], + "steps": [ + { "action": "sendToGroup", "text": "What is in the README.md file? Summarize it briefly." }, + { "action": "waitForEventPattern", "pattern": "\\[DISPATCH\\]", "timeout": 60 }, + { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 600 }, + { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 } + ] + }, + { + "id": "workers-not-stuck-after-relaunch", + "name": "No workers stuck in IsProcessing after relaunch", + "description": "After relaunch, verify no worker sessions are stuck processing.", + "category": "restart-recovery", + "invariants": [ + "All workers have IsProcessing = false within 30s of relaunch", + "No [WATCHDOG] kill entries needed" + ], + "steps": [ + { "action": "wait", "ms": 30000, "note": "Wait for any watchdog cleanup" }, + { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 } + ] + }, + { + "id": "event-log-dispatch-complete-pairs", + "name": "Every [DISPATCH] has completion in event log", + "description": "Audit the event log: every worker dispatch should have a completion.", + "category": "state-hygiene", + "invariants": [ + "Number of worker [SEND] ≤ worker [COMPLETE] + [ABORT]", + "No orphaned dispatches" + ], + "steps": [ + { "action": "shellCheck", "command": "sends=$(grep -c '\\[SEND\\].*worker' ~/.polypilot/event-diagnostics.log); completes=$(grep -c '\\[COMPLETE\\].*worker\\|\\[ABORT\\].*worker' ~/.polypilot/event-diagnostics.log); echo \"worker sends=$sends completions=$completes\"" } + ] + }, + { + "id": "no-stuck-tcs-after-orchestration", + "name": "No unresolved TaskCompletionSources after orchestration", + "description": "After orchestration, verify no TCS objects are waiting for completion.", + "category": "state-hygiene", + "invariants": [ + "All sessions idle", + "No pending awaits blocking orchestrator" + ], + "steps": [ + { "action": "assertProcessing", "sessionContains": "orchestrator", "expected": false }, + { "action": "note", "text": "TCS state is internal — best verified by confirming orchestrator can accept new messages" }, + { "action": "sendToGroup", "text": "Say 'TCS test OK'." }, + { "action": "waitForEventPattern", "pattern": "\\[COMPLETE\\].*orchestrator", "timeout": 120 } + ] + }, + { + "id": "watchdog-doesnt-kill-active-workers", + "name": "Watchdog doesn't kill workers that are actively processing", + "description": "During a long orchestration, verify the watchdog doesn't prematurely kill active workers.", + "category": "state-hygiene", + "invariants": [ + "No [WATCHDOG].*kill entries for workers with recent events", + "Watchdog Case B defers correctly when events.jsonl is fresh" + ], + "steps": [ + { "action": "shellCheck", "command": "grep '\\[WATCHDOG\\].*worker.*kill\\|\\[WATCHDOG\\].*worker.*stuck' ~/.polypilot/event-diagnostics.log | wc -l | tr -d ' '", "expect": "0" } + ] + }, + { + "id": "idle-defer-resolves-correctly", + "name": "IDLE-DEFER fires and resolves without leaving sessions stuck", + "description": "If IDLE-DEFER fires during worker execution, it should eventually resolve to completion.", + "category": "state-hygiene", + "invariants": [ + "Every [IDLE-DEFER] is eventually followed by [COMPLETE] for the same session", + "No sessions left in permanent IDLE-DEFER state" + ], + "steps": [ + { "action": "shellCheck", "command": "grep '\\[IDLE-DEFER\\]' ~/.polypilot/event-diagnostics.log | wc -l | tr -d ' '", "note": "Count IDLE-DEFER entries" }, + { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "No sessions stuck" } + ] + }, + { + "id": "reconnect-during-orchestration-no-deadlock", + "name": "Reconnect during orchestration doesn't cause deadlock", + "description": "If a reconnect happens mid-orchestration, TCS should be cancelled and orchestration should not deadlock.", + "category": "state-hygiene", + "invariants": [ + "No permanent deadlock after reconnect", + "Orchestration either recovers or fails cleanly" + ], + "steps": [ + { "action": "note", "text": "Best tested by triggering a reconnect mid-orchestration (e.g., server restart). In live testing, verify no sessions are stuck after any reconnect events in the log." }, + { "action": "shellCheck", "command": "grep '\\[RECONNECT\\]' ~/.polypilot/event-diagnostics.log | tail -3 || echo 'no reconnects'" } + ] + }, + { + "id": "delete-group-cleans-everything", + "name": "Deleting multi-agent group removes all sessions cleanly", + "description": "Delete the test multi-agent group. Verify all sessions are removed with no orphans.", + "category": "state-hygiene", + "invariants": [ + "All group sessions removed from sidebar", + "No orphaned worker sessions", + "PendingOrchestration cleared", + "Organization state updated" + ], + "steps": [ + { "action": "deleteGroup", "nameContains": "Implement" }, + { "action": "wait", "ms": 3000 }, + { "action": "assertSessionNotExists", "nameContains": "orchestrator" }, + { "action": "assertSessionNotExists", "nameContains": "worker" }, + { "action": "shellCheck", "command": "cat ~/.polypilot/pending-orchestration.json 2>/dev/null || echo 'clean'", "expect": { "oneOf": ["clean", "{}", "null", ""] } } + ] + } + ] +} diff --git a/PolyPilot.Tests/Scenarios/single-session-scenarios.json b/PolyPilot.Tests/Scenarios/single-session-scenarios.json new file mode 100644 index 000000000..da5567f72 --- /dev/null +++ b/PolyPilot.Tests/Scenarios/single-session-scenarios.json @@ -0,0 +1,633 @@ +{ + "description": "Single-session reliability scenarios for PolyPilot. Tests cover every way to interact with a single session: sending messages, tool use, abort, rapid input, session lifecycle, restart recovery, state hygiene, and edge cases. Execute against a running PolyPilot instance using MauiDevFlow CDP commands.", + "prerequisites": { + "build": "cd PolyPilot && ./relaunch.sh", + "waitForAgent": "maui devflow MAUI status --agent-port 9223", + "initialMode": "Persistent", + "notes": "All test sessions MUST point to the PolyPilot repository (PureWeen-PolyPilot). Never use other repos. Never commit, push, or create GitHub issues/PRs during testing. The DevFlow agent port is 9223 (bypass broker with --agent-port 9223). CDP click events require PointerEvent dispatch for Blazor compatibility." + }, + "cdpHelpers": { + "clickBlazorButton": "btn.dispatchEvent(new PointerEvent('pointerdown', {bubbles: true, clientX: x, clientY: y})); btn.dispatchEvent(new PointerEvent('pointerup', {bubbles: true, clientX: x, clientY: y})); btn.dispatchEvent(new MouseEvent('click', {bubbles: true, clientX: x, clientY: y}));", + "setInputValue": "const el = document.getElementById(inputId); el.value = text; el.dispatchEvent(new Event('input', {bubbles: true}));", + "sendMessage": "1) Set textarea value via getElementValue-compatible path, 2) PointerEvent click on send-btn[title='Send message']", + "checkProcessing": "document.querySelectorAll('.send-btn.stop-btn').length > 0 means session is processing", + "checkEventLog": "grep 'SESSION_NAME' ~/.polypilot/event-diagnostics.log | tail -N" + }, + "scenarios": [ + { + "id": "send-simple-message", + "name": "Simple message sends and receives a response", + "description": "Send 'What is 2+2? Reply with just the number.' and verify the response arrives, IsProcessing clears, and event log shows clean [SEND]→[COMPLETE].", + "category": "basic-message-flow", + "invariants": [ + "Response text appears in chat history", + "IsProcessing = false after completion", + "Event log shows [SEND] followed by [COMPLETE]", + "No [ERROR] or [WATCHDOG] entries for this session", + "Response length > 0" + ], + "steps": [ + { "action": "createSession", "name": "TestSimpleSend", "repo": "PureWeen-PolyPilot" }, + { "action": "selectSession", "name": "TestSimpleSend" }, + { "action": "sendMessage", "text": "What is 2+2? Reply with just the number." }, + { "action": "waitForIdle", "timeout": 60 }, + { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[SEND\\].*\\[COMPLETE\\]", "expected": true }, + { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[ERROR\\]|\\[WATCHDOG\\]", "expected": false }, + { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }, + { "action": "assertResponseNotEmpty", "session": "TestSimpleSend" } + ] + }, + { + "id": "send-long-message", + "name": "Long multi-paragraph prompt processes correctly", + "description": "Send a multi-paragraph prompt to verify large inputs don't break processing.", + "category": "basic-message-flow", + "invariants": [ + "Response arrives for large input", + "IsProcessing = false after completion", + "No truncation or corruption of input" + ], + "steps": [ + { "action": "selectSession", "name": "TestSimpleSend" }, + { "action": "sendMessage", "text": "Please analyze the following requirements and provide your thoughts:\n\n1. The system must handle concurrent requests\n2. It must maintain session state across restarts\n3. Error recovery should be automatic\n4. All state transitions must be logged\n\nWhat design patterns would you recommend?" }, + { "action": "waitForIdle", "timeout": 120 }, + { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }, + { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true } + ] + }, + { + "id": "response-renders-markdown", + "name": "Response renders markdown as formatted HTML", + "description": "Send a prompt requesting markdown output and verify it renders as HTML.", + "category": "basic-message-flow", + "invariants": [ + "Response contains HTML elements (not raw markdown)", + "Code blocks, headers, or lists render correctly" + ], + "steps": [ + { "action": "selectSession", "name": "TestSimpleSend" }, + { "action": "sendMessage", "text": "Write a short bullet list of 3 items about testing." }, + { "action": "waitForIdle", "timeout": 60 }, + { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }, + { "action": "evaluate", "script": "document.querySelectorAll('.message-content li, .message-content ul').length > 0", "expect": true, "note": "Markdown list rendered as HTML" } + ] + }, + { + "id": "response-contains-code-block", + "name": "Code block renders with syntax highlighting", + "description": "Request code output and verify it renders in a code block.", + "category": "basic-message-flow", + "invariants": [ + "Response contains a
 or  element",
+        "Code content is present"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Write a one-line Python hello world program. Just the code, no explanation." },
+        { "action": "waitForIdle", "timeout": 60 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "evaluate", "script": "document.querySelectorAll('.message-content pre, .message-content code').length > 0", "expect": true, "note": "Code block rendered" }
+      ]
+    },
+    {
+      "id": "send-empty-validation",
+      "name": "Empty send is prevented",
+      "description": "Attempt to send an empty message and verify it's blocked.",
+      "category": "basic-message-flow",
+      "invariants": [
+        "No [SEND] entry in event log for empty message",
+        "Session remains idle"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "captureEventLogPosition", "capture": "beforeEmpty" },
+        { "action": "sendMessage", "text": "" },
+        { "action": "wait", "ms": 2000 },
+        { "action": "assertNoNewEvents", "session": "TestSimpleSend", "since": "beforeEmpty", "pattern": "\\[SEND\\]" }
+      ]
+    },
+    {
+      "id": "tool-file-listing",
+      "name": "Tool execution: file listing completes",
+      "description": "Send a file listing request that requires tool use. Verify tools execute and response contains expected files.",
+      "category": "tool-execution",
+      "invariants": [
+        "Tool execution events appear in event log",
+        "Response mentions expected files (README.md, PolyPilot.slnx)",
+        "IsProcessing = false after completion",
+        "No [WATCHDOG] entries"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "List just the file names in the root directory of this repository. Be brief." },
+        { "action": "waitForIdle", "timeout": 120 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[WATCHDOG\\]", "expected": false }
+      ]
+    },
+    {
+      "id": "tool-file-read",
+      "name": "Tool execution: file read completes",
+      "description": "Request reading a specific file to verify file read tools work.",
+      "category": "tool-execution",
+      "invariants": [
+        "Response contains content from the file",
+        "IsProcessing = false after completion"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Read the first line of README.md in this repo and tell me what it says. Be brief." },
+        { "action": "waitForIdle", "timeout": 120 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true }
+      ]
+    },
+    {
+      "id": "tool-multiple-rounds",
+      "name": "Multi-round tool use completes without getting stuck",
+      "description": "Send a request requiring multiple tool calls in sequence. Verify all rounds complete.",
+      "category": "tool-execution",
+      "invariants": [
+        "Multiple TurnStart/TurnEnd cycles in event log",
+        "Session completes with [COMPLETE]",
+        "IsProcessing = false after all rounds"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Find the PolyPilot.slnx file and tell me how many project references it contains. Be brief." },
+        { "action": "waitForIdle", "timeout": 180 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[WATCHDOG\\]", "expected": false }
+      ]
+    },
+    {
+      "id": "tool-execution-shows-progress",
+      "name": "Processing indicator shows during tool use",
+      "description": "During tool execution, verify the UI shows working/tool call indicators.",
+      "category": "tool-execution",
+      "invariants": [
+        "Stop button visible during processing",
+        "Processing indicator updates"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Search the codebase for all files containing 'IsProcessing' and count them. Be brief." },
+        { "action": "wait", "ms": 5000, "note": "Allow time for tool execution to start" },
+        { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn').length", "expect": { "greaterThan": 0 }, "note": "Stop button visible means processing" },
+        { "action": "waitForIdle", "timeout": 180 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+      ]
+    },
+    {
+      "id": "tool-long-running",
+      "name": "Long tool execution doesn't trigger false watchdog",
+      "description": "A prompt requiring extended tool use completes without watchdog interference.",
+      "category": "tool-execution",
+      "invariants": [
+        "No [WATCHDOG] kill entries in event log",
+        "Session completes naturally with [COMPLETE]"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Count the total number of lines across all .cs files in the PolyPilot/Services/ directory. Be brief, just give me the count." },
+        { "action": "waitForIdle", "timeout": 300 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[WATCHDOG\\].*kill|\\[WATCHDOG\\].*stuck", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true }
+      ]
+    },
+    {
+      "id": "abort-during-thinking",
+      "name": "Abort during 'Thinking...' phase cleans up state",
+      "description": "Send a prompt, abort during the thinking phase. Verify IsProcessing clears and all companion fields reset.",
+      "category": "abort-cancel",
+      "invariants": [
+        "IsProcessing = false within 5 seconds of abort",
+        "No lingering spinner or 'Thinking...' indicator",
+        "Event log shows [ABORT]",
+        "Session accepts new input after abort"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Write a very detailed 5000-word essay about software testing methodologies." },
+        { "action": "wait", "ms": 3000, "note": "Wait for thinking phase" },
+        { "action": "clickAbort", "session": "TestSimpleSend" },
+        { "action": "wait", "ms": 5000 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[ABORT\\]", "expected": true }
+      ]
+    },
+    {
+      "id": "abort-during-tool-use",
+      "name": "Abort during tool execution cleans up state",
+      "description": "Send a tool-using prompt, abort during tool execution. Verify clean state.",
+      "category": "abort-cancel",
+      "invariants": [
+        "IsProcessing = false after abort",
+        "ActiveToolCallCount = 0",
+        "Event log shows [ABORT]"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Read every single .cs file in the repository and summarize each one in detail." },
+        { "action": "wait", "ms": 10000, "note": "Wait for tool execution to start" },
+        { "action": "clickAbort", "session": "TestSimpleSend" },
+        { "action": "wait", "ms": 5000 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[ABORT\\]", "expected": true }
+      ]
+    },
+    {
+      "id": "abort-then-send-new",
+      "name": "Session accepts new message after abort",
+      "description": "Abort a session, then immediately send a new prompt. Verify no deadlock.",
+      "category": "abort-cancel",
+      "invariants": [
+        "New message processes successfully after abort",
+        "IsProcessing = false after new message completes",
+        "No deadlock or stuck state"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Write a very long detailed analysis of every design pattern ever created." },
+        { "action": "wait", "ms": 5000 },
+        { "action": "clickAbort", "session": "TestSimpleSend" },
+        { "action": "wait", "ms": 3000 },
+        { "action": "sendMessage", "text": "Say 'hello'. Just that one word." },
+        { "action": "waitForIdle", "timeout": 60 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true }
+      ]
+    },
+    {
+      "id": "abort-when-idle-no-crash",
+      "name": "Abort on idle session doesn't crash",
+      "description": "Try to abort a session that's already idle. Verify no crash or error.",
+      "category": "abort-cancel",
+      "invariants": [
+        "No crash or error",
+        "Session remains idle and functional"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "clickAbort", "session": "TestSimpleSend", "note": "Abort when already idle" },
+        { "action": "wait", "ms": 2000 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "sendMessage", "text": "Say 'still working'. Just those two words." },
+        { "action": "waitForIdle", "timeout": 60 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+      ]
+    },
+    {
+      "id": "back-to-back-simple",
+      "name": "Back-to-back messages both complete",
+      "description": "Send message 1, wait for completion, immediately send message 2. Both should complete.",
+      "category": "rapid-input",
+      "invariants": [
+        "Both messages get [COMPLETE] in event log",
+        "IsProcessing = false after both complete",
+        "No deadlock"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Say 'first'. Just that word." },
+        { "action": "waitForIdle", "timeout": 60 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "sendMessage", "text": "Say 'second'. Just that word." },
+        { "action": "waitForIdle", "timeout": 60 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "count": { "atLeast": 2 } }
+      ]
+    },
+    {
+      "id": "send-while-processing",
+      "name": "Send during processing is handled correctly",
+      "description": "Send a second message while the first is still processing. Verify no crash, second message eventually processes.",
+      "category": "rapid-input",
+      "invariants": [
+        "No crash or exception",
+        "Both messages eventually complete or second is queued",
+        "Session reaches idle state"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "List every .cs file in the PolyPilot/Services directory." },
+        { "action": "wait", "ms": 2000, "note": "First message starts processing" },
+        { "action": "sendMessage", "text": "Say 'queued message received'." },
+        { "action": "waitForIdle", "timeout": 180 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+      ]
+    },
+    {
+      "id": "rapid-three-messages",
+      "name": "Three rapid messages don't cause permanent stuck state",
+      "description": "Send 3 messages in quick succession. Verify session eventually reaches idle.",
+      "category": "rapid-input",
+      "invariants": [
+        "Session eventually reaches idle (IsProcessing = false)",
+        "No permanent stuck state",
+        "Event log has at least one [COMPLETE]"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Say 'one'." },
+        { "action": "wait", "ms": 500 },
+        { "action": "sendMessage", "text": "Say 'two'." },
+        { "action": "wait", "ms": 500 },
+        { "action": "sendMessage", "text": "Say 'three'." },
+        { "action": "waitForIdle", "timeout": 180 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+      ]
+    },
+    {
+      "id": "create-new-session",
+      "name": "Create new session appears in sidebar",
+      "description": "Create a new session and verify it appears in the session list.",
+      "category": "session-lifecycle",
+      "invariants": [
+        "New session appears in sidebar",
+        "Session is empty (no messages)",
+        "Session accepts input"
+      ],
+      "steps": [
+        { "action": "createSession", "name": "TestLifecycle", "repo": "PureWeen-PolyPilot" },
+        { "action": "assertSessionExists", "name": "TestLifecycle" },
+        { "action": "selectSession", "name": "TestLifecycle" },
+        { "action": "assertInputAvailable", "session": "TestLifecycle" }
+      ]
+    },
+    {
+      "id": "create-session-with-repo-context",
+      "name": "Session with repo context has tool access",
+      "description": "Create a session pointed at PolyPilot repo, verify it can use tools.",
+      "category": "session-lifecycle",
+      "invariants": [
+        "Session has repo context",
+        "Tool calls work (file listing succeeds)"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestLifecycle" },
+        { "action": "sendMessage", "text": "What repository am I in? Be brief." },
+        { "action": "waitForIdle", "timeout": 60 },
+        { "action": "assertProcessing", "session": "TestLifecycle", "expected": false }
+      ]
+    },
+    {
+      "id": "switch-between-sessions",
+      "name": "Switching sessions preserves each one's history",
+      "description": "Switch between two sessions and verify each retains its own chat history.",
+      "category": "session-lifecycle",
+      "invariants": [
+        "Each session shows its own history after switching",
+        "No cross-contamination between sessions"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "assertResponseNotEmpty", "session": "TestSimpleSend", "note": "First session has history" },
+        { "action": "selectSession", "name": "TestLifecycle" },
+        { "action": "wait", "ms": 1000 },
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "assertResponseNotEmpty", "session": "TestSimpleSend", "note": "History preserved after switch" }
+      ]
+    },
+    {
+      "id": "close-session-removed",
+      "name": "Closed session doesn't appear after removal",
+      "description": "Close a session and verify it's removed from the sidebar.",
+      "category": "session-lifecycle",
+      "invariants": [
+        "Session removed from sidebar",
+        "Not restored on next check"
+      ],
+      "steps": [
+        { "action": "createSession", "name": "TestClose", "repo": "PureWeen-PolyPilot" },
+        { "action": "assertSessionExists", "name": "TestClose" },
+        { "action": "closeSession", "name": "TestClose" },
+        { "action": "wait", "ms": 2000 },
+        { "action": "assertSessionNotExists", "name": "TestClose" }
+      ]
+    },
+    {
+      "id": "session-history-persists-to-disk",
+      "name": "Chat history persists across interactions",
+      "description": "Send a message, then verify the chat database has the message stored.",
+      "category": "session-lifecycle",
+      "invariants": [
+        "Message appears in chat history",
+        "History survives session switch and return"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Say 'persistence test 12345'. Just those exact words." },
+        { "action": "waitForIdle", "timeout": 60 },
+        { "action": "selectSession", "name": "TestLifecycle" },
+        { "action": "wait", "ms": 1000 },
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "evaluate", "script": "document.querySelector('.expanded-view-container')?.textContent?.includes('persistence test 12345') || document.querySelector('.chat-messages')?.textContent?.includes('persistence test 12345')", "expect": true }
+      ]
+    },
+    {
+      "id": "session-survives-relaunch",
+      "name": "Session restores correctly after app relaunch",
+      "description": "Send a message, relaunch the app, verify session is restored with history.",
+      "category": "connection-recovery",
+      "invariants": [
+        "Session appears in sidebar after relaunch",
+        "Chat history is preserved",
+        "Session is idle (not stuck in processing)"
+      ],
+      "steps": [
+        { "action": "captureSessionList", "capture": "beforeRelaunch" },
+        { "action": "relaunchApp" },
+        { "action": "waitForAgent", "timeout": 120 },
+        { "action": "assertSessionExists", "name": "TestSimpleSend" },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+      ]
+    },
+    {
+      "id": "idle-session-no-resumed-tag",
+      "name": "Idle session doesn't get '(resumed)' tag after relaunch",
+      "description": "After relaunch, idle sessions should not have phantom '(resumed)' markers.",
+      "category": "connection-recovery",
+      "invariants": [
+        "No session names contain '(resumed)' text",
+        "All sessions have their original names"
+      ],
+      "steps": [
+        { "action": "evaluate", "script": "Array.from(document.querySelectorAll('.session-name, .session-title')).filter(e => e.textContent.includes('resumed')).length", "expect": 0, "note": "No phantom (resumed) sessions" }
+      ]
+    },
+    {
+      "id": "no-phantom-sessions-after-relaunch",
+      "name": "No duplicate or phantom sessions after relaunch",
+      "description": "After relaunch, verify no duplicate sessions appeared.",
+      "category": "connection-recovery",
+      "invariants": [
+        "Session count matches pre-relaunch count (±1 for any cleaned up)",
+        "No duplicate session names"
+      ],
+      "steps": [
+        { "action": "evaluate", "script": "new Set(Array.from(document.querySelectorAll('.session-name')).map(e => e.textContent.trim())).size === document.querySelectorAll('.session-name').length", "expect": true, "note": "No duplicate session names" }
+      ]
+    },
+    {
+      "id": "post-relaunch-send-works",
+      "name": "Session is functional after relaunch",
+      "description": "After relaunch, send a message to a restored session and verify it completes.",
+      "category": "connection-recovery",
+      "invariants": [
+        "Message sends successfully",
+        "Response arrives",
+        "IsProcessing = false after completion"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Say 'post relaunch OK'. Just those exact words." },
+        { "action": "waitForIdle", "timeout": 120 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertEventLog", "session": "TestSimpleSend", "pattern": "\\[COMPLETE\\]", "expected": true }
+      ]
+    },
+    {
+      "id": "no-stuck-processing-after-relaunch",
+      "name": "No sessions stuck in IsProcessing after relaunch",
+      "description": "After relaunch, verify no sessions are stuck in processing state.",
+      "category": "connection-recovery",
+      "invariants": [
+        "Zero visible stop buttons (no sessions processing)",
+        "No 'Thinking...' indicators visible"
+      ],
+      "steps": [
+        { "action": "wait", "ms": 30000, "note": "Wait for watchdog to clear any stuck sessions" },
+        { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "No sessions stuck processing" }
+      ]
+    },
+    {
+      "id": "isprocessing-false-when-idle",
+      "name": "IsProcessing is false on all idle sessions",
+      "description": "Check that every session in the list has IsProcessing = false when no messages are being sent.",
+      "category": "state-hygiene",
+      "invariants": [
+        "No stop buttons visible",
+        "No thinking/working indicators"
+      ],
+      "steps": [
+        { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0 },
+        { "action": "evaluate", "script": "document.querySelectorAll('.thinking-indicator, .processing-indicator').length", "expect": 0 }
+      ]
+    },
+    {
+      "id": "event-log-send-complete-pairs",
+      "name": "Every [SEND] has a matching [COMPLETE] or [ABORT]",
+      "description": "Audit the event diagnostics log. Every [SEND] entry should have a corresponding completion.",
+      "category": "state-hygiene",
+      "invariants": [
+        "Number of [SEND] entries ≤ [COMPLETE] + [ABORT] entries (for test sessions)",
+        "No orphaned [SEND] without resolution"
+      ],
+      "steps": [
+        { "action": "shellCheck", "command": "sends=$(grep -c '\\[SEND\\].*TestSimpleSend' ~/.polypilot/event-diagnostics.log); completes=$(grep -c '\\[COMPLETE\\].*TestSimpleSend\\|\\[ABORT\\].*TestSimpleSend' ~/.polypilot/event-diagnostics.log); echo \"sends=$sends completes=$completes\"; [ $sends -le $completes ]" }
+      ]
+    },
+    {
+      "id": "no-watchdog-on-normal-sessions",
+      "name": "Watchdog doesn't fire for normally-completing sessions",
+      "description": "Verify that sessions completing normally don't trigger watchdog entries.",
+      "category": "state-hygiene",
+      "invariants": [
+        "No [WATCHDOG] entries for test sessions",
+        "All completions are via [COMPLETE] or [ABORT]"
+      ],
+      "steps": [
+        { "action": "shellCheck", "command": "grep '\\[WATCHDOG\\].*TestSimpleSend' ~/.polypilot/event-diagnostics.log | grep -v 'deferred' | wc -l | tr -d ' '", "expect": "0" }
+      ]
+    },
+    {
+      "id": "processing-indicator-shows-and-clears",
+      "name": "Processing indicator shows during work and clears after",
+      "description": "Send a message, verify processing indicator appears, then clears after completion.",
+      "category": "state-hygiene",
+      "invariants": [
+        "Processing indicator visible during processing",
+        "Processing indicator gone after completion"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "List the top-level directories in this repository." },
+        { "action": "wait", "ms": 3000 },
+        { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length > 0 || true", "expect": true, "note": "May or may not catch the processing window" },
+        { "action": "waitForIdle", "timeout": 120 },
+        { "action": "evaluate", "script": "document.querySelectorAll('.send-btn.stop-btn, .card-stop-btn').length", "expect": 0, "note": "Processing indicator cleared" }
+      ]
+    },
+    {
+      "id": "session-not-stuck-after-idle",
+      "name": "No phantom 'Thinking...' after session completes",
+      "description": "After a message completes, verify the session is truly idle with no lingering indicators.",
+      "category": "state-hygiene",
+      "invariants": [
+        "No thinking/working text visible for session",
+        "Input is enabled and accepts typing",
+        "Send button is not a stop button"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false },
+        { "action": "assertInputAvailable", "session": "TestSimpleSend" }
+      ]
+    },
+    {
+      "id": "send-special-characters",
+      "name": "Special characters in prompt don't cause issues",
+      "description": "Send a prompt with quotes, backticks, unicode, and special characters.",
+      "category": "edge-cases",
+      "invariants": [
+        "Message sends successfully",
+        "Response arrives",
+        "No crash or error"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "sendMessage", "text": "Echo this back: Hello 'world' \"test\" `code`  & © ñ 🎉" },
+        { "action": "waitForIdle", "timeout": 60 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+      ]
+    },
+    {
+      "id": "model-switch-then-send",
+      "name": "Session works after model switch",
+      "description": "Switch the model for a session, then send a message. Verify it completes.",
+      "category": "edge-cases",
+      "invariants": [
+        "Model switch succeeds",
+        "Message sends and completes with new model",
+        "IsProcessing = false after completion"
+      ],
+      "steps": [
+        { "action": "selectSession", "name": "TestSimpleSend" },
+        { "action": "switchModel", "note": "Switch to a different available model via UI dropdown" },
+        { "action": "sendMessage", "text": "Say 'model switch OK'. Just those words." },
+        { "action": "waitForIdle", "timeout": 120 },
+        { "action": "assertProcessing", "session": "TestSimpleSend", "expected": false }
+      ]
+    },
+    {
+      "id": "stale-session-loads-ok",
+      "name": "Old sessions load without errors",
+      "description": "Verify that existing sessions from before testing load and don't cause errors.",
+      "category": "edge-cases",
+      "invariants": [
+        "Pre-existing sessions visible in sidebar",
+        "No error messages or crash indicators",
+        "App remains responsive"
+      ],
+      "steps": [
+        { "action": "evaluate", "script": "document.querySelectorAll('.session-list-item, .session-card').length > 0", "expect": true, "note": "Sessions loaded" },
+        { "action": "evaluate", "script": "document.querySelectorAll('.error-banner, .crash-indicator, .fatal-error').length", "expect": 0, "note": "No error banners" }
+      ]
+    }
+  ]
+}
diff --git a/PolyPilot.Tests/SessionPersistenceTests.cs b/PolyPilot.Tests/SessionPersistenceTests.cs
index d57c3e022..6dcbb9283 100644
--- a/PolyPilot.Tests/SessionPersistenceTests.cs
+++ b/PolyPilot.Tests/SessionPersistenceTests.cs
@@ -1721,10 +1721,11 @@ public void Merge_NameCollision_DifferentGroupId_WithoutRecoveryMarker_KeepsPrev
     }
 
     [Fact]
-    public void Merge_NameCollision_SameGroupId_StillCreatesPrevious()
+    public void Merge_NameCollision_SameGroupId_WithoutRecoveryMarker_StillCreatesPrevious()
     {
-        // When the collision happens within the same group (e.g., reconnect replaced
-        // the session), the old entry should still be preserved as "(previous)".
+        // When the collision happens within the same group but there is NO explicit
+        // RecoveredFromSessionId marker, we cannot be sure the new session intentionally
+        // replaced the old one — preserve the old entry as "(previous)" to avoid data loss.
         var active = new List
         {
             new() { SessionId = "new-id", DisplayName = "MyWorker", Model = "m",
@@ -1743,6 +1744,31 @@ public void Merge_NameCollision_SameGroupId_StillCreatesPrevious()
         Assert.Equal("MyWorker (previous)", result[1].DisplayName);
     }
 
+    [Fact]
+    public void Merge_NameCollision_SameGroupId_WithExplicitRecovery_DropsPersistedEntry()
+    {
+        // Worker revival (empty response → fresh session) sets RecoveredFromSessionId on the
+        // new session to record that it explicitly replaced the old one.  The merge must
+        // drop the old persisted entry so it never appears as a "(previous)" phantom.
+        var active = new List
+        {
+            new() { SessionId = "new-id", DisplayName = "MyWorker", Model = "m",
+                     WorkingDirectory = "/w", GroupId = "same-group", RecoveredFromSessionId = "old-id" }
+        };
+        var persisted = new List
+        {
+            new() { SessionId = "old-id", DisplayName = "MyWorker", Model = "m",
+                     WorkingDirectory = "/w", GroupId = "same-group" }
+        };
+
+        var result = CopilotService.MergeSessionEntries(active, persisted, new HashSet(), new HashSet(), _ => true);
+
+        // Only the active (revival) entry should remain — no "(previous)" phantom
+        Assert.Single(result);
+        Assert.Equal("new-id", result[0].SessionId);
+        Assert.Equal("MyWorker", result[0].DisplayName);
+    }
+
     [Fact]
     public void Merge_NameCollision_NullGroupIds_StillCreatesPrevious()
     {
diff --git a/PolyPilot.Tests/SessionStabilityTests.cs b/PolyPilot.Tests/SessionStabilityTests.cs
index 434501b9e..a10b680c5 100644
--- a/PolyPilot.Tests/SessionStabilityTests.cs
+++ b/PolyPilot.Tests/SessionStabilityTests.cs
@@ -303,20 +303,13 @@ public void WatchdogCrashRecovery_ClearsAllCompanionFields()
         var source = File.ReadAllText(TestPaths.EventsCs);
         var watchdogMethod = ExtractMethod(source, "RunProcessingWatchdogAsync");
 
-        // The crash recovery block (Case C kill) must clear companion fields
-        var companionFields = new[]
-        {
-            "IsProcessing = false",
-            "ProcessingPhase",
-            "ProcessingStartedAt",
-            "ToolCallCount",
-        };
-
-        foreach (var field in companionFields)
-        {
-            Assert.True(watchdogMethod.Contains(field, StringComparison.Ordinal),
-                $"Watchdog crash recovery must clear '{field}'");
-        }
+        // The crash recovery block must call ClearProcessingState (which atomically
+        // clears IsProcessing, ProcessingPhase, ProcessingStartedAt, ToolCallCount, etc.)
+        Assert.True(watchdogMethod.Contains("ClearProcessingState(state", StringComparison.Ordinal),
+            "Watchdog crash recovery must call ClearProcessingState to atomically clear all companion fields");
+        // Must also set AllowTurnStartRearm = false (terminal forced stop)
+        Assert.True(watchdogMethod.Contains("AllowTurnStartRearm = false", StringComparison.Ordinal),
+            "Watchdog crash recovery must set AllowTurnStartRearm = false");
     }
 
     // ─── Multi-Agent Fix Prompt Enhancement ───
diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs
index 72281e121..bb75f3108 100644
--- a/PolyPilot/Services/CopilotService.Events.cs
+++ b/PolyPilot/Services/CopilotService.Events.cs
@@ -1581,20 +1581,9 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
               $"(responseLen={state.CurrentResponse.Length}, flushedLen={state.FlushedResponse.Length}, thread={Environment.CurrentManagedThreadId})");
         
         CancelProcessingWatchdog(state);
-        // Also cancel any pending TurnEnd→Idle fallback — CompleteResponse is now executing
-        CancelTurnEndFallback(state);
-        CancelToolHealthCheck(state);
-        Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
-        Interlocked.Exchange(ref state.SendingFlag, 0);
-        state.HasUsedToolsThisTurn = false;
-        ClearDeferredIdleTracking(state);
-        state.IsReconnectedSend = false; // Clear reconnect flag on turn completion (defense-in-depth)
         state.FallbackCanceledByTurnStart = false;
-        Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
-        Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
-        Interlocked.Exchange(ref state.EventCountThisTurn, 0);
-        Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
-        state.Info.IsResumed = false; // Clear after first successful turn
+        // Per-turn tracking fields (ActiveToolCallCount, HasUsedToolsThisTurn, etc.)
+        // are cleared by ClearProcessingState below. No need to clear them early.
         var response = state.CurrentResponse.ToString();
         var responseAlreadyFlushedThisTurn = WasResponseAlreadyFlushedThisTurn(state, response);
         if (!string.IsNullOrWhiteSpace(response))
@@ -1640,29 +1629,18 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
         // Clear IsProcessing BEFORE completing the TCS — if the continuation runs
         // synchronously (e.g., in orchestrator reflection loops), the next SendPromptAsync
         // call must see IsProcessing=false or it throws "already processing".
-        state.CurrentResponse.Clear();
-        state.FlushedResponse.Clear();
-        ClearFlushedReplayDedup(state);
-        state.PendingReasoningMessages.Clear();
-        // Accumulate API time before clearing ProcessingStartedAt
-        if (state.Info.ProcessingStartedAt is { } started)
-        {
-            state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - started).TotalSeconds;
-            state.Info.PremiumRequestsUsed++;
-        }
-        state.AllowTurnStartRearm = true; // session.idle/turn-end completion can be premature; allow one late TurnStart recovery
-        state.Info.IsProcessing = false;
-        state.Info.IsResumed = false;
-        Interlocked.Exchange(ref state.SendingFlag, 0); // Release atomic send lock
-        state.Info.ConsecutiveStuckCount = 0;
-        // A successful completion proves the server is healthy — reset the
+        ClearProcessingState(state);
+        // Success-only: allow EVT-REARM to re-arm IsProcessing if a late TurnStart arrives
+        // (premature session.idle recovery). This must be set AFTER ClearProcessingState to
+        // avoid the race where a background TurnStart thread reads AllowTurnStartRearm=true
+        // before error/abort callers can override it back to false.
+        state.AllowTurnStartRearm = true;
+        // Success-only: a successful completion proves the server is healthy — reset the
         // service-level watchdog timeout counter to prevent false recovery triggers.
         Interlocked.Exchange(ref _consecutiveWatchdogTimeouts, 0);
-        state.Info.ProcessingStartedAt = null;
-        state.Info.ToolCallCount = 0;
-        state.Info.ProcessingPhase = 0;
-        state.Info.ClearPermissionDenials();
-        state.Info.LastUpdatedAt = DateTime.Now;
+        // Success-only: a successful response proves the session is not stuck — reset the
+        // per-session consecutive stuck counter so the >= 3 threshold can re-accumulate.
+        state.Info.ConsecutiveStuckCount = 0;
         state.ResponseCompletion?.TrySetResult(fullResponse);
         
         // Fire completion notification BEFORE OnStateChanged — this ensures
@@ -2337,7 +2315,6 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s
         if (state.IsOrphaned) return;
         CancelToolHealthCheck(state);
         CancelProcessingWatchdog(state);
-        CancelTurnEndFallback(state);
 
         var activeTools = Volatile.Read(ref state.ActiveToolCallCount);
         var recoveryGeneration = Interlocked.Read(ref state.ProcessingGeneration);
@@ -2352,16 +2329,8 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s
 
             OnError?.Invoke(sessionName, $"Tool execution stuck ({reason}). Session recovered automatically.");
 
-            // Full cleanup mirroring CompleteResponse — missing fields here caused stuck sessions
-            Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
-            state.HasUsedToolsThisTurn = false;
-            ClearDeferredIdleTracking(state);
             state.FallbackCanceledByTurnStart = false;
-            Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
             Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
-            Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
-            Interlocked.Exchange(ref state.EventCountThisTurn, 0);
-            Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
 
             // Build full response: flushed mid-turn text + remaining current text
             var response = state.CurrentResponse.ToString();
@@ -2371,18 +2340,11 @@ private void TriggerToolHealthRecovery(SessionState state, string sessionName, s
                     : state.FlushedResponse + "\n\n" + response)
                 : response;
 
-            state.CurrentResponse.Clear();
-            state.FlushedResponse.Clear();
-            state.PendingReasoningMessages.Clear();
-
-            state.Info.IsProcessing = false;
+            // Accumulate API time but don't count as premium request (recovery, not success)
+            if (state.Info.ProcessingStartedAt is { } healthStarted)
+                state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - healthStarted).TotalSeconds;
+            ClearProcessingState(state, accumulateApiTime: false);
             state.AllowTurnStartRearm = false; // Explicit tool-health recovery should stay completed
-            state.Info.IsResumed = false;
-            Interlocked.Exchange(ref state.SendingFlag, 0);
-            state.Info.ProcessingStartedAt = null;
-            state.Info.ToolCallCount = 0;
-            state.Info.ProcessingPhase = 0;
-            state.Info.ClearPermissionDenials();
 
             state.ResponseCompletion?.TrySetResult(fullResponse);
 
@@ -2961,33 +2923,19 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
                             return;
                         }
                         CancelProcessingWatchdog(state);
-                        CancelToolHealthCheck(state);
-                        Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
-                        state.HasUsedToolsThisTurn = false;
-                        ClearDeferredIdleTracking(state);
-                        Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
-                        Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
-                        Interlocked.Exchange(ref state.EventCountThisTurn, 0);
-                        Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
-                        // Cancel any pending TurnEnd→Idle fallback
-                        CancelTurnEndFallback(state);
-                        state.Info.IsResumed = false;
-                        state.IsReconnectedSend = false; // INV-1: clear all per-turn flags on termination
                         // Flush any accumulated partial response before clearing processing state.
                         // Wrapped in try-catch: if flush fails, IsProcessing MUST still be cleared
                         // (otherwise the session is permanently stuck — the watchdog has already exited).
                         try { FlushCurrentResponse(state); }
                         catch (Exception flushEx) { Debug($"[WATCHDOG] '{sessionName}' flush failed during kill: {flushEx.Message}"); }
                         Debug($"[WATCHDOG] '{sessionName}' IsProcessing=false — watchdog timeout after {totalProcessingSeconds:F0}s total, elapsed={elapsed:F0}s, exceededMaxTime={exceededMaxTime}");
-                        state.Info.IsProcessing = false;
-                        state.AllowTurnStartRearm = false; // Watchdog timeout is an explicit forced stop
-                        Interlocked.Exchange(ref state.SendingFlag, 0);
+                        // Capture flushed response BEFORE ClearProcessingState clears it
+                        var watchdogResponse = state.FlushedResponse.ToString();
+                        // Accumulate API time (request was in-flight) but don't count as premium request
                         if (state.Info.ProcessingStartedAt is { } wdStarted)
                             state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - wdStarted).TotalSeconds;
-                        state.Info.ProcessingStartedAt = null;
-                        state.Info.ToolCallCount = 0;
-                        state.Info.ProcessingPhase = 0;
-                        state.Info.ClearPermissionDenials(); // INV-1: clear on all termination paths
+                        ClearProcessingState(state, accumulateApiTime: false);
+                        state.AllowTurnStartRearm = false; // Watchdog timeout is an explicit forced stop
                         state.Info.ConsecutiveStuckCount++;
                         // Track service-level consecutive watchdog timeouts. When the
                         // persistent server's auth token expires, ALL sessions hang silently.
@@ -3024,9 +2972,6 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
                             // into a session that's in a repeated-stuck cycle.
                             state.Info.MessageQueue.Clear();
                         }
-                        var watchdogResponse = state.FlushedResponse.ToString();
-                        state.FlushedResponse.Clear();
-                        state.PendingReasoningMessages.Clear();
                         state.ResponseCompletion?.TrySetResult(watchdogResponse);
                         // Fire completion notification so orchestrator loops are unblocked (INV-O4)
                         OnSessionComplete?.Invoke(sessionName, "[Watchdog] timeout");
@@ -3058,28 +3003,15 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
                     // Best-effort flush before clearing processing state
                     try { FlushCurrentResponse(state); }
                     catch { /* Flush failure must not prevent IsProcessing cleanup */ }
-                    // INV-1: clear IsProcessing and all 9 companion fields
-                    state.Info.IsProcessing = false;
+                    // Capture response BEFORE ClearProcessingState clears it
+                    var crashResponse = state.FlushedResponse.ToString() + state.CurrentResponse.ToString();
+                    // Accumulate API time but don't count as premium request
+                    if (state.Info.ProcessingStartedAt is { } crashStarted)
+                        state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - crashStarted).TotalSeconds;
+                    ClearProcessingState(state, accumulateApiTime: false);
                     state.AllowTurnStartRearm = false; // Watchdog crash cleanup is terminal for this turn
-                    state.Info.IsResumed = false;
-                    Interlocked.Exchange(ref state.SendingFlag, 0);
-                    Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
-                    state.HasUsedToolsThisTurn = false;
-                    ClearDeferredIdleTracking(state);
-                    Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
-                    Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0);
-                    Interlocked.Exchange(ref state.EventCountThisTurn, 0);
-                    Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0);
-                    state.Info.ProcessingStartedAt = null;
-                    state.Info.ToolCallCount = 0;
-                    state.Info.ProcessingPhase = 0;
-                    state.Info.ClearPermissionDenials();
                     state.Info.ConsecutiveStuckCount++;
                     Interlocked.Increment(ref _consecutiveWatchdogTimeouts);
-                    var crashResponse = state.FlushedResponse.ToString() + state.CurrentResponse.ToString();
-                    state.FlushedResponse.Clear();
-                    state.CurrentResponse.Clear();
-                    state.PendingReasoningMessages.Clear();
                     state.ResponseCompletion?.TrySetResult(crashResponse);
                     OnSessionComplete?.Invoke(sessionName, "[Watchdog] crash recovery");
                     OnError?.Invoke(sessionName, "Internal error in session monitoring. Try sending your message again.");
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index 698ac072b..b8c790b49 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -42,20 +42,19 @@ public partial class CopilotService
     /// Shorter than WorkerExecutionTimeout — if a worker is stuck, the orchestrator
     /// proceeds with partial results rather than blocking the group forever.
private static readonly TimeSpan OrchestratorCollectionTimeout = TimeSpan.FromMinutes(15); - /// Checks both WasPrematurelyIdled flag (set by EVT-REARM) and events.jsonl freshness - /// (CLI still writing events). The events.jsonl check catches cases where EVT-REARM - /// takes 30-60s to fire. - internal const int PrematureIdleDetectionWindowMs = 10_000; /// If events.jsonl was modified within this many seconds of TCS completion, /// the worker is likely still active despite the premature session.idle. internal const int PrematureIdleEventsFileFreshnessSeconds = 15; - /// Grace period after TCS completion before declaring premature idle based on - /// events.jsonl mtime. During this window we observe whether the file's mtime changes - /// (indicating the CLI is still writing), vs. remaining frozen (normal completion where - /// the idle event itself wrote the file). This prevents false-positive premature idle - /// detection when events.jsonl was just written by the completing idle event. + /// Settle period (first phase of two-phase grace check): waits for the OS to + /// flush the completing event's mtime update before taking the stable baseline snapshot. + /// Must be less than PrematureIdleEventsGracePeriodMs. Observation window = Grace - Settle. + internal const int PrematureIdleEventsSettleMs = 500; + + /// Total observation window: settle (500ms) + observe (1500ms) = 2000ms. + /// After settle, we snapshot stableMtime. After the full window, endMtime > stableMtime + /// means the CLI wrote NEW events (genuine premature idle), not just the completing flush. internal const int PrematureIdleEventsGracePeriodMs = 2000; /// Maximum time to wait for the worker's real completion after detecting a @@ -2411,6 +2410,10 @@ private async Task ExecuteWorkerAsync(string workerName, string ta Debug($"[DISPATCH] Worker '{workerName}' returned empty — attempting fresh session revival"); if (_sessions.TryGetValue(workerName, out var deadState)) { + // Capture the old session ID BEFORE orphaning — used to prevent + // MergeSessionEntries from creating a "(previous)" phantom entry. + var deadSessionId = deadState.Info.SessionId; + // Mark old state as orphaned so any lingering callbacks are no-ops deadState.IsOrphaned = true; try { await deadState.Session.DisposeAsync(); } catch { } @@ -2438,11 +2441,18 @@ private async Task ExecuteWorkerAsync(string workerName, string ta } else { - // Commit SessionId only after TryUpdate succeeds — avoids - // mutating shared Info on a path that might discard the state. + // Commit SessionId + recovery marker only after TryUpdate succeeds. + // RecoveredFromSessionId records that the new session explicitly replaced + // the old one — MergeSessionEntries uses this to drop the old persisted + // entry instead of renaming it "(previous)" (same-group revival case). + deadState.Info.RecoveredFromSessionId ??= deadSessionId; deadState.Info.SessionId = freshSession.SessionId; + // Add the old session ID to the closed set so SaveActiveSessionsToDisk's + // merge logic drops it instead of creating a "(previous)" phantom entry. + if (!string.IsNullOrEmpty(deadSessionId)) + _closedSessionIds[deadSessionId] = 0; DisposePrematureIdleSignal(deadState); - Debug($"[DISPATCH] Worker '{workerName}' revived with fresh session '{freshSession.SessionId}'"); + Debug($"[DISPATCH] Worker '{workerName}' revived with fresh session '{freshSession.SessionId}' (replaced '{deadSessionId}')"); response = await SendPromptAndWaitAsync(workerName, workerPrompt, cancellationToken, originalPrompt: originalPrompt); } } @@ -2675,40 +2685,36 @@ bool WaitForPrematureIdleSignal() if (!detected) { - // Snapshot mtime and wait briefly to detect whether CLI is still writing. - // Without this grace period we'd false-positive: the idle event itself just - // wrote to events.jsonl, making it appear "fresh" even on normal completion. - var mtimeBefore = GetEventsFileMtime(state.Info.SessionId); - try { await Task.Delay(PrematureIdleEventsGracePeriodMs, cancellationToken); } + // Two-phase mtime check to avoid OS mtime-flush false positives. + // + // Problem: the SDK writes assistant.turn_end to events.jsonl, then immediately emits + // session.idle (ephemeral, not written). TCS fires and we capture `mtimeBefore`, but + // the OS may not have flushed the turn_end mtime update yet (APFS can buffer for ~100ms). + // If we compare mtimeBefore (pre-flush) vs stableMtime (post-flush), we see a phantom + // change even though no new processing happened. + // + // Fix: brief settle (500ms) to let the OS flush the completing write's mtime, THEN + // snapshot `stableMtime` as a clean baseline. After 1500ms observation, compare against + // that post-settle baseline — only genuine new writes (tool rounds, etc.) trigger detection. + try { await Task.Delay(PrematureIdleEventsSettleMs, cancellationToken); } catch (OperationCanceledException) { return initialResponse; } + // Post-settle baseline — already includes the completing turn_end write's mtime stableMtime = GetEventsFileMtime(state.Info.SessionId); - // Mtime changed → CLI wrote new events during grace period → genuine premature idle - detected = stableMtime.HasValue && stableMtime.Value > (mtimeBefore ?? DateTime.MinValue); - } - if (!detected) - { - // Wait for PrematureIdleSignal OR poll events.jsonl for mtime changes - var detectStart = DateTime.UtcNow; - while ((DateTime.UtcNow - detectStart).TotalMilliseconds < PrematureIdleDetectionWindowMs) + try { await Task.Delay(PrematureIdleEventsGracePeriodMs - PrematureIdleEventsSettleMs, cancellationToken); } + catch (OperationCanceledException) { return initialResponse; } + + var endMtime = GetEventsFileMtime(state.Info.SessionId); + // endMtime > stableMtime only if the CLI wrote NEW events after the settle + if (endMtime.HasValue && endMtime.Value > (stableMtime ?? DateTime.MinValue)) { - // Wait up to 500ms on the signal (exits immediately if Set()) - var signaled = await Task.Run(WaitForPrematureIdleSignal, cancellationToken) - .ConfigureAwait(false); - - if (signaled || cancellationToken.IsCancellationRequested) - { - detected = signaled; - break; - } - - // Check if events.jsonl mtime advanced past the stable baseline - var currentMtime = GetEventsFileMtime(state.Info.SessionId); - if (currentMtime.HasValue && currentMtime.Value > (stableMtime ?? DateTime.MinValue)) - { - detected = true; - break; - } + var eventsPath = Path.Combine(SessionStatePath, state.Info.SessionId ?? "", "events.jsonl"); + var lastEvent = GetLastEventType(eventsPath); + // session.resume = reconnect write (not new processing) + // session.shutdown = terminal + detected = lastEvent != null && lastEvent != "session.resume" && lastEvent != "session.shutdown"; + if (!detected) + Debug($"[DISPATCH-RECOVER] Skipping grace-period false positive: last event={lastEvent ?? "none"} is a reconnect/terminal write, not premature idle"); } } diff --git a/PolyPilot/Services/CopilotService.Persistence.cs b/PolyPilot/Services/CopilotService.Persistence.cs index 5b8de3589..ab5e6d315 100644 --- a/PolyPilot/Services/CopilotService.Persistence.cs +++ b/PolyPilot/Services/CopilotService.Persistence.cs @@ -242,15 +242,21 @@ private static bool CanSafelyDropSupersededGroupMoveEntry( ActiveSessionEntry existing, ActiveSessionEntry activeCounterpart) { - if (string.IsNullOrEmpty(existing.GroupId) || - string.IsNullOrEmpty(activeCounterpart.GroupId) || - string.Equals(existing.GroupId, activeCounterpart.GroupId, StringComparison.OrdinalIgnoreCase)) + // Require an explicit recovery marker — without it we don't know if the new session + // is really a replacement for the old one (it might just be a same-named unrelated session). + if (string.IsNullOrEmpty(activeCounterpart.RecoveredFromSessionId) || + !string.Equals(activeCounterpart.RecoveredFromSessionId, existing.SessionId, StringComparison.OrdinalIgnoreCase)) { return false; } - return !string.IsNullOrEmpty(activeCounterpart.RecoveredFromSessionId) && - string.Equals(activeCounterpart.RecoveredFromSessionId, existing.SessionId, StringComparison.OrdinalIgnoreCase); + // If RecoveredFromSessionId explicitly matches, allow drop regardless of group. + // Covers two cases: + // 1. Cross-group: scattered team reconstruction moved the session to a new group + // 2. Same-group: worker revival (empty response → fresh session within the same group) + // Both are safe to drop because the active entry explicitly records that it recovered + // history from the old one — there is no user-visible history loss. + return true; } /// diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index d89b23bcf..3d1de493c 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -757,6 +757,58 @@ private static void ClearDeferredIdleTracking(SessionState state, bool preserveC } } + /// + /// Atomically clears ALL processing state on a session. This is the single source of truth + /// for what must be reset when a session stops processing. Every code path that sets + /// IsProcessing=false MUST call this method instead of manually clearing individual fields. + /// + /// Historical context: 13+ PRs of fix/regression cycles were caused by 22 sites that set + /// IsProcessing=false but each forgot different companion fields. This method eliminates + /// that bug category entirely. + /// + /// The session state to clear. + /// True to accumulate API time from ProcessingStartedAt (normal completion). + /// False for error/abort paths where timing is not meaningful. + private void ClearProcessingState(SessionState state, bool accumulateApiTime = true) + { + if (accumulateApiTime && state.Info.ProcessingStartedAt is { } started) + { + state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - started).TotalSeconds; + state.Info.PremiumRequestsUsed++; + } + + state.CurrentResponse.Clear(); + state.FlushedResponse.Clear(); + ClearFlushedReplayDedup(state); + state.PendingReasoningMessages.Clear(); + + state.Info.IsProcessing = false; + state.Info.IsResumed = false; + state.Info.ProcessingStartedAt = null; + state.Info.ToolCallCount = 0; + state.Info.ProcessingPhase = 0; + state.Info.ClearPermissionDenials(); + state.Info.LastUpdatedAt = DateTime.Now; + + Interlocked.Exchange(ref state.SendingFlag, 0); + Interlocked.Exchange(ref state.ActiveToolCallCount, 0); + state.HasUsedToolsThisTurn = false; + ClearDeferredIdleTracking(state); + Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); + Interlocked.Exchange(ref state.ToolHealthStaleChecks, 0); + Interlocked.Exchange(ref state.EventCountThisTurn, 0); + Interlocked.Exchange(ref state.TurnEndReceivedAtTicks, 0); + state.IsReconnectedSend = false; + CancelTurnEndFallback(state); + CancelToolHealthCheck(state); + // NOTE: AllowTurnStartRearm, _consecutiveWatchdogTimeouts, and ConsecutiveStuckCount + // are NOT reset here. All three are cross-turn health accumulators: + // - AllowTurnStartRearm = true only belongs on the normal-completion path (CompleteResponse) + // - _consecutiveWatchdogTimeouts resets only on success (server is healthy) + // - ConsecutiveStuckCount resets only on success (session responded normally) + // Resetting them here would break accumulation across consecutive failures. + } + /// Ping interval to prevent the headless server from killing idle sessions. /// The server has a ~35 minute idle timeout; pinging every 15 minutes keeps sessions alive. internal const int KeepalivePingIntervalSeconds = 15 * 60; // 15 minutes @@ -3276,6 +3328,10 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis if (session != null) { session.IsProcessing = false; + session.IsResumed = false; + session.ProcessingStartedAt = null; + session.ToolCallCount = 0; + session.ProcessingPhase = 0; OnStateChanged?.Invoke(); } throw; @@ -3929,22 +3985,9 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Console.WriteLine($"[DEBUG] Reconnect+retry failed: {retryEx.Message}"); OnError?.Invoke(sessionName, $"Session disconnected and reconnect failed: {Models.ErrorMessageHelper.Humanize(retryEx)}"); CancelProcessingWatchdog(state); - CancelTurnEndFallback(state); - CancelToolHealthCheck(state); FlushCurrentResponse(state); Debug($"[ERROR] '{sessionName}' reconnect+retry failed, clearing IsProcessing"); - Interlocked.Exchange(ref state.ActiveToolCallCount, 0); - state.HasUsedToolsThisTurn = false; - ClearDeferredIdleTracking(state); - Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); - state.Info.IsResumed = false; - state.AllowTurnStartRearm = false; // Explicit reconnect failure is terminal for this turn - state.Info.IsProcessing = false; - if (state.Info.ProcessingStartedAt is { } rcStarted) - state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - rcStarted).TotalSeconds; - state.Info.ProcessingStartedAt = null; - state.Info.ToolCallCount = 0; - state.Info.ProcessingPhase = 0; + ClearProcessingState(state, accumulateApiTime: false); // reconnect failure — no API call consumed OnStateChanged?.Invoke(); throw; } @@ -3953,22 +3996,9 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis { OnError?.Invoke(sessionName, $"SendAsync failed: {Models.ErrorMessageHelper.Humanize(ex)}"); CancelProcessingWatchdog(state); - CancelTurnEndFallback(state); - CancelToolHealthCheck(state); FlushCurrentResponse(state); Debug($"[ERROR] '{sessionName}' SendAsync failed, clearing IsProcessing (error={ex.Message})"); - Interlocked.Exchange(ref state.ActiveToolCallCount, 0); - state.HasUsedToolsThisTurn = false; - ClearDeferredIdleTracking(state); - Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); - state.Info.IsResumed = false; - state.AllowTurnStartRearm = false; // Explicit send failure is terminal for this turn - state.Info.IsProcessing = false; - if (state.Info.ProcessingStartedAt is { } saStarted) - state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - saStarted).TotalSeconds; - state.Info.ProcessingStartedAt = null; - state.Info.ToolCallCount = 0; - state.Info.ProcessingPhase = 0; + ClearProcessingState(state, accumulateApiTime: false); // send failure — request may not have been consumed OnStateChanged?.Invoke(); throw; } @@ -4057,11 +4087,7 @@ public async Task AbortSessionAsync(string sessionName, bool markAsInterrupted = // Optimistically clear processing state and queue if (_sessions.TryGetValue(sessionName, out var remoteState)) { - remoteState.Info.IsProcessing = false; - remoteState.Info.IsResumed = false; - remoteState.Info.ProcessingStartedAt = null; - remoteState.Info.ToolCallCount = 0; - remoteState.Info.ProcessingPhase = 0; + ClearProcessingState(remoteState, accumulateApiTime: false); remoteState.Info.MessageQueue.Clear(); OnStateChanged?.Invoke(); } @@ -4103,34 +4129,21 @@ public async Task AbortSessionAsync(string sessionName, bool markAsInterrupted = state.CurrentResponse.Clear(); Debug($"[ABORT] '{sessionName}' user abort, clearing IsProcessing"); - state.Info.IsProcessing = false; - state.WasUserAborted = true; // Suppress EVT-REARM for in-flight TurnStart events after abort - state.AllowTurnStartRearm = false; // Abort is explicit terminal intent — do not revive on late TurnStart replays - state.Info.IsResumed = false; + // Accumulate API time (the request was in-flight and consumed server resources) + // but don't increment PremiumRequestsUsed — user-initiated aborts shouldn't count + // against the premium request budget. if (state.Info.ProcessingStartedAt is { } abortStarted) state.Info.TotalApiTimeSeconds += (DateTime.UtcNow - abortStarted).TotalSeconds; - state.Info.ProcessingStartedAt = null; - state.Info.ToolCallCount = 0; - state.Info.ProcessingPhase = 0; - Interlocked.Exchange(ref state.ActiveToolCallCount, 0); - state.HasUsedToolsThisTurn = false; - ClearDeferredIdleTracking(state); - state.IsReconnectedSend = false; // INV-1: clear all per-turn flags on abort - Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); - // Release send lock — allows a subsequent SteerSessionAsync to acquire it immediately - Interlocked.Exchange(ref state.SendingFlag, 0); + ClearProcessingState(state, accumulateApiTime: false); + state.WasUserAborted = true; // Suppress EVT-REARM for in-flight TurnStart events after abort + state.AllowTurnStartRearm = false; // Abort is explicit terminal intent — do not revive on late TurnStart replays + // Per-turn tracking fields are already cleared by ClearProcessingState. // Clear queued messages so they don't auto-send after abort state.Info.MessageQueue.Clear(); _queuedImagePaths.TryRemove(sessionName, out _); _queuedAgentModes.TryRemove(sessionName, out _); _permissionRecoveryAttempts.TryRemove(sessionName, out _); - // Cancel any pending TurnEnd→Idle fallback so it doesn't fire CompleteResponse after abort - CancelTurnEndFallback(state); CancelProcessingWatchdog(state); - CancelToolHealthCheck(state); - state.FlushedResponse.Clear(); - state.PendingReasoningMessages.Clear(); - state.Info.ClearPermissionDenials(); // INV-1: clear on all termination paths // Complete TCS AFTER state cleanup (INV-O3) state.ResponseCompletion?.TrySetCanceled(); // Fire completion notification so orchestrator loops are unblocked (INV-O4)