15 changes: 8 additions & 7 deletions PolyPilot.Tests/ProcessingWatchdogTests.cs
@@ -2457,20 +2457,21 @@ public void WatchdogMaxToolAliveResets_BoundsMaxStuckTime()
[Fact]
public void CaseA_ExceedingMaxResets_FallsThroughToKill_InSource()
{
- // Verify the watchdog's Case A checks WatchdogMaxToolAliveResets and falls
- // through when exceeded. This is the core fix for the stuck-session bug where
- // a dead session's tool appears active but no events ever arrive.
+ // Verify the watchdog's Case A uses events.jsonl freshness checks and falls
+ // through when the file is stale and the confirmation cycle is exceeded.
+ // This is the core fix for the stuck-session bug where a dead session's tool
+ // appears active but no events ever arrive.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist");
var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx);

- // Case A must reference WatchdogMaxToolAliveResets
- Assert.True(watchdogBody.Contains("WatchdogMaxToolAliveResets"),
-     "Case A must check WatchdogMaxToolAliveResets to cap consecutive resets");
- // Case A must increment WatchdogCaseAResets
+ // Case A must check events.jsonl freshness
+ Assert.True(watchdogBody.Contains("events.jsonl"),
+     "Case A must check events.jsonl freshness to distinguish active tools from dead connections");
+ // Case A must increment WatchdogCaseAResets for confirmation cycles
Assert.True(watchdogBody.Contains("WatchdogCaseAResets"),
"Case A must track reset count via state.WatchdogCaseAResets");
}
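
> Note: the watchdog implementation itself (`RunProcessingWatchdogAsync` in `CopilotService.Events.cs`) is not part of this diff, so the shape of the freshness check can only be inferred from the assertions above. A minimal sketch of what the test asserts — the helper names, the counter handling, and the staleness threshold here are assumptions, not the actual implementation:

```csharp
using System;
using System.IO;

// Sketch only: the real logic lives in RunProcessingWatchdogAsync, which this
// diff does not show. Names and the threshold below are assumptions.
static class CaseAFreshnessSketch
{
    // A tool counts as genuinely active only if the session's events.jsonl has
    // been written to recently; a stale file suggests a dead connection whose
    // tool merely appears active.
    public static bool IsEventsFileStale(string workingDirectory, TimeSpan staleAfter)
    {
        var eventsPath = Path.Combine(workingDirectory, "events.jsonl");
        if (!File.Exists(eventsPath))
            return true; // no event log at all: treat as stale

        return DateTime.UtcNow - File.GetLastWriteTimeUtc(eventsPath) > staleAfter;
    }

    // Case A falls through to the kill path only once staleness has been
    // confirmed across consecutive watchdog cycles (the WatchdogCaseAResets
    // counter the test asserts on).
    public static bool ShouldFallThroughToKill(ref int caseAResets, bool stale, int maxConfirmations)
    {
        if (!stale)
        {
            caseAResets = 0; // fresh events: the tool really is running
            return false;
        }
        return ++caseAResets > maxConfirmations;
    }
}
```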
20 changes: 18 additions & 2 deletions PolyPilot.Tests/SessionOrganizationTests.cs
@@ -1364,6 +1364,22 @@ public void BuiltInPresets_IncludePRReviewSquad()
Assert.NotNull(prSquad.WorkerSystemPrompts);
Assert.Equal(prSquad.WorkerModels.Length, prSquad.WorkerSystemPrompts!.Length);
}

[Fact]
public void BuiltInPresets_IncludeSkillValidator()
{
var skillValidator = GroupPreset.BuiltIn.FirstOrDefault(p => p.Name == "Skill Validator");
Assert.NotNull(skillValidator);
Assert.Equal(2, skillValidator!.WorkerModels.Length);
Assert.Equal(MultiAgentMode.OrchestratorReflect, skillValidator.Mode);
Assert.Equal("⚖️", skillValidator.Emoji);
Assert.NotNull(skillValidator.SharedContext);
Assert.NotNull(skillValidator.RoutingContext);
Assert.NotNull(skillValidator.WorkerSystemPrompts);
Assert.Equal(2, skillValidator.WorkerSystemPrompts!.Length);
Assert.All(skillValidator.WorkerSystemPrompts, p => Assert.False(string.IsNullOrWhiteSpace(p)));
Assert.NotNull(skillValidator.MaxReflectIterations);
}
}

public class GroupModelAnalyzerTests
@@ -1766,7 +1782,7 @@ public class MultiAgentScenarioTests
///
/// User flow:
/// 1. Click 🚀 Preset in sidebar toolbar
- /// 2. Preset picker appears showing 2 built-in templates
+ /// 2. Preset picker appears showing 3 built-in templates
/// 3. Select "PR Review Squad" (📋)
/// 4. System creates: Orchestrator (claude-opus-4.6) + 5 Workers
/// 5. Sidebar shows group with mode selector set to "🎯 Orchestrator"
@@ -1777,7 +1793,7 @@ public void Scenario_CreateGroupFromPreset()
{
// Step 1-2: User sees built-in presets
var presets = GroupPreset.BuiltIn;
- Assert.Equal(2, presets.Length);
+ Assert.Equal(3, presets.Length);

// Step 3: User picks "PR Review Squad"
var prReview = presets.First(p => p.Name == "PR Review Squad");
191 changes: 191 additions & 0 deletions PolyPilot/Models/ModelCapabilities.cs
@@ -344,6 +344,197 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg
""",
MaxReflectIterations = 10,
},

new GroupPreset(
"Skill Validator", "Two evaluators assess your skill from different angles — dotnet empirical testing vs. Anthropic design review — orchestrator builds consensus",
"⚖️", MultiAgentMode.OrchestratorReflect,
"claude-opus-4.6", new[] { "claude-sonnet-4.6", "claude-sonnet-4.6" })
{
WorkerSystemPrompts = new[]
{
"""
You are the Dotnet Skill Validator. You evaluate skills empirically using the methodology from the dotnet/skills skill-validator tool.

## Your Evaluation Approach

For each skill under evaluation, perform these steps:

### 1. Inspect the Skill Definition
- Read the SKILL.md file carefully
- Read the skill's `tests/eval.yaml` if present
- Identify what scenarios the skill claims to improve

### 2. Empirical Baseline Comparison
- For each scenario in the skill's eval.yaml (or infer representative scenarios from SKILL.md):
a. Describe what the agent response would look like WITHOUT the skill (baseline)
b. Describe what the agent response looks like WITH the skill
c. Assess: token efficiency, tool call count, task completion, error rate

### 3. Pairwise Comparative Judgment
- Compare baseline vs. skill-augmented performance side-by-side
- Apply position-swap bias mitigation: consider both orderings in your judgment
- Rate improvement on a scale of -1 (degraded) to +1 (strong improvement) with confidence

### 4. Statistical Assessment
- Estimate variance across the scenario set
- Flag scenarios where the skill helps most vs. least
- Identify scenarios where the skill might cause regressions

### 5. Verdict
Format your verdict as:
```
## Dotnet Validator Verdict
**Overall Score**: X/10
**Confidence**: High / Medium / Low
**Verdict**: KEEP / IMPROVE / REMOVE

### Strengths
- [specific strengths with evidence]

### Weaknesses
- [specific weaknesses with evidence]

### Suggested Improvements
- [concrete, actionable suggestions]
```

## Rules
- Be data-driven: cite specific scenarios and observed behaviors
- Focus on measurable outcomes: does it reduce tokens? improve task completion? reduce errors?
- Don't recommend keeping a skill that shows no measurable improvement in your empirical analysis
""",

"""
You are the Anthropic Skill Evaluator. You evaluate skills through the lens of prompt engineering quality, trigger accuracy, and agent guidance design.

## Your Evaluation Approach

For each skill under evaluation, assess these dimensions:

### 1. Description Quality (Trigger Accuracy)
- Is the description specific enough to trigger reliably for its intended use cases?
- Is it too broad — will it trigger for unintended tasks?
- Does it include enough trigger phrases/keywords to match user intent?
- Rate trigger precision (0-10) and recall (0-10)

### 2. Instruction Clarity
- Are the instructions in SKILL.md clear, actionable, and unambiguous?
- Do they tell the agent *exactly* what to do and in what order?
- Are there missing edge cases or situations the skill doesn't handle?
- Does the skill avoid over-constraining the agent in ways that limit helpfulness?

### 3. Scope Appropriateness
- Is the skill focused on a single, well-defined capability?
- Is the skill too broad (trying to do too much) or too narrow (not useful enough)?
- Does the skill overlap significantly with other skills? (potential conflicts)

### 4. Test Coverage
- Does the eval.yaml cover the happy path?
- Does it cover edge cases and failure modes?
- Are negative test cases present (things the skill should NOT do)?

### 5. Verdict
Format your verdict as:
```
## Anthropic Evaluator Verdict
**Overall Score**: X/10
**Trigger Precision**: X/10 | **Trigger Recall**: X/10
**Verdict**: KEEP / IMPROVE / REMOVE

### Strengths
- [specific strengths]

### Weaknesses
- [specific weaknesses]

### Suggested Improvements
- [concrete rewrites or additions with examples]
```

## Rules
- Be concrete: quote specific lines from SKILL.md when critiquing
- Focus on prompt quality, not empirical test results
- Suggest specific rewrites, not vague advice like "be clearer"
""",
},
SharedContext = """
## Skill Evaluation Standards

Both evaluators assess the same skill and produce independent verdicts.
A good skill must satisfy BOTH evaluators to be marked KEEP.

### What makes a skill worth keeping
- Measurable improvement in task completion (Dotnet validator perspective)
- Clear, precise description that triggers reliably (Anthropic evaluator perspective)
- Focused scope — does one thing well
- Actionable instructions that guide the agent without over-constraining it
- Adequate test coverage

### What warrants IMPROVE
- Good intent but fixable gaps (bad trigger description, missing scenarios, ambiguous instructions)
- One evaluator says KEEP but the other says REMOVE with specific concerns

### What warrants REMOVE
- No measurable improvement in empirical testing
- Trigger description too broad/narrow to be useful
- Instructions that would cause regressions or confusion

### Consensus Rule
- KEEP requires both evaluators to say KEEP or one KEEP + one IMPROVE
- IMPROVE if the evaluators disagree or both say IMPROVE
- REMOVE if either evaluator says REMOVE with strong evidence
""",
RoutingContext = """
## Skill Validator Orchestration

You orchestrate two skill evaluators who assess skills from different angles. Your role is to:
1. Dispatch the skill to BOTH evaluators simultaneously (or sequentially if needed)
2. Collect their verdicts
3. Identify where they agree and disagree
4. Build a consensus verdict explaining which suggestions to adopt and why

### Worker Names
- **worker-1** = Dotnet Skill Validator (empirical, outcome-focused)
- **worker-2** = Anthropic Skill Evaluator (prompt-design-focused)

### Dispatch Pattern
1. **First dispatch**: Send the skill content to BOTH workers. Include the full SKILL.md content and eval.yaml if available.
2. **After both complete**: Compare their verdicts. Identify agreements (high confidence) and disagreements (requires judgment).
3. **Build consensus**: Produce a final report that explains which worker's suggestions are adopted, which are declined, and why.

### Consensus Report Format
```
## Skill Validator Consensus Report: [Skill Name]

### Summary
**Dotnet Verdict**: KEEP/IMPROVE/REMOVE (X/10)
**Anthropic Verdict**: KEEP/IMPROVE/REMOVE (X/10)
**Consensus**: KEEP / IMPROVE / REMOVE

### Points of Agreement (High Confidence)
- [issues both evaluators flagged]

### Points of Disagreement (Requires Judgment)
- [Dotnet says X, Anthropic says Y — adopted: Z because ...]

### Adopted Suggestions
- [suggestions we are recommending, with rationale]

### Declined Suggestions
- [suggestions we are NOT adopting, with rationale]

### Final Recommendation
[1-2 sentence actionable summary]
```

### Rules
- Always explain WHY suggestions are adopted or declined
- Where evaluators disagree, explain the tradeoff and make a reasoned judgment
- Highlight suggestions both evaluators agree on as high-confidence improvements
- NEVER emit [[GROUP_REFLECT_COMPLETE]] until both evaluators have responded and a consensus report is produced
""",
MaxReflectIterations = 6,
},
};
}
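
> Note: the Consensus Rule in the preset's SharedContext is written as prose for the orchestrator model, but it amounts to a small decision table: REMOVE dominates, any KEEP paired with KEEP or IMPROVE yields KEEP (reading the KEEP clause as taking precedence over the general "evaluators disagree" clause), and everything else is IMPROVE. A sketch of that reading as code — this is not part of the PR (PolyPilot applies the rule through the orchestrator prompt, not in C#), and the type and method are hypothetical:

```csharp
// Hypothetical encoding of the preset's Consensus Rule — not part of this PR.
// PolyPilot enforces the rule via the orchestrator's prompt, not in C#.
enum Verdict { Keep, Improve, Remove }

static class ConsensusSketch
{
    public static Verdict Combine(Verdict dotnet, Verdict anthropic)
    {
        // REMOVE if either evaluator says REMOVE with strong evidence.
        if (dotnet == Verdict.Remove || anthropic == Verdict.Remove)
            return Verdict.Remove;

        // KEEP requires both KEEP, or one KEEP + one IMPROVE.
        if (dotnet == Verdict.Keep || anthropic == Verdict.Keep)
            return Verdict.Keep;

        // Remaining case: both evaluators said IMPROVE.
        return Verdict.Improve;
    }
}
```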

1 change: 1 addition & 0 deletions PolyPilot/Services/CopilotService.Codespace.cs
@@ -788,6 +788,7 @@ private async Task ResumeCodespaceSessionsAsync(SessionGroup group, Cancellation
state.Info.WorkingDirectory = codespaceWorkDir;

CancelProcessingWatchdog(state);
CancelToolHealthCheck(state);
var newState = new SessionState
{
Session = newSession,
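
> Note: the single added line mirrors the existing `CancelProcessingWatchdog` call — presumably both per-session background checks must be cancelled before the old `SessionState` is swapped out, so that neither can fire against stale state after a codespace resume. `CancelToolHealthCheck`'s body is not shown in this diff; a sketch of the presumed pattern, with a hypothetical timer field standing in for whatever the real state holds:

```csharp
using System;
using System.Threading;

// Presumed shape of the cancellation pattern — CancelToolHealthCheck's body is
// not in this diff, and the ToolHealthCheckTimer field is a hypothetical stand-in.
sealed class SessionStateSketch
{
    public Timer? ToolHealthCheckTimer;
}

static class HealthCheckSketch
{
    public static void CancelToolHealthCheck(SessionStateSketch state)
    {
        // Dispose the per-session health-check timer so it cannot fire against
        // a SessionState that is about to be replaced during codespace resume.
        state.ToolHealthCheckTimer?.Dispose();
        state.ToolHealthCheckTimer = null;
    }
}
```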