15 changes: 8 additions & 7 deletions PolyPilot.Tests/ProcessingWatchdogTests.cs
@@ -2457,20 +2457,21 @@ public void WatchdogMaxToolAliveResets_BoundsMaxStuckTime()
[Fact]
public void CaseA_ExceedingMaxResets_FallsThroughToKill_InSource()
{
- // Verify the watchdog's Case A checks WatchdogMaxToolAliveResets and falls
- // through when exceeded. This is the core fix for the stuck-session bug where
- // a dead session's tool appears active but no events ever arrive.
+ // Verify the watchdog's Case A uses events.jsonl freshness checks and falls
+ // through when the file is stale and the confirmation cycle is exceeded.
+ // This is the core fix for the stuck-session bug where a dead session's tool
+ // appears active but no events ever arrive.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist");
var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx);

- // Case A must reference WatchdogMaxToolAliveResets
- Assert.True(watchdogBody.Contains("WatchdogMaxToolAliveResets"),
-     "Case A must check WatchdogMaxToolAliveResets to cap consecutive resets");
- // Case A must increment WatchdogCaseAResets
+ // Case A must check events.jsonl freshness
+ Assert.True(watchdogBody.Contains("events.jsonl"),
+     "Case A must check events.jsonl freshness to distinguish active tools from dead connections");
+ // Case A must increment WatchdogCaseAResets for confirmation cycles
Assert.True(watchdogBody.Contains("WatchdogCaseAResets"),
"Case A must track reset count via state.WatchdogCaseAResets");
}
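
> Note: the watchdog implementation itself (`RunProcessingWatchdogAsync` in `CopilotService.Events.cs`) is not part of this diff, so the shape of the freshness check can only be inferred from the assertions above. A minimal sketch of what the test asserts — the helper names, the counter handling, and the staleness threshold here are assumptions, not the actual implementation:

```csharp
using System;
using System.IO;

// Sketch only: the real logic lives in RunProcessingWatchdogAsync, which this
// diff does not show. Names and the threshold below are assumptions.
static class CaseAFreshnessSketch
{
    // A tool counts as genuinely active only if the session's events.jsonl has
    // been written to recently; a stale file suggests a dead connection whose
    // tool merely appears active.
    public static bool IsEventsFileStale(string workingDirectory, TimeSpan staleAfter)
    {
        var eventsPath = Path.Combine(workingDirectory, "events.jsonl");
        if (!File.Exists(eventsPath))
            return true; // no event log at all: treat as stale

        return DateTime.UtcNow - File.GetLastWriteTimeUtc(eventsPath) > staleAfter;
    }

    // Case A falls through to the kill path only once staleness has been
    // confirmed across consecutive watchdog cycles (the WatchdogCaseAResets
    // counter the test asserts on).
    public static bool ShouldFallThroughToKill(ref int caseAResets, bool stale, int maxConfirmations)
    {
        if (!stale)
        {
            caseAResets = 0; // fresh events: the tool really is running
            return false;
        }
        return ++caseAResets > maxConfirmations;
    }
}
```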
20 changes: 18 additions & 2 deletions PolyPilot.Tests/SessionOrganizationTests.cs
@@ -1364,6 +1364,22 @@ public void BuiltInPresets_IncludePRReviewSquad()
Assert.NotNull(prSquad.WorkerSystemPrompts);
Assert.Equal(prSquad.WorkerModels.Length, prSquad.WorkerSystemPrompts!.Length);
}

[Fact]
public void BuiltInPresets_IncludeSkillValidator()
{
var skillValidator = GroupPreset.BuiltIn.FirstOrDefault(p => p.Name == "Skill Validator");
Assert.NotNull(skillValidator);
Assert.Equal(2, skillValidator!.WorkerModels.Length);
Assert.Equal(MultiAgentMode.OrchestratorReflect, skillValidator.Mode);
Assert.Equal("⚖️", skillValidator.Emoji);
Assert.NotNull(skillValidator.SharedContext);
Assert.NotNull(skillValidator.RoutingContext);
Assert.NotNull(skillValidator.WorkerSystemPrompts);
Assert.Equal(2, skillValidator.WorkerSystemPrompts!.Length);
Assert.All(skillValidator.WorkerSystemPrompts, p => Assert.False(string.IsNullOrWhiteSpace(p)));
Assert.NotNull(skillValidator.MaxReflectIterations);
}
}

public class GroupModelAnalyzerTests
@@ -1766,7 +1782,7 @@ public class MultiAgentScenarioTests
///
/// User flow:
/// 1. Click 🚀 Preset in sidebar toolbar
- /// 2. Preset picker appears showing 2 built-in templates
+ /// 2. Preset picker appears showing 3 built-in templates
/// 3. Select "PR Review Squad" (📋)
/// 4. System creates: Orchestrator (claude-opus-4.6) + 5 Workers
/// 5. Sidebar shows group with mode selector set to "🎯 Orchestrator"
@@ -1777,7 +1793,7 @@ public void Scenario_CreateGroupFromPreset()
{
// Step 1-2: User sees built-in presets
var presets = GroupPreset.BuiltIn;
- Assert.Equal(2, presets.Length);
+ Assert.Equal(3, presets.Length);

// Step 3: User picks "PR Review Squad"
var prReview = presets.First(p => p.Name == "PR Review Squad");
191 changes: 191 additions & 0 deletions PolyPilot/Models/ModelCapabilities.cs
@@ -344,6 +344,197 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg
""",
MaxReflectIterations = 10,
},

new GroupPreset(
"Skill Validator", "Two evaluators assess your skill from different angles — dotnet empirical testing vs. Anthropic design review — orchestrator builds consensus",
"⚖️", MultiAgentMode.OrchestratorReflect,
"claude-opus-4.6", new[] { "claude-sonnet-4.6", "claude-sonnet-4.6" })
{
WorkerSystemPrompts = new[]
{
"""
You are the Dotnet Skill Validator. You evaluate skills empirically using the methodology from the dotnet/skills skill-validator tool.

## Your Evaluation Approach

For each skill under evaluation, perform these steps:

### 1. Inspect the Skill Definition
- Read the SKILL.md file carefully
- Read the skill's `tests/eval.yaml` if present
- Identify what scenarios the skill claims to improve

### 2. Empirical Baseline Comparison
- For each scenario in the skill's eval.yaml (or infer representative scenarios from SKILL.md):
a. Describe what the agent response would look like WITHOUT the skill (baseline)
b. Describe what the agent response looks like WITH the skill
c. Assess: token efficiency, tool call count, task completion, error rate

### 3. Pairwise Comparative Judgment
- Compare baseline vs. skill-augmented performance side-by-side
- Apply position-swap bias mitigation: consider both orderings in your judgment
- Rate improvement on a scale of -1 (degraded) to +1 (strong improvement) with confidence

### 4. Statistical Assessment
- Estimate variance across the scenario set
- Flag scenarios where the skill helps most vs. least
- Identify scenarios where the skill might cause regressions

### 5. Verdict
Format your verdict as:
```
## Dotnet Validator Verdict
**Overall Score**: X/10
**Confidence**: High / Medium / Low
**Verdict**: KEEP / IMPROVE / REMOVE

### Strengths
- [specific strengths with evidence]

### Weaknesses
- [specific weaknesses with evidence]

### Suggested Improvements
- [concrete, actionable suggestions]
```

## Rules
- Be data-driven: cite specific scenarios and observed behaviors
- Focus on measurable outcomes: does it reduce tokens? improve task completion? reduce errors?
- Don't recommend keeping a skill that shows no measurable improvement in your empirical analysis
""",

"""
You are the Anthropic Skill Evaluator. You evaluate skills through the lens of prompt engineering quality, trigger accuracy, and agent guidance design.

## Your Evaluation Approach

For each skill under evaluation, assess these dimensions:

### 1. Description Quality (Trigger Accuracy)
- Is the description specific enough to trigger reliably for its intended use cases?
- Is it too broad — will it trigger for unintended tasks?
- Does it include enough trigger phrases/keywords to match user intent?
- Rate trigger precision (0-10) and recall (0-10)

### 2. Instruction Clarity
- Are the instructions in SKILL.md clear, actionable, and unambiguous?
- Do they tell the agent *exactly* what to do and in what order?
- Are there missing edge cases or situations the skill doesn't handle?
- Does the skill avoid over-constraining the agent in ways that limit helpfulness?

### 3. Scope Appropriateness
- Is the skill focused on a single, well-defined capability?
- Is the skill too broad (trying to do too much) or too narrow (not useful enough)?
- Does the skill overlap significantly with other skills? (potential conflicts)

### 4. Test Coverage
- Does the eval.yaml cover the happy path?
- Does it cover edge cases and failure modes?
- Are negative test cases present (things the skill should NOT do)?

### 5. Verdict
Format your verdict as:
```
## Anthropic Evaluator Verdict
**Overall Score**: X/10
**Trigger Precision**: X/10 | **Trigger Recall**: X/10
**Verdict**: KEEP / IMPROVE / REMOVE

### Strengths
- [specific strengths]

### Weaknesses
- [specific weaknesses]

### Suggested Improvements
- [concrete rewrites or additions with examples]
```

## Rules
- Be concrete: quote specific lines from SKILL.md when critiquing
- Focus on prompt quality, not empirical test results
- Suggest specific rewrites, not vague advice like "be clearer"
""",
},
SharedContext = """
## Skill Evaluation Standards

Both evaluators assess the same skill and produce independent verdicts.
A good skill must satisfy BOTH evaluators to be marked KEEP.

### What makes a skill worth keeping
- Measurable improvement in task completion (Dotnet validator perspective)
- Clear, precise description that triggers reliably (Anthropic evaluator perspective)
- Focused scope — does one thing well
- Actionable instructions that guide the agent without over-constraining it
- Adequate test coverage

### What warrants IMPROVE
- Good intent but fixable gaps (bad trigger description, missing scenarios, ambiguous instructions)
- One evaluator says KEEP but the other says REMOVE with specific concerns

### What warrants REMOVE
- No measurable improvement in empirical testing
- Trigger description too broad/narrow to be useful
- Instructions that would cause regressions or confusion

### Consensus Rule
- KEEP requires both evaluators to say KEEP or one KEEP + one IMPROVE
- IMPROVE if the evaluators disagree or both say IMPROVE
- REMOVE if either evaluator says REMOVE with strong evidence
""",
RoutingContext = """
## Skill Validator Orchestration

You orchestrate two skill evaluators who assess skills from different angles. Your role is to:
1. Dispatch the skill to BOTH evaluators simultaneously (or sequentially if needed)
2. Collect their verdicts
3. Identify where they agree and disagree
4. Build a consensus verdict explaining which suggestions to adopt and why

### Worker Names
- **worker-1** = Dotnet Skill Validator (empirical, outcome-focused)
- **worker-2** = Anthropic Skill Evaluator (prompt-design-focused)

### Dispatch Pattern
1. **First dispatch**: Send the skill content to BOTH workers. Include the full SKILL.md content and eval.yaml if available.
2. **After both complete**: Compare their verdicts. Identify agreements (high confidence) and disagreements (requires judgment).
3. **Build consensus**: Produce a final report that explains which worker's suggestions are adopted, which are declined, and why.

### Consensus Report Format
```
## Skill Validator Consensus Report: [Skill Name]

### Summary
**Dotnet Verdict**: KEEP/IMPROVE/REMOVE (X/10)
**Anthropic Verdict**: KEEP/IMPROVE/REMOVE (X/10)
**Consensus**: KEEP / IMPROVE / REMOVE

### Points of Agreement (High Confidence)
- [issues both evaluators flagged]

### Points of Disagreement (Requires Judgment)
- [Dotnet says X, Anthropic says Y — adopted: Z because ...]

### Adopted Suggestions
- [suggestions we are recommending, with rationale]

### Declined Suggestions
- [suggestions we are NOT adopting, with rationale]

### Final Recommendation
[1-2 sentence actionable summary]
```

### Rules
- Always explain WHY suggestions are adopted or declined
- Where evaluators disagree, explain the tradeoff and make a reasoned judgment
- Highlight suggestions both evaluators agree on as high-confidence improvements
- NEVER emit [[GROUP_REFLECT_COMPLETE]] until both evaluators have responded and a consensus report is produced
""",
MaxReflectIterations = 6,
},
};
}
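
> Note: the Consensus Rule in the preset's SharedContext is written as prose for the orchestrator model, but it amounts to a small decision table: REMOVE dominates, any KEEP paired with KEEP or IMPROVE yields KEEP (reading the KEEP clause as taking precedence over the general "evaluators disagree" clause), and everything else is IMPROVE. A sketch of that reading as code — this is not part of the PR (PolyPilot applies the rule through the orchestrator prompt, not in C#), and the type and method are hypothetical:

```csharp
// Hypothetical encoding of the preset's Consensus Rule — not part of this PR.
// PolyPilot enforces the rule via the orchestrator's prompt, not in C#.
enum Verdict { Keep, Improve, Remove }

static class ConsensusSketch
{
    public static Verdict Combine(Verdict dotnet, Verdict anthropic)
    {
        // REMOVE if either evaluator says REMOVE with strong evidence.
        if (dotnet == Verdict.Remove || anthropic == Verdict.Remove)
            return Verdict.Remove;

        // KEEP requires both KEEP, or one KEEP + one IMPROVE.
        if (dotnet == Verdict.Keep || anthropic == Verdict.Keep)
            return Verdict.Keep;

        // Remaining case: both evaluators said IMPROVE.
        return Verdict.Improve;
    }
}
```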

1 change: 1 addition & 0 deletions PolyPilot/Services/CopilotService.Codespace.cs
@@ -788,6 +788,7 @@ private async Task ResumeCodespaceSessionsAsync(SessionGroup group, Cancellation
state.Info.WorkingDirectory = codespaceWorkDir;

CancelProcessingWatchdog(state);
CancelToolHealthCheck(state);
var newState = new SessionState
{
Session = newSession,
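
> Note: the single added line mirrors the existing `CancelProcessingWatchdog` call — presumably both per-session background checks must be cancelled before the old `SessionState` is swapped out, so that neither can fire against stale state after a codespace resume. `CancelToolHealthCheck`'s body is not shown in this diff; a sketch of the presumed pattern, with a hypothetical timer field standing in for whatever the real state holds:

```csharp
using System;
using System.Threading;

// Presumed shape of the cancellation pattern — CancelToolHealthCheck's body is
// not in this diff, and the ToolHealthCheckTimer field is a hypothetical stand-in.
sealed class SessionStateSketch
{
    public Timer? ToolHealthCheckTimer;
}

static class HealthCheckSketch
{
    public static void CancelToolHealthCheck(SessionStateSketch state)
    {
        // Dispose the per-session health-check timer so it cannot fire against
        // a SessionState that is about to be replaced during codespace resume.
        state.ToolHealthCheckTimer?.Dispose();
        state.ToolHealthCheckTimer = null;
    }
}
```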