From 3250789134deb5cbc77f0a9c72988181ff0f76c1 Mon Sep 17 00:00:00 2001 From: Christopher Date: Wed, 27 May 2026 21:06:56 +1000 Subject: [PATCH 1/4] skill-quality: add overspecification anti-pattern + coverage-contract guidance Co-Authored-By: Claude Sonnet 4.6 --- .../references/skill-quality-checklist.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md b/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md index d3e95575c..a7f1c949a 100644 --- a/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md +++ b/plugins/agentic-engineering/skills/agent-plugin-review/references/skill-quality-checklist.md @@ -56,6 +56,16 @@ description: Use when tests have race conditions, timing dependencies, or pass/f - [ ] Move heavy reference (100+ lines) to separate files - [ ] Use cross-references instead of repeating content from other skills - [ ] Compress examples — one excellent example beats many mediocre ones +- [ ] When SKILL.md exceeds ~500 words for a standard skill, the heaviest section is almost always inlined reference material — extract it + +### Coverage Contracts vs. Rule Restatement + +When a skill author wants to enforce that the agent doesn't skip rules, the temptation is to inline each rule with its full rationale. Don't. + +- **Coverage contract pattern:** Keep one-line checklist items in SKILL.md naming each rule and citing the reference file (e.g., `"Lifecycle choice — apply large-table rule in references/schema-rules.md"`). Add one sentence: "Silence on any item is itself a review gap." Close the silent-skip loophole with: "If a reference file is unavailable, say so explicitly rather than skipping it." +- **Anti-pattern:** Multi-paragraph items that restate rules and rationale already in `references/`. The fix is structural — the prose is in the wrong file, not the wrong shape. Move operational procedures (how to locate files, `find` syntax, what to record) and output-format meta (citation discipline worked examples) into `references/`. Mark that file as always-load. + +> **Observed case:** A routing SKILL.md grew from ~150 words to ~1168 words when a 5-item "Review Checklist" was inlined — each item restating a rule that also lived in `references/`, ending with "see `references/foo.md`." The reference already existed; the inline was pure duplication. Fix: one-sentence contract per item + a new `references/review-process.md` for the procedures. Result: SKILL.md back to ~388 words; all eval rubric items still satisfied. ### Structure @@ -115,6 +125,8 @@ Match specificity to the task's fragility: | Version printing instructions | Fragile, rely on git history | | Hardcoded local paths | Machine-specific, not portable | | Description summarizes workflow | the agent follows description, skips SKILL.md body | +| SKILL.md inlines rule prose that also lives in `references/` | Two sources of truth — the inline copy drifts from the canonical reference; agent applies the SKILL.md version and ignores the more detailed reference | +| SKILL.md embeds operational procedures or worked-example pairs | Procedures (how to locate files, `find` syntax, what to record) and output-format meta (citation discipline examples) belong in `references/` per progressive disclosure | ## Discipline-Enforcing Skills (Additional Checks) @@ -125,3 +137,5 @@ For skills that enforce rules (TDD, verification, coding standards): - [ ] Red flags list for self-checking - [ ] "Spirit vs letter" addressed: "Violating the letter IS violating the spirit" - [ ] Hard gates at critical decision points +- [ ] Discipline patterns (output-format meta, citation examples, verification procedures) live in `references/` — SKILL.md names them in one line and cites the file +- [ ] Discipline reference file is marked as always-load so the agent cannot bypass it (don't inline to guarantee coverage — mark as unmissable instead) From 9e82d1c72a6401f3790a5a6ec1efe6a75e24641c Mon Sep 17 00:00:00 2001 From: Christopher Date: Wed, 27 May 2026 21:10:01 +1000 Subject: [PATCH 2/4] test(studio): fix tests broken by default-to-projects-dashboard change Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/test/commands/results/serve.test.ts | 4 ++-- apps/cli/test/unit/studio-navigation.test.ts | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 678a0f910..e4fd28c79 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -183,9 +183,9 @@ describe('loadResults', () => { // ── resolveDashboardMode ─────────────────────────────────────────────── describe('resolveDashboardMode', () => { - it('defaults to single-project mode when no projects are registered', () => { + it('defaults to project dashboard mode when no projects are registered', () => { expect(resolveDashboardMode(0, {})).toEqual({ - projectDashboard: false, + projectDashboard: true, }); }); diff --git a/apps/cli/test/unit/studio-navigation.test.ts b/apps/cli/test/unit/studio-navigation.test.ts index 41eb8b3a2..561e20ab2 100644 --- a/apps/cli/test/unit/studio-navigation.test.ts +++ b/apps/cli/test/unit/studio-navigation.test.ts @@ -13,8 +13,8 @@ import { } from '../../../studio/src/lib/navigation.ts'; describe('studio navigation helpers', () => { - it('redirects the root entrypoint to the only registered project', () => { - expect(resolveIndexRoute(['demo-project'], undefined, 'analytics')).toEqual({ + it('redirects when the preferred project id matches a registered project', () => { + expect(resolveIndexRoute(['demo-project'], undefined, 'demo-project', 'analytics')).toEqual({ kind: 'redirect', redirectPath: '/projects/demo-project?tab=analytics', }); From 5b77a80fdc5dd664304811207999b5a2b37bf9a5 Mon Sep 17 00:00:00 2001 From: Christopher Date: Wed, 27 May 2026 21:11:53 +1000 Subject: [PATCH 3/4] fix(lint): resolve pre-existing biome format/type errors Co-Authored-By: Claude Sonnet 4.6 --- apps/cli/src/commands/results/remote.ts | 18 ++++++++++-------- apps/cli/src/commands/results/serve.ts | 4 +++- apps/studio/src/components/EvalDetail.tsx | 12 ++---------- apps/studio/src/components/StopRunButton.tsx | 5 ++++- apps/studio/src/routes/index.tsx | 2 +- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/apps/cli/src/commands/results/remote.ts b/apps/cli/src/commands/results/remote.ts index 70d0aba5b..3969480ad 100644 --- a/apps/cli/src/commands/results/remote.ts +++ b/apps/cli/src/commands/results/remote.ts @@ -4,6 +4,7 @@ import path from 'node:path'; import { DEFAULT_THRESHOLD, type EvaluationResult, + type GitListedRun, type ResultsConfig, type ResultsRepoStatus, directPushResults, @@ -24,11 +25,10 @@ import { listResultFilesFromRunsDir, } from '../inspect/utils.js'; - // ── In-memory TTL cache for listGitRuns ──────────────────────────── // Avoids repeated expensive git ls-tree + git cat-file --batch operations // on every API request. Cache key is repoDir, TTL is 60 seconds. -const gitRunsCache = new Map(); +const gitRunsCache = new Map; expiresAt: number }>(); const GIT_RUNS_CACHE_TTL_MS = 60_000; function cachedListGitRuns(repoDir: string) { @@ -40,12 +40,14 @@ function cachedListGitRuns(repoDir: string) { const promise = listGitRuns(repoDir); gitRunsCache.set(repoDir, { data: promise, expiresAt: now + GIT_RUNS_CACHE_TTL_MS }); // Evict stale entry once the promise settles so a fresh fetch replaces it - promise.catch(() => {}).finally(() => { - const entry = gitRunsCache.get(repoDir); - if (entry && entry.expiresAt <= Date.now()) { - gitRunsCache.delete(repoDir); - } - }); + promise + .catch(() => {}) + .finally(() => { + const entry = gitRunsCache.get(repoDir); + if (entry && entry.expiresAt <= Date.now()) { + gitRunsCache.delete(repoDir); + } + }); return promise; } diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 34de9b080..746d487e2 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -1633,7 +1633,9 @@ export const resultsServeCommand = command({ // Clone or pull any project entries that declare a source. // Non-blocking: fire-and-forget so startup is instant even when some // project paths are missing or slow (e.g. /tmp paths that timeout). - syncProjects(registry.projects).catch((err) => console.error("Background project sync failed:", err)); + syncProjects(registry.projects).catch((err) => + console.error('Background project sync failed:', err), + ); try { let results: EvaluationResult[] = []; diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index 9b114d0cc..5ba44e12e 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -287,11 +287,7 @@ function FilesTab({ return (
{/* FileTree panel — desktop: side-by-side, mobile: full-width slide-over */} -
+
{/* MonacoViewer panel — desktop: side-by-side, mobile: full-width */} -
+
diff --git a/apps/studio/src/components/StopRunButton.tsx b/apps/studio/src/components/StopRunButton.tsx index 534a77103..3d1ae636f 100644 --- a/apps/studio/src/components/StopRunButton.tsx +++ b/apps/studio/src/components/StopRunButton.tsx @@ -59,7 +59,10 @@ export function StopRunButton({ runId, status, isReadOnly, projectId }: StopRunB 'Stopping…' ) : ( <> -