From a03d564ccb6ba447cba8a1f8f5c03d04f70e5d0c Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Fri, 10 Apr 2026 09:31:29 -0700
Subject: [PATCH 1/6] improve prompt to only extract reusable workflows

---
 evals/skill_extraction.eval.ts                | 384 ++++++++++++++++++
 .../src/agents/skill-extraction-agent.test.ts |  90 ++++
 .../core/src/agents/skill-extraction-agent.ts |  75 +++-
 3 files changed, 528 insertions(+), 21 deletions(-)
 create mode 100644 evals/skill_extraction.eval.ts
 create mode 100644 packages/core/src/agents/skill-extraction-agent.test.ts
diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts
new file mode 100644
index 00000000000..5feaa29a20a
--- /dev/null
+++ b/evals/skill_extraction.eval.ts
@@ -0,0 +1,384 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import fs from 'node:fs';
+import fsp from 'node:fs/promises';
+import path from 'node:path';
+import { randomUUID } from 'node:crypto';
+import { describe, expect } from 'vitest';
+import {
+  Storage,
+  SESSION_FILE_PREFIX,
+  getProjectHash,
+  startMemoryService,
+} from '@google/gemini-cli-core';
+import {
+  loadCliConfig,
+  type CliArgs,
+} from '../packages/cli/src/config/config.js';
+import {
+  loadSettings,
+  resetSettingsCacheForTesting,
+} from '../packages/cli/src/config/settings.js';
+import { validateNonInteractiveAuth } from '../packages/cli/src/validateNonInterActiveAuth.js';
+import { evalTest, assertModelHasOutput, type TestRig } from './test-helper.js';
+
+interface SeedSession {
+  sessionId: string; // stored as-is; first 8 chars also go in the filename
+  summary: string; // written to the session file's `summary` field
+  userTurns: string[]; // one user message (plus a canned reply) per entry
+  timestampOffsetMinutes: number; // minutes before now, for the filename stamp
+}
+
+const MEMORY_EXTRACTION_ARGV: CliArgs = {
+  query: undefined, // no query/prompt: this argv only feeds loadCliConfig
+  model: undefined,
+  sandbox: undefined,
+  debug: false,
+  prompt: undefined,
+  promptInteractive: undefined,
+  yolo: true, // NOTE(review): presumably skips approval prompts — confirm
+  approvalMode: 'yolo',
+  policy: undefined,
+  adminPolicy: undefined,
+  allowedMcpServerNames: undefined,
+  allowedTools: undefined,
+  acp: false,
+  experimentalAcp: false,
+  extensions: undefined,
+  listExtensions: false,
+  resume: undefined,
+  listSessions: false,
+  deleteSession: undefined,
+  includeDirectories: undefined,
+  screenReader: false,
+  useWriteTodos: undefined,
+  outputFormat: undefined,
+  fakeResponses: undefined,
+  recordResponses: undefined,
+  startupMessages: [],
+  rawOutput: false,
+  acceptRawOutputRisk: false,
+  isCommand: false,
+};
+
+const WORKSPACE_FILES = { // passed as `files` to every evalTest case below
+  'package.json': JSON.stringify(
+    {
+      name: 'skill-extraction-eval',
+      private: true,
+      scripts: { // placeholders: each script only echoes its own name
+        build: 'echo build',
+        lint: 'echo lint',
+        test: 'echo test',
+      },
+    },
+    null,
+    2, // pretty-print with a 2-space indent
+  ),
+  'README.md': `# Skill Extraction Eval
+
+This workspace exists to exercise background skill extraction from prior chats.
+`,
+};
+
+function restoreGeminiHome(previousValue: string | undefined): void {
+  if (previousValue === undefined) {
+    delete process.env['GEMINI_CLI_HOME'];
+  } else {
+    process.env['GEMINI_CLI_HOME'] = previousValue;
+  }
+}
+
+async function withRigStorage<T>(
+  rig: TestRig,
+  fn: (storage: Storage, projectRoot: string) => Promise<T>,
+): Promise<T> {
+  const previousGeminiHome = process.env['GEMINI_CLI_HOME'];
+  process.env['GEMINI_CLI_HOME'] = rig.homeDir!;
+
+  try {
+    const projectRoot = fs.realpathSync(rig.testDir!);
+    const storage = new Storage(projectRoot);
+    await storage.initialize();
+    return await fn(storage, projectRoot);
+  } finally {
+    restoreGeminiHome(previousGeminiHome);
+  }
+}
+
+// Expands every user turn into a user message followed by a canned "gemini"
+// reply; all messages carry one timestamp taken six hours before the call.
+function buildMessages(userTurns: string[]) {
+  const sixHoursMs = 6 * 60 * 60 * 1000;
+  const stamp = new Date(Date.now() - sixHoursMs).toISOString();
+  const entry = (id: string, type: string, text: string) => ({
+    id,
+    timestamp: stamp,
+    type,
+    content: [{ text }],
+  });
+  return userTurns.flatMap((turn, position) => {
+    const n = position + 1; // ids and replies are numbered from 1
+    const reply = `Acknowledged: ${n}`;
+    return [entry(`u${n}`, 'user', turn), entry(`a${n}`, 'gemini', reply)];
+  });
+}
+
+async function seedSessions(
+  rig: TestRig,
+  sessions: SeedSession[],
+): Promise<void> { // writes one session JSON per entry under <temp>/chats
+  await withRigStorage(rig, async (storage, projectRoot) => {
+    const chatsDir = path.join(storage.getProjectTempDir(), 'chats');
+    await fsp.mkdir(chatsDir, { recursive: true });
+
+    for (const session of sessions) {
+      const timestamp = new Date(
+        Date.now() - session.timestampOffsetMinutes * 60 * 1000,
+      )
+        .toISOString()
+        .slice(0, 16) // keep "YYYY-MM-DDTHH:mm"
+        .replace(/:/g, '-'); // ':' -> '-' for the filename
+      const filename = `${SESSION_FILE_PREFIX}${timestamp}-${session.sessionId.slice(0, 8)}.json`;
+      const conversation = { // start/update times are fixed 7h/4h before now
+        sessionId: session.sessionId,
+        projectHash: getProjectHash(projectRoot),
+        summary: session.summary,
+        startTime: new Date(Date.now() - 7 * 60 * 60 * 1000).toISOString(),
+        lastUpdated: new Date(Date.now() - 4 * 60 * 60 * 1000).toISOString(),
+        messages: buildMessages(session.userTurns),
+      };
+
+      await fsp.writeFile(
+        path.join(chatsDir, filename),
+        JSON.stringify(conversation, null, 2),
+      );
+    }
+  });
+}
+
+async function waitForExtractionState(rig: TestRig): Promise<{
+  state: { runs: Array<{ sessionIds: string[]; skillsCreated: string[] }> };
+  skillsDir: string;
+}> { // runs extraction in-process, then returns the parsed run state
+  return withRigStorage(rig, async (storage, projectRoot) => {
+    // The headless CLI eval finishes and exits before its fire-and-forget
+    // memory task can complete, so invoke the real memory service directly.
+    const previousCwd = process.cwd();
+    let config: Awaited<ReturnType<typeof loadCliConfig>> | undefined;
+
+    process.chdir(projectRoot); // undone in the finally block below
+
+    try {
+      resetSettingsCacheForTesting();
+      const settings = loadSettings(projectRoot);
+      config = await loadCliConfig(
+        settings.merged,
+        `skill-extraction-eval-${randomUUID().slice(0, 8)}`,
+        MEMORY_EXTRACTION_ARGV,
+        { cwd: projectRoot },
+      );
+      await config.initialize();
+
+      const authType = await validateNonInteractiveAuth(
+        settings.merged.security.auth.selectedType,
+        settings.merged.security.auth.useExternal,
+        config,
+        settings,
+      );
+      await config.refreshAuth(authType);
+      await startMemoryService(config); // awaited before the state file is read
+    } finally {
+      process.chdir(previousCwd);
+      resetSettingsCacheForTesting();
+      await config?.dispose(); // config is undefined if loading threw
+    }
+
+    const statePath = path.join(
+      storage.getProjectMemoryTempDir(),
+      '.extraction-state.json',
+    );
+    const skillsDir = storage.getProjectSkillsMemoryDir();
+
+    const raw = await fsp.readFile(statePath, 'utf-8'); // rejects if absent
+    const state = JSON.parse(raw) as {
+      runs?: Array<{ sessionIds?: string[]; skillsCreated?: string[] }>;
+    };
+    if (!Array.isArray(state.runs) || state.runs.length === 0) {
+      throw new Error(
+        'Skill extraction finished without writing any run state',
+      );
+    }
+
+    return {
+      state: {
+        runs: state.runs.map((run) => ({ // missing arrays become []
+          sessionIds: Array.isArray(run.sessionIds) ? run.sessionIds : [],
+          skillsCreated: Array.isArray(run.skillsCreated)
+            ? run.skillsCreated
+            : [],
+        })),
+      },
+      skillsDir,
+    };
+  });
+}
+
+// Best-effort read of each <skillsDir>/<name>/SKILL.md; never rejects. A
+// missing skills dir yields []; an unreadable SKILL.md is skipped on its own.
+async function readSkillBodies(skillsDir: string): Promise<string[]> {
+  const names = await fsp.readdir(skillsDir, { withFileTypes: true }).then(
+    (entries) => entries.filter((e) => e.isDirectory()).map((e) => e.name),
+    (): string[] => [],
+  );
+  const reads = await Promise.allSettled(
+    names.map((name) =>
+      fsp.readFile(path.join(skillsDir, name, 'SKILL.md'), 'utf-8'),
+    ),
+  );
+  return reads.flatMap((r) => (r.status === 'fulfilled' ? [r.value] : []));
+}
+
+describe('Skill Extraction', () => {
+  evalTest('USUALLY_PASSES', {
+    suiteName: 'skill-extraction',
+    suiteType: 'behavioral',
+    name: 'ignores one-off incidents even when session summaries look similar',
+    files: WORKSPACE_FILES,
+    timeout: 180000, // NOTE(review): presumably milliseconds (3 min) — confirm
+    params: {
+      settings: {
+        experimental: {
+          memoryManager: true,
+        },
+      },
+    },
+    setup: async (rig) => { // seed two sessions whose turns call themselves one-off
+      await seedSessions(rig, [
+        {
+          sessionId: 'incident-login-redirect',
+          summary: 'Debug login redirect loop in staging',
+          timestampOffsetMinutes: 420, // 7 hours before now
+          userTurns: [
+            'We only need a one-off fix for incident INC-4412 on branch hotfix/login-loop.',
+            'The exact failing string is ERR_REDIRECT_4412 and this workaround is incident-specific.',
+            'Patch packages/auth/src/redirect.ts just for this branch and do not generalize it.',
+            'The thing that worked was deleting the stale staging cookie before retrying.',
+            'This is not a normal workflow and should not become a reusable instruction.',
+            'It only reproduced against the 2026-04-08 staging rollout.',
+            'After the cookie clear, the branch-specific redirect logic passed.',
+            'Do not turn this incident writeup into a standing process.',
+            'Yes, the hotfix worked for this exact redirect-loop incident.',
+            'Close out INC-4412 once the staging login succeeds again.',
+          ],
+        },
+        {
+          sessionId: 'incident-login-timeout',
+          summary: 'Debug login callback timeout in staging',
+          timestampOffsetMinutes: 360, // 6 hours before now
+          userTurns: [
+            'This is another one-off staging incident, this time TICKET-991 for callback timeout.',
+            'The exact failing string is ERR_CALLBACK_TIMEOUT_991 and it is unrelated to the redirect loop.',
+            'The temporary fix was rotating the staging secret and deleting a bad feature-flag row.',
+            'Do not write a generic login-debugging playbook from this.',
+            'This only applied to the callback timeout during the April rollout.',
+            'The successful fix was specific to the stale secret in staging.',
+            'It does not define a durable repo workflow for future tasks.',
+            'After rotating the secret, the callback timeout stopped reproducing.',
+            'Treat this as incident response only, not a reusable skill.',
+            'Once staging passed again, we closed TICKET-991.',
+          ],
+        },
+      ]);
+    },
+    prompt: // does not mention the seeded sessions
+      'Read the local workspace files and summarize this repository in two short sentences.',
+    assert: async (rig, result) => {
+      assertModelHasOutput(result);
+
+      const { state, skillsDir } = await waitForExtractionState(rig);
+      const skillBodies = await readSkillBodies(skillsDir);
+
+      expect(state.runs).toHaveLength(1); // exactly one run recorded
+      expect(state.runs[0].sessionIds).toHaveLength(2); // it lists two session ids
+      expect(state.runs[0].skillsCreated).toEqual([]); // and records no created skill
+      expect(skillBodies).toEqual([]); // no SKILL.md bodies were read back
+    },
+  });
+
+  evalTest('USUALLY_PASSES', {
+    suiteName: 'skill-extraction',
+    suiteType: 'behavioral',
+    name: 'extracts a repeated project-specific workflow into a skill',
+    files: WORKSPACE_FILES,
+    timeout: 180000, // NOTE(review): presumably milliseconds (3 min) — confirm
+    params: {
+      settings: {
+        experimental: {
+          memoryManager: true,
+        },
+      },
+    },
+    setup: async (rig) => { // seed two sessions repeating one three-command sequence
+      await seedSessions(rig, [
+        {
+          sessionId: 'settings-docs-regen-1',
+          summary: 'Update settings docs after adding a config option',
+          timestampOffsetMinutes: 420, // 7 hours before now
+          userTurns: [
+            'When we add a new config option, we have to regenerate the settings docs in a specific order.',
+            'The sequence that worked was npm run predocs:settings, npm run schema:settings, then npm run docs:settings.',
+            'Do not hand-edit generated settings docs.',
+            'If predocs is skipped, the generated schema docs miss the new defaults.',
+            'Update the source first, then run that generation sequence.',
+            'After regenerating, verify the schema output and docs changed together.',
+            'We used this same sequence the last time we touched settings docs.',
+            'That ordered workflow passed and produced the expected generated files.',
+            'Please keep the exact command order because reversing it breaks the output.',
+            'Yes, the generated settings docs were correct after those three commands.',
+          ],
+        },
+        {
+          sessionId: 'settings-docs-regen-2',
+          summary: 'Regenerate settings schema docs for another new setting',
+          timestampOffsetMinutes: 360, // 6 hours before now
+          userTurns: [
+            'We are touching another setting, so follow the same settings-doc regeneration workflow again.',
+            'Run npm run predocs:settings before npm run schema:settings and npm run docs:settings.',
+            'The project keeps generated settings docs in sync through those commands, not manual edits.',
+            'Skipping predocs caused stale defaults in the generated output before.',
+            'Change the source, then execute the same three commands in order.',
+            'Verify both the schema artifact and docs update together after regeneration.',
+            'This is the recurring workflow we use whenever a setting changes.',
+            'The exact order worked again on this second settings update.',
+            'Please preserve that ordering constraint for future settings changes.',
+            'Confirmed: the settings docs regenerated correctly with the same command sequence.',
+          ],
+        },
+      ]);
+    },
+    prompt: // does not mention the seeded sessions
+      'Read the local workspace files and summarize this repository in two short sentences.',
+    assert: async (rig, result) => {
+      assertModelHasOutput(result);
+
+      const { state, skillsDir } = await waitForExtractionState(rig);
+      const skillBodies = await readSkillBodies(skillsDir);
+      const combinedSkills = skillBodies.join('\n\n'); // search all skills at once
+
+      expect(state.runs).toHaveLength(1); // exactly one run recorded
+      expect(state.runs[0].sessionIds).toHaveLength(2); // it lists two session ids
+      expect(state.runs[0].skillsCreated.length).toBeGreaterThanOrEqual(1);
+      expect(skillBodies.length).toBeGreaterThanOrEqual(1);
+      expect(combinedSkills).toContain('npm run predocs:settings'); // all three
+      expect(combinedSkills).toContain('npm run schema:settings'); // commands must
+      expect(combinedSkills).toContain('npm run docs:settings'); // appear somewhere
+      expect(combinedSkills).toMatch(/When to Use/i); // case-insensitive text match
+      expect(combinedSkills).toMatch(/Verification/i);
+    },
+  });
+});
diff --git a/packages/core/src/agents/skill-extraction-agent.test.ts b/packages/core/src/agents/skill-extraction-agent.test.ts
new file mode 100644
index 00000000000..a67c7db270e
--- /dev/null
+++ b/packages/core/src/agents/skill-extraction-agent.test.ts
@@ -0,0 +1,90 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect, it } from 'vitest';
+import { SkillExtractionAgent } from './skill-extraction-agent.js';
+import {
+  EDIT_TOOL_NAME,
+  GLOB_TOOL_NAME,
+  GREP_TOOL_NAME,
+  LS_TOOL_NAME,
+  READ_FILE_TOOL_NAME,
+  WRITE_FILE_TOOL_NAME,
+} from '../tools/tool-names.js';
+import { PREVIEW_GEMINI_FLASH_MODEL } from '../config/models.js';
+
+describe('SkillExtractionAgent', () => {
+  const skillsDir = '/tmp/skills';
+  const sessionIndex =
+    '[NEW] Debug login flow (12 user msgs) — /tmp/chats/session-1.json';
+  const existingSkillsSummary =
+    '## Workspace Skills (.gemini/skills — do NOT duplicate)\n- **existing-skill**: Existing description';
+
+  const agent = SkillExtractionAgent( // one definition shared by every case
+    skillsDir,
+    sessionIndex,
+    existingSkillsSummary,
+  );
+
+  it('should expose expected metadata, model, and tools', () => {
+    expect(agent.kind).toBe('local');
+    expect(agent.name).toBe('confucius');
+    expect(agent.displayName).toBe('Skill Extractor');
+    expect(agent.modelConfig.model).toBe(PREVIEW_GEMINI_FLASH_MODEL);
+    expect(agent.toolConfig?.tools).toEqual(
+      expect.arrayContaining([ // subset match: extra tools are allowed
+        READ_FILE_TOOL_NAME,
+        WRITE_FILE_TOOL_NAME,
+        EDIT_TOOL_NAME,
+        LS_TOOL_NAME,
+        GLOB_TOOL_NAME,
+        GREP_TOOL_NAME,
+      ]),
+    );
+  });
+
+  it('should default to no skill unless recurrence and durability are proven', () => {
+    const prompt = agent.promptConfig.systemPrompt;
+
+    expect(prompt).toContain('Default to NO SKILL.'); // exact prompt literal
+    expect(prompt).toContain(
+      'strong evidence this will recur for future agents in this repo/workflow',
+    );
+    expect(prompt).toContain('broader than a single incident');
+    expect(prompt).toContain('A skill MUST meet ALL of these criteria:');
+    expect(prompt).toContain(
+      'Future agents in this repo/workflow are likely to need it',
+    );
+  });
+
+  it('should explicitly reject one-off incidents and single-session preferences', () => {
+    const prompt = agent.promptConfig.systemPrompt;
+
+    expect(prompt).toContain('Single-session preferences');
+    expect(prompt).toContain('One-off incidents');
+    expect(prompt).toContain('Output-style preferences');
+    expect(prompt).toContain('cannot survive renaming the specific');
+  });
+
+  it('should warn that session summaries are user-intent summaries, not workflow evidence', () => {
+    const query = agent.promptConfig.query ?? ''; // fall back to '' if unset
+
+    expect(query).toContain(existingSkillsSummary); // inputs embedded verbatim
+    expect(query).toContain(sessionIndex);
+    expect(query).toContain(
+      'The summary is a user-intent summary, not a workflow summary.',
+    );
+    expect(query).toContain(
+      'The session summaries describe user intent, not workflow details.',
+    );
+    expect(query).toContain(
+      'Only write a skill if the evidence shows a durable, recurring workflow',
+    );
+    expect(query).toContain(
+      'If recurrence or future reuse is unclear, create no skill and explain why.',
+    );
+  });
+});
diff --git a/packages/core/src/agents/skill-extraction-agent.ts b/packages/core/src/agents/skill-extraction-agent.ts
index 2678bd206dc..771c94eb2f2 100644
--- a/packages/core/src/agents/skill-extraction-agent.ts
+++ b/packages/core/src/agents/skill-extraction-agent.ts
@@ -36,7 +36,7 @@ function buildSystemPrompt(skillsDir: string): string {
     '- solve similar tasks with fewer tool calls and fewer reasoning tokens',
     '- reuse proven workflows and verification checklists',
     '- avoid known failure modes and landmines',
-    '- anticipate user preferences without being reminded',
+    '- capture durable workflow constraints that future agents are likely to encounter again',
     '',
     '============================================================',
     'SAFETY AND HYGIENE (STRICT)',
@@ -59,6 +59,10 @@ function buildSystemPrompt(skillsDir: string): string {
     '1. "Is this something a competent agent would NOT already know?" If no, STOP.',
     '2. "Does an existing skill (listed below) already cover this?" If yes, STOP.',
     '3. "Can I write a concrete, step-by-step procedure?" If no, STOP.',
+    '4. "Is there strong evidence this will recur for future agents in this repo/workflow?" If no, STOP.',
+    '5. "Is this broader than a single incident (one bug, one ticket, one branch, one date, one exact error)?" If no, STOP.',
+    '',
+    'Default to NO SKILL.',
     '',
     'Do NOT create skills for:',
     '',
@@ -67,6 +71,10 @@ function buildSystemPrompt(skillsDir: string): string {
     '- **Pure Q&A**: The user asked "how does X work?" and got an answer. No procedure.',
     '- **Brainstorming/design**: Discussion of how to build something, without a validated',
     '  implementation that produced a reusable procedure.',
+    '- **Single-session preferences**: User-specific style/output preferences or workflow',
+    '  preferences mentioned only once.',
+    '- **One-off incidents**: Debugging or incident response tied to a single bug, ticket,',
+    '  branch, date, or exact error string.',
     '- **Anything already covered by an existing skill** (global, workspace, builtin, or',
     '  previously extracted). Check the "Existing Skills" section carefully.',
     '',
@@ -74,31 +82,40 @@ function buildSystemPrompt(skillsDir: string): string {
     'WHAT COUNTS AS A SKILL',
     '============================================================',
     '',
-    'A skill MUST meet BOTH of these criteria:',
+    'A skill MUST meet ALL of these criteria:',
     '',
     '1. **Procedural and concrete**: It can be expressed as numbered steps with specific',
     '   commands, paths, or code patterns. If you can only write vague guidance, it is NOT',
     '   a skill. "Be careful with X" is advice, not a skill.',
     '',
-    '2. **Non-obvious and project-specific**: A competent agent would NOT already know this.',
-    '   It encodes project-specific knowledge, non-obvious ordering constraints, or',
-    '   hard-won failure shields that cannot be inferred from the codebase alone.',
+    '2. **Durable and reusable**: Future agents in this repo/workflow are likely to need it',
+    '   again. If it only solved one incident, it is NOT a skill.',
+    '',
+    '3. **Evidence-backed and project-specific**: It encodes project-specific knowledge,',
+    '   repeated operational constraints, or hard-won failure shields supported by session',
+    '   evidence. Do not assume something is non-obvious just because it sounds detailed.',
     '',
-    'Confidence tiers (prefer higher tiers):',
+    'Confidence tiers:',
     '',
-    '**High confidence** — create the skill:',
-    '- The same workflow appeared in multiple sessions (cross-session repetition)',
-    '- A multi-step procedure was validated (tests passed, user confirmed success)',
+    '**High confidence** — create the skill only when recurrence/durability is clear:',
+    '- The same workflow appeared in multiple sessions (cross-session repetition), OR it is',
+    '  a stable recurring repo workflow (for example setup/build/test/deploy/release) with a',
+    '  clear future trigger',
+    '- The workflow was validated (tests passed, user confirmed success, or the same fix',
+    '  worked repeatedly)',
+    '- The skill can be named without referencing a specific incident, bug, branch, or date',
     '',
-    '**Medium confidence** — create the skill if it is clearly project-specific:',
-    '- A project-specific build/test/deploy/release procedure was established',
-    '- A non-obvious ordering constraint or prerequisite was discovered',
-    '- A failure mode was hit and a concrete fix was found and verified',
+    '**Medium confidence** — usually do NOT create the skill yet:',
+    '- A project-specific procedure appeared once and seems useful, but recurrence is not yet',
+    '  clear',
+    '- A verified fix exists, but it is still tied to one incident',
+    '- A user correction changed the approach once, but durability is uncertain',
     '',
     '**Low confidence** — do NOT create the skill:',
     '- A one-off debugging session with no reusable procedure',
     '- Generic workflows any agent could figure out from the codebase',
     '- A code review or investigation with no durable takeaway',
+    '- Output-style preferences that do not materially change procedure',
     '',
     'Aim for 0-2 skills per run. Quality over quantity.',
     '',
@@ -117,8 +134,10 @@ function buildSystemPrompt(skillsDir: string): string {
     '',
     'What to look for:',
     '',
-    '- User corrections: "No, do it this way" -> preference signal',
+    '- User corrections that change procedure in a durable way, especially when repeated',
+    '  across sessions',
     '- Repeated patterns across sessions: same commands, same file paths, same workflow',
+    '- Stable recurring repo lifecycle workflows with clear future triggers',
     '- Failed attempts followed by successful ones -> failure shield',
     '- Multi-step procedures that were validated (tests passed, user confirmed)',
     '- User interruptions: "Stop, you need to X first" -> ordering constraint',
@@ -129,6 +148,8 @@ function buildSystemPrompt(skillsDir: string): string {
     '- Tool outputs that are just data (file contents, search results)',
     '- Speculative plans that were never executed',
     "- Temporary context (current branch name, today's date, specific error IDs)",
+    '- Similar session summaries without matching workflow evidence',
+    '- One-off artifact names: bug IDs, branch names, timestamps, exact incident strings',
     '',
     '============================================================',
     'SKILL FORMAT',
@@ -214,7 +235,10 @@ function buildSystemPrompt(skillsDir: string): string {
     '- Keep scopes distinct. Avoid overlapping "do-everything" skills.',
     '- Every skill MUST have: triggers, procedure, at least one pitfall or verification step.',
     '- If you cannot write a reliable procedure (too many unknowns), do NOT create the skill.',
-    '- Do not create skills for generic advice that any competent agent would already know.',
+    '- If the candidate is tied to one incident or cannot survive renaming the specific',
+    '  bug/ticket, do NOT create it.',
+    '- Do not create skills for generic advice, output-style preferences, or ephemeral',
+    '  choices that any competent agent would already know or adapt to on the fly.',
     '- Prefer fewer, higher-quality skills. 0-2 skills per run is typical. 3+ is unusual.',
     '',
     '============================================================',
@@ -224,17 +248,23 @@ function buildSystemPrompt(skillsDir: string): string {
     `1. Use list_directory on ${skillsDir} to see existing skills.`,
     '2. If skills exist, read their SKILL.md files to understand what is already captured.',
     '3. Scan the session index provided in the query. Look for [NEW] sessions whose summaries',
-    '   suggest workflows that ALSO appear in other sessions (either [NEW] or [old]).',
-    '4. Apply the minimum signal gate. If no repeated patterns are visible, report that and finish.',
+    '   hint at workflows that ALSO appear in other sessions (either [NEW] or [old]) or at a',
+    '   stable recurring repo workflow. Remember: summary similarity alone is NOT enough.',
+    '4. Apply the minimum signal gate. If recurrence or durability is not visible, report that',
+    '   no skill should be created and finish.',
     '5. For promising patterns, use read_file on the session file paths to inspect the full',
-    '   conversation. Confirm the workflow was actually repeated and validated.',
-    '6. For each confirmed skill, verify it meets ALL criteria (repeatable, procedural, high-leverage).',
+    '   conversation. Confirm the workflow was actually repeated and validated. Read at least',
+    '   two sessions unless the candidate is clearly a stable recurring repo lifecycle workflow.',
+    '6. For each candidate, verify it meets ALL criteria. Before writing, make sure you can',
+    '   state: future trigger, evidence sessions, recurrence signal, validation signal, and',
+    '   why it is not generic.',
     '7. Write new SKILL.md files or update existing ones in your directory using write_file.',
     '   For skills that live OUTSIDE your directory, write a .patch file instead (see UPDATING EXISTING SKILLS).',
     '8. Write COMPLETE files — never partially update a SKILL.md.',
     '',
     'IMPORTANT: Do NOT read every session. Only read sessions whose summaries suggest a',
-    'repeated pattern worth investigating. Most runs should read 0-3 sessions and create 0 skills.',
+    'repeated pattern or a stable recurring repo workflow worth investigating. Most runs',
+    'should read 0-3 sessions and create 0 skills.',
     'Do not explore the codebase. Work only with the session index, session files, and the skills directory.',
   ].join('\n');
 }
@@ -301,6 +331,9 @@ export const SkillExtractionAgent = (
         'Below is an index of past conversation sessions. Each line shows:',
         '[NEW] or [old] status, a 1-line summary, message count, and the file path.',
         '',
+        'The summary is a user-intent summary, not a workflow summary.',
+        'Matching summary text alone is never enough evidence for a reusable skill.',
+        '',
         '[NEW] = not yet processed for skill extraction (focus on these)',
         '[old] = previously processed (read only if a [NEW] session hints at a repeated pattern)',
         '',
@@ -319,7 +352,7 @@ export const SkillExtractionAgent = (
 
     return {
       systemPrompt: buildSystemPrompt(skillsDir),
-      query: `${initialContext}\n\nAnalyze the session index above. Read sessions that suggest repeated workflows using read_file. Extract reusable skills to ${skillsDir}/.`,
+      query: `${initialContext}\n\nAnalyze the session index above. The session summaries describe user intent, not workflow details. Read sessions that suggest repeated workflows using read_file. Only write a skill if the evidence shows a durable, recurring workflow or a stable recurring repo procedure. If recurrence or future reuse is unclear, create no skill and explain why.`,
     };
   },
   runConfig: {

From 42a4dcbae95636999d8ee3aa1387d33dbcb80b5f Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Mon, 13 Apr 2026 13:28:15 -0700
Subject: [PATCH 2/6] refactor(evals): use vi.stubEnv for GEMINI_CLI_HOME in
 skill extraction eval

---
 evals/skill_extraction.eval.ts | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts
index 5feaa29a20a..ee4464b7246 100644
--- a/evals/skill_extraction.eval.ts
+++ b/evals/skill_extraction.eval.ts
@@ -8,7 +8,7 @@ import fs from 'node:fs';
 import fsp from 'node:fs/promises';
 import path from 'node:path';
 import { randomUUID } from 'node:crypto';
-import { describe, expect } from 'vitest';
+import { describe, expect, vi } from 'vitest';
 import {
   Storage,
   SESSION_FILE_PREFIX,
@@ -85,20 +85,11 @@ This workspace exists to exercise background skill extraction from prior chats.
 `,
 };
 
-function restoreGeminiHome(previousValue: string | undefined): void {
-  if (previousValue === undefined) {
-    delete process.env['GEMINI_CLI_HOME'];
-  } else {
-    process.env['GEMINI_CLI_HOME'] = previousValue;
-  }
-}
-
 async function withRigStorage<T>(
   rig: TestRig,
   fn: (storage: Storage, projectRoot: string) => Promise<T>,
 ): Promise<T> {
-  const previousGeminiHome = process.env['GEMINI_CLI_HOME'];
-  process.env['GEMINI_CLI_HOME'] = rig.homeDir!;
+  vi.stubEnv('GEMINI_CLI_HOME', rig.homeDir!);
 
   try {
     const projectRoot = fs.realpathSync(rig.testDir!);
@@ -106,7 +97,7 @@ async function withRigStorage<T>(
     await storage.initialize();
     return await fn(storage, projectRoot);
   } finally {
-    restoreGeminiHome(previousGeminiHome);
+    vi.unstubAllEnvs();
   }
 }
 

From 794d04274a16b8fa1f279263c9d517537ba4c5ce Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Mon, 13 Apr 2026 14:25:54 -0700
Subject: [PATCH 3/6] test(evals): add migration workflow extraction eval case

---
 evals/skill_extraction.eval.ts | 72 ++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts
index ee4464b7246..ef32edcaca2 100644
--- a/evals/skill_extraction.eval.ts
+++ b/evals/skill_extraction.eval.ts
@@ -372,4 +372,76 @@ describe('Skill Extraction', () => {
       expect(combinedSkills).toMatch(/Verification/i);
     },
   });
+
+  evalTest('USUALLY_PASSES', {
+    suiteName: 'skill-extraction',
+    suiteType: 'behavioral',
+    name: 'extracts a repeated multi-step migration workflow with ordering constraints',
+    files: WORKSPACE_FILES,
+    timeout: 180000,
+    params: {
+      settings: {
+        experimental: {
+          memoryManager: true,
+        },
+      },
+    },
+    setup: async (rig) => {
+      await seedSessions(rig, [
+        {
+          sessionId: 'db-migration-v12',
+          summary: 'Run database migration for v12 schema update',
+          timestampOffsetMinutes: 420,
+          userTurns: [
+            'Every time we change the database schema we follow a specific migration workflow.',
+            'First run npm run db:check to verify no pending migrations conflict.',
+            'Then run npm run db:migrate to apply the new migration files.',
+            'After migration, always run npm run db:validate to confirm schema integrity.',
+            'If db:validate fails, immediately run npm run db:rollback before anything else.',
+            'Never skip db:check — last time we did, two migrations collided and corrupted the index.',
+            'The ordering is critical: check, migrate, validate. Reversing migrate and validate caused silent data loss before.',
+            'This v12 migration passed after following that exact sequence.',
+            'We use this same three-step workflow every time the schema changes.',
+            'Confirmed: db:check, db:migrate, db:validate completed successfully for v12.',
+          ],
+        },
+        {
+          sessionId: 'db-migration-v13',
+          summary: 'Run database migration for v13 schema update',
+          timestampOffsetMinutes: 360,
+          userTurns: [
+            'New schema change for v13, following the same database migration workflow as before.',
+            'Start with npm run db:check to ensure no conflicting pending migrations.',
+            'Then npm run db:migrate to apply the v13 migration files.',
+            'Then npm run db:validate to confirm the schema is consistent.',
+            'If validation fails, run npm run db:rollback immediately — do not attempt manual fixes.',
+            'We learned the hard way that skipping db:check causes index corruption.',
+            'The check-migrate-validate order is mandatory for every schema change.',
+            'This is the same recurring workflow we used for v12 and earlier migrations.',
+            'The v13 migration passed with the same three-step sequence.',
+            'Confirmed: the standard db migration workflow succeeded again for v13.',
+          ],
+        },
+      ]);
+    },
+    prompt:
+      'Read the local workspace files and summarize this repository in two short sentences.',
+    assert: async (rig, result) => {
+      assertModelHasOutput(result);
+
+      const { state, skillsDir } = await waitForExtractionState(rig);
+      const skillBodies = await readSkillBodies(skillsDir);
+      const combinedSkills = skillBodies.join('\n\n');
+
+      expect(state.runs).toHaveLength(1);
+      expect(state.runs[0].sessionIds).toHaveLength(2);
+      expect(state.runs[0].skillsCreated.length).toBeGreaterThanOrEqual(1);
+      expect(skillBodies.length).toBeGreaterThanOrEqual(1);
+      expect(combinedSkills).toContain('npm run db:check');
+      expect(combinedSkills).toContain('npm run db:migrate');
+      expect(combinedSkills).toContain('npm run db:validate');
+      expect(combinedSkills).toMatch(/rollback/i);
+      expect(combinedSkills).toMatch(/When to Use/i);
+    },
+  });
 });

From ef522bb5d84f62768c70737e56a26c4c856ee0b5 Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Mon, 13 Apr 2026 16:50:52 -0700
Subject: [PATCH 4/6] fix(evals): add return type to buildMessages and simplify
 Config type annotation

---
 evals/skill_extraction.eval.ts | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts
index ef32edcaca2..b7815134156 100644
--- a/evals/skill_extraction.eval.ts
+++ b/evals/skill_extraction.eval.ts
@@ -14,6 +14,7 @@ import {
   SESSION_FILE_PREFIX,
   getProjectHash,
   startMemoryService,
+  Config,
 } from '@google/gemini-cli-core';
 import {
   loadCliConfig,
@@ -33,6 +34,13 @@ interface SeedSession {
   timestampOffsetMinutes: number;
 }
 
+interface MessageRecord {
+  id: string;
+  timestamp: string;
+  type: string;
+  content: Array<{ text: string }>;
+}
+
 const MEMORY_EXTRACTION_ARGV: CliArgs = {
   query: undefined,
   model: undefined,
@@ -101,7 +109,7 @@ async function withRigStorage<T>(
   }
 }
 
-function buildMessages(userTurns: string[]) {
+function buildMessages(userTurns: string[]): MessageRecord[] {
   const baseTime = new Date(Date.now() - 6 * 60 * 60 * 1000).toISOString();
   return userTurns.flatMap((text, index) => [
     {
@@ -160,7 +168,7 @@ async function waitForExtractionState(rig: TestRig): Promise<{
     // The headless CLI eval finishes and exits before its fire-and-forget
     // memory task can complete, so invoke the real memory service directly.
     const previousCwd = process.cwd();
-    let config: Awaited<ReturnType<typeof loadCliConfig>> | undefined;
+    let config: Config | undefined;
 
     process.chdir(projectRoot);
 

From 26987f747f75dcf6d746f52ba0af6983e8cc5744 Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Mon, 13 Apr 2026 20:53:42 -0700
Subject: [PATCH 5/6] refactor(evals): migrate skill extraction to
 componentEvalTest

Replaces the evalTest approach (full CLI subprocess + loadCliConfig) with
componentEvalTest (in-process makeFakeConfig + direct startMemoryService).

Key changes:
- ComponentRig now creates an isolated homeDir and stubs GEMINI_CLI_HOME
  after auth to isolate storage paths (sessions, skills, extraction state).
- ComponentRig.cleanup() calls config.dispose() and vi.unstubAllEnvs().
- Skill extraction evals pass approvalMode: YOLO to auto-approve tool
  calls (write_file/read_file) in non-interactive mode.
- Removes ~100 lines of boilerplate (withRigStorage, waitForExtractionState,
  loadCliConfig, loadSettings, process.chdir).
---
 evals/component-test-helper.ts |  18 ++-
 evals/skill_extraction.eval.ts | 284 ++++++++++-----------------------
 2 files changed, 102 insertions(+), 200 deletions(-)

diff --git a/evals/component-test-helper.ts b/evals/component-test-helper.ts
index 9be68e6936a..097f6e3d05e 100644
--- a/evals/component-test-helper.ts
+++ b/evals/component-test-helper.ts
@@ -16,6 +16,7 @@ import fs from 'node:fs';
 import path from 'node:path';
 import os from 'node:os';
 import { randomUUID } from 'node:crypto';
+import { vi } from 'vitest';
 import {
   Config,
   type ConfigParameters,
@@ -52,6 +53,7 @@ export interface ComponentEvalCase extends BaseEvalCase {
 export class ComponentRig {
   public config: Config | undefined;
   public testDir: string;
+  public homeDir: string;
   public sessionId: string;
 
   constructor(
@@ -61,6 +63,9 @@ export class ComponentRig {
     this.testDir = fs.mkdtempSync(
       path.join(os.tmpdir(), `gemini-component-rig-${uniqueId.slice(0, 8)}-`),
     );
+    this.homeDir = fs.mkdtempSync(
+      path.join(os.tmpdir(), `gemini-component-home-${uniqueId.slice(0, 8)}-`),
+    );
     this.sessionId = `test-session-${uniqueId}`;
   }
 
@@ -89,12 +94,23 @@ export class ComponentRig {
     this.config = makeFakeConfig(configParams);
     await this.config.initialize();
 
-    // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient
+    // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient.
+    // This must happen BEFORE stubbing GEMINI_CLI_HOME because OAuth credential
+    // lookup resolves through homedir() → GEMINI_CLI_HOME.
     await this.config.refreshAuth(AuthType.USE_GEMINI);
+
+    // Isolate storage paths (session files, skills, extraction state) by
+    // pointing GEMINI_CLI_HOME at a per-test temp directory. Storage resolves
+    // global paths through `homedir()`, which reads this env var. The stub is
+    // applied after auth so credential lookup uses the real home directory.
+    vi.stubEnv('GEMINI_CLI_HOME', this.homeDir);
   }
 
   async cleanup() {
+    await this.config?.dispose();
+    vi.unstubAllEnvs();
     fs.rmSync(this.testDir, { recursive: true, force: true });
+    fs.rmSync(this.homeDir, { recursive: true, force: true });
   }
 }
 
diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts
index b7815134156..f22b8696484 100644
--- a/evals/skill_extraction.eval.ts
+++ b/evals/skill_extraction.eval.ts
@@ -4,28 +4,17 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-import fs from 'node:fs';
 import fsp from 'node:fs/promises';
 import path from 'node:path';
-import { randomUUID } from 'node:crypto';
-import { describe, expect, vi } from 'vitest';
+import { describe, expect } from 'vitest';
 import {
-  Storage,
+  type Config,
+  ApprovalMode,
   SESSION_FILE_PREFIX,
   getProjectHash,
   startMemoryService,
-  Config,
 } from '@google/gemini-cli-core';
-import {
-  loadCliConfig,
-  type CliArgs,
-} from '../packages/cli/src/config/config.js';
-import {
-  loadSettings,
-  resetSettingsCacheForTesting,
-} from '../packages/cli/src/config/settings.js';
-import { validateNonInteractiveAuth } from '../packages/cli/src/validateNonInterActiveAuth.js';
-import { evalTest, assertModelHasOutput, type TestRig } from './test-helper.js';
+import { componentEvalTest } from './component-test-helper.js';
 
 interface SeedSession {
   sessionId: string;
@@ -41,38 +30,6 @@ interface MessageRecord {
   content: Array<{ text: string }>;
 }
 
-const MEMORY_EXTRACTION_ARGV: CliArgs = {
-  query: undefined,
-  model: undefined,
-  sandbox: undefined,
-  debug: false,
-  prompt: undefined,
-  promptInteractive: undefined,
-  yolo: true,
-  approvalMode: 'yolo',
-  policy: undefined,
-  adminPolicy: undefined,
-  allowedMcpServerNames: undefined,
-  allowedTools: undefined,
-  acp: false,
-  experimentalAcp: false,
-  extensions: undefined,
-  listExtensions: false,
-  resume: undefined,
-  listSessions: false,
-  deleteSession: undefined,
-  includeDirectories: undefined,
-  screenReader: false,
-  useWriteTodos: undefined,
-  outputFormat: undefined,
-  fakeResponses: undefined,
-  recordResponses: undefined,
-  startupMessages: [],
-  rawOutput: false,
-  acceptRawOutputRisk: false,
-  isCommand: false,
-};
-
 const WORKSPACE_FILES = {
   'package.json': JSON.stringify(
     {
@@ -93,22 +50,6 @@ This workspace exists to exercise background skill extraction from prior chats.
 `,
 };
 
-async function withRigStorage<T>(
-  rig: TestRig,
-  fn: (storage: Storage, projectRoot: string) => Promise<T>,
-): Promise<T> {
-  vi.stubEnv('GEMINI_CLI_HOME', rig.homeDir!);
-
-  try {
-    const projectRoot = fs.realpathSync(rig.testDir!);
-    const storage = new Storage(projectRoot);
-    await storage.initialize();
-    return await fn(storage, projectRoot);
-  } finally {
-    vi.unstubAllEnvs();
-  }
-}
-
 function buildMessages(userTurns: string[]): MessageRecord[] {
   const baseTime = new Date(Date.now() - 6 * 60 * 60 * 1000).toISOString();
   return userTurns.flatMap((text, index) => [
@@ -128,103 +69,67 @@ function buildMessages(userTurns: string[]): MessageRecord[] {
 }
 
 async function seedSessions(
-  rig: TestRig,
+  config: Config,
   sessions: SeedSession[],
 ): Promise<void> {
-  await withRigStorage(rig, async (storage, projectRoot) => {
-    const chatsDir = path.join(storage.getProjectTempDir(), 'chats');
-    await fsp.mkdir(chatsDir, { recursive: true });
+  const chatsDir = path.join(config.storage.getProjectTempDir(), 'chats');
+  await fsp.mkdir(chatsDir, { recursive: true });
 
-    for (const session of sessions) {
-      const timestamp = new Date(
-        Date.now() - session.timestampOffsetMinutes * 60 * 1000,
-      )
-        .toISOString()
-        .slice(0, 16)
-        .replace(/:/g, '-');
-      const filename = `${SESSION_FILE_PREFIX}${timestamp}-${session.sessionId.slice(0, 8)}.json`;
-      const conversation = {
-        sessionId: session.sessionId,
-        projectHash: getProjectHash(projectRoot),
-        summary: session.summary,
-        startTime: new Date(Date.now() - 7 * 60 * 60 * 1000).toISOString(),
-        lastUpdated: new Date(Date.now() - 4 * 60 * 60 * 1000).toISOString(),
-        messages: buildMessages(session.userTurns),
-      };
+  const projectRoot = config.storage.getProjectRoot();
 
-      await fsp.writeFile(
-        path.join(chatsDir, filename),
-        JSON.stringify(conversation, null, 2),
-      );
-    }
-  });
+  for (const session of sessions) {
+    const timestamp = new Date(
+      Date.now() - session.timestampOffsetMinutes * 60 * 1000,
+    )
+      .toISOString()
+      .slice(0, 16)
+      .replace(/:/g, '-');
+    const filename = `${SESSION_FILE_PREFIX}${timestamp}-${session.sessionId.slice(0, 8)}.json`;
+    const conversation = {
+      sessionId: session.sessionId,
+      projectHash: getProjectHash(projectRoot),
+      summary: session.summary,
+      startTime: new Date(Date.now() - 7 * 60 * 60 * 1000).toISOString(),
+      lastUpdated: new Date(Date.now() - 4 * 60 * 60 * 1000).toISOString(),
+      messages: buildMessages(session.userTurns),
+    };
+
+    await fsp.writeFile(
+      path.join(chatsDir, filename),
+      JSON.stringify(conversation, null, 2),
+    );
+  }
 }
 
-async function waitForExtractionState(rig: TestRig): Promise<{
+async function runExtractionAndReadState(config: Config): Promise<{
   state: { runs: Array<{ sessionIds: string[]; skillsCreated: string[] }> };
   skillsDir: string;
 }> {
-  return withRigStorage(rig, async (storage, projectRoot) => {
-    // The headless CLI eval finishes and exits before its fire-and-forget
-    // memory task can complete, so invoke the real memory service directly.
-    const previousCwd = process.cwd();
-    let config: Config | undefined;
-
-    process.chdir(projectRoot);
+  await startMemoryService(config);
 
-    try {
-      resetSettingsCacheForTesting();
-      const settings = loadSettings(projectRoot);
-      config = await loadCliConfig(
-        settings.merged,
-        `skill-extraction-eval-${randomUUID().slice(0, 8)}`,
-        MEMORY_EXTRACTION_ARGV,
-        { cwd: projectRoot },
-      );
-      await config.initialize();
+  const memoryDir = config.storage.getProjectMemoryTempDir();
+  const skillsDir = config.storage.getProjectSkillsMemoryDir();
+  const statePath = path.join(memoryDir, '.extraction-state.json');
 
-      const authType = await validateNonInteractiveAuth(
-        settings.merged.security.auth.selectedType,
-        settings.merged.security.auth.useExternal,
-        config,
-        settings,
-      );
-      await config.refreshAuth(authType);
-      await startMemoryService(config);
-    } finally {
-      process.chdir(previousCwd);
-      resetSettingsCacheForTesting();
-      await config?.dispose();
-    }
-
-    const statePath = path.join(
-      storage.getProjectMemoryTempDir(),
-      '.extraction-state.json',
-    );
-    const skillsDir = storage.getProjectSkillsMemoryDir();
-
-    const raw = await fsp.readFile(statePath, 'utf-8');
-    const state = JSON.parse(raw) as {
-      runs?: Array<{ sessionIds?: string[]; skillsCreated?: string[] }>;
-    };
-    if (!Array.isArray(state.runs) || state.runs.length === 0) {
-      throw new Error(
-        'Skill extraction finished without writing any run state',
-      );
-    }
+  const raw = await fsp.readFile(statePath, 'utf-8');
+  const state = JSON.parse(raw) as {
+    runs?: Array<{ sessionIds?: string[]; skillsCreated?: string[] }>;
+  };
+  if (!Array.isArray(state.runs) || state.runs.length === 0) {
+    throw new Error('Skill extraction finished without writing any run state');
+  }
 
-    return {
-      state: {
-        runs: state.runs.map((run) => ({
-          sessionIds: Array.isArray(run.sessionIds) ? run.sessionIds : [],
-          skillsCreated: Array.isArray(run.skillsCreated)
-            ? run.skillsCreated
-            : [],
-        })),
-      },
-      skillsDir,
-    };
-  });
+  return {
+    state: {
+      runs: state.runs.map((run) => ({
+        sessionIds: Array.isArray(run.sessionIds) ? run.sessionIds : [],
+        skillsCreated: Array.isArray(run.skillsCreated)
+          ? run.skillsCreated
+          : [],
+      })),
+    },
+    skillsDir,
+  };
 }
 
 async function readSkillBodies(skillsDir: string): Promise<string[]> {
@@ -242,22 +147,27 @@ async function readSkillBodies(skillsDir: string): Promise<string[]> {
   }
 }
 
+/**
+ * Shared configOverrides for all skill extraction component evals.
+ * - experimentalMemoryManager: enables the memory extraction pipeline.
+ * - approvalMode: YOLO auto-approves tool calls (write_file, read_file) so the
+ *   background agent can execute without interactive confirmation.
+ */
+const EXTRACTION_CONFIG_OVERRIDES = {
+  experimentalMemoryManager: true,
+  approvalMode: ApprovalMode.YOLO,
+};
+
 describe('Skill Extraction', () => {
-  evalTest('USUALLY_PASSES', {
+  componentEvalTest('USUALLY_PASSES', {
     suiteName: 'skill-extraction',
-    suiteType: 'behavioral',
+    suiteType: 'component',
     name: 'ignores one-off incidents even when session summaries look similar',
     files: WORKSPACE_FILES,
     timeout: 180000,
-    params: {
-      settings: {
-        experimental: {
-          memoryManager: true,
-        },
-      },
-    },
-    setup: async (rig) => {
-      await seedSessions(rig, [
+    configOverrides: EXTRACTION_CONFIG_OVERRIDES,
+    setup: async (config) => {
+      await seedSessions(config, [
         {
           sessionId: 'incident-login-redirect',
           summary: 'Debug login redirect loop in staging',
@@ -294,12 +204,8 @@ describe('Skill Extraction', () => {
         },
       ]);
     },
-    prompt:
-      'Read the local workspace files and summarize this repository in two short sentences.',
-    assert: async (rig, result) => {
-      assertModelHasOutput(result);
-
-      const { state, skillsDir } = await waitForExtractionState(rig);
+    assert: async (config) => {
+      const { state, skillsDir } = await runExtractionAndReadState(config);
       const skillBodies = await readSkillBodies(skillsDir);
 
       expect(state.runs).toHaveLength(1);
@@ -309,21 +215,15 @@ describe('Skill Extraction', () => {
     },
   });
 
-  evalTest('USUALLY_PASSES', {
+  componentEvalTest('USUALLY_PASSES', {
     suiteName: 'skill-extraction',
-    suiteType: 'behavioral',
+    suiteType: 'component',
     name: 'extracts a repeated project-specific workflow into a skill',
     files: WORKSPACE_FILES,
     timeout: 180000,
-    params: {
-      settings: {
-        experimental: {
-          memoryManager: true,
-        },
-      },
-    },
-    setup: async (rig) => {
-      await seedSessions(rig, [
+    configOverrides: EXTRACTION_CONFIG_OVERRIDES,
+    setup: async (config) => {
+      await seedSessions(config, [
         {
           sessionId: 'settings-docs-regen-1',
           summary: 'Update settings docs after adding a config option',
@@ -360,12 +260,8 @@ describe('Skill Extraction', () => {
         },
       ]);
     },
-    prompt:
-      'Read the local workspace files and summarize this repository in two short sentences.',
-    assert: async (rig, result) => {
-      assertModelHasOutput(result);
-
-      const { state, skillsDir } = await waitForExtractionState(rig);
+    assert: async (config) => {
+      const { state, skillsDir } = await runExtractionAndReadState(config);
       const skillBodies = await readSkillBodies(skillsDir);
       const combinedSkills = skillBodies.join('\n\n');
 
@@ -381,21 +277,15 @@ describe('Skill Extraction', () => {
     },
   });
 
-  evalTest('USUALLY_PASSES', {
+  componentEvalTest('USUALLY_PASSES', {
     suiteName: 'skill-extraction',
-    suiteType: 'behavioral',
+    suiteType: 'component',
     name: 'extracts a repeated multi-step migration workflow with ordering constraints',
     files: WORKSPACE_FILES,
     timeout: 180000,
-    params: {
-      settings: {
-        experimental: {
-          memoryManager: true,
-        },
-      },
-    },
-    setup: async (rig) => {
-      await seedSessions(rig, [
+    configOverrides: EXTRACTION_CONFIG_OVERRIDES,
+    setup: async (config) => {
+      await seedSessions(config, [
         {
           sessionId: 'db-migration-v12',
           summary: 'Run database migration for v12 schema update',
@@ -432,12 +322,8 @@ describe('Skill Extraction', () => {
         },
       ]);
     },
-    prompt:
-      'Read the local workspace files and summarize this repository in two short sentences.',
-    assert: async (rig, result) => {
-      assertModelHasOutput(result);
-
-      const { state, skillsDir } = await waitForExtractionState(rig);
+    assert: async (config) => {
+      const { state, skillsDir } = await runExtractionAndReadState(config);
       const skillBodies = await readSkillBodies(skillsDir);
       const combinedSkills = skillBodies.join('\n\n');
 

From ac0418a462879709555502020021525d1144f9fe Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Wed, 15 Apr 2026 11:33:41 -0700
Subject: [PATCH 6/6] fix(evals): correct suiteType from 'component' to
 'component-level'

---
 evals/skill_extraction.eval.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts
index f22b8696484..4149f29a67a 100644
--- a/evals/skill_extraction.eval.ts
+++ b/evals/skill_extraction.eval.ts
@@ -161,7 +161,7 @@ const EXTRACTION_CONFIG_OVERRIDES = {
 describe('Skill Extraction', () => {
   componentEvalTest('USUALLY_PASSES', {
     suiteName: 'skill-extraction',
-    suiteType: 'component',
+    suiteType: 'component-level',
     name: 'ignores one-off incidents even when session summaries look similar',
     files: WORKSPACE_FILES,
     timeout: 180000,
@@ -217,7 +217,7 @@ describe('Skill Extraction', () => {
 
   componentEvalTest('USUALLY_PASSES', {
     suiteName: 'skill-extraction',
-    suiteType: 'component',
+    suiteType: 'component-level',
     name: 'extracts a repeated project-specific workflow into a skill',
     files: WORKSPACE_FILES,
     timeout: 180000,
@@ -279,7 +279,7 @@ describe('Skill Extraction', () => {
 
   componentEvalTest('USUALLY_PASSES', {
     suiteName: 'skill-extraction',
-    suiteType: 'component',
+    suiteType: 'component-level',
     name: 'extracts a repeated multi-step migration workflow with ordering constraints',
     files: WORKSPACE_FILES,
     timeout: 180000,