From a03d564ccb6ba447cba8a1f8f5c03d04f70e5d0c Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Fri, 10 Apr 2026 09:31:29 -0700 Subject: [PATCH 1/6] improve prompt to only extract reusable workflows --- evals/skill_extraction.eval.ts | 384 ++++++++++++++++++ .../src/agents/skill-extraction-agent.test.ts | 90 ++++ .../core/src/agents/skill-extraction-agent.ts | 75 +++- 3 files changed, 528 insertions(+), 21 deletions(-) create mode 100644 evals/skill_extraction.eval.ts create mode 100644 packages/core/src/agents/skill-extraction-agent.test.ts diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts new file mode 100644 index 00000000000..5feaa29a20a --- /dev/null +++ b/evals/skill_extraction.eval.ts @@ -0,0 +1,384 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import fs from 'node:fs'; +import fsp from 'node:fs/promises'; +import path from 'node:path'; +import { randomUUID } from 'node:crypto'; +import { describe, expect } from 'vitest'; +import { + Storage, + SESSION_FILE_PREFIX, + getProjectHash, + startMemoryService, +} from '@google/gemini-cli-core'; +import { + loadCliConfig, + type CliArgs, +} from '../packages/cli/src/config/config.js'; +import { + loadSettings, + resetSettingsCacheForTesting, +} from '../packages/cli/src/config/settings.js'; +import { validateNonInteractiveAuth } from '../packages/cli/src/validateNonInterActiveAuth.js'; +import { evalTest, assertModelHasOutput, type TestRig } from './test-helper.js'; + +interface SeedSession { + sessionId: string; + summary: string; + userTurns: string[]; + timestampOffsetMinutes: number; +} + +const MEMORY_EXTRACTION_ARGV: CliArgs = { + query: undefined, + model: undefined, + sandbox: undefined, + debug: false, + prompt: undefined, + promptInteractive: undefined, + yolo: true, + approvalMode: 'yolo', + policy: undefined, + adminPolicy: undefined, + allowedMcpServerNames: undefined, + allowedTools: undefined, + acp: false, + 
experimentalAcp: false, + extensions: undefined, + listExtensions: false, + resume: undefined, + listSessions: false, + deleteSession: undefined, + includeDirectories: undefined, + screenReader: false, + useWriteTodos: undefined, + outputFormat: undefined, + fakeResponses: undefined, + recordResponses: undefined, + startupMessages: [], + rawOutput: false, + acceptRawOutputRisk: false, + isCommand: false, +}; + +const WORKSPACE_FILES = { + 'package.json': JSON.stringify( + { + name: 'skill-extraction-eval', + private: true, + scripts: { + build: 'echo build', + lint: 'echo lint', + test: 'echo test', + }, + }, + null, + 2, + ), + 'README.md': `# Skill Extraction Eval + +This workspace exists to exercise background skill extraction from prior chats. +`, +}; + +function restoreGeminiHome(previousValue: string | undefined): void { + if (previousValue === undefined) { + delete process.env['GEMINI_CLI_HOME']; + } else { + process.env['GEMINI_CLI_HOME'] = previousValue; + } +} + +async function withRigStorage( + rig: TestRig, + fn: (storage: Storage, projectRoot: string) => Promise, +): Promise { + const previousGeminiHome = process.env['GEMINI_CLI_HOME']; + process.env['GEMINI_CLI_HOME'] = rig.homeDir!; + + try { + const projectRoot = fs.realpathSync(rig.testDir!); + const storage = new Storage(projectRoot); + await storage.initialize(); + return await fn(storage, projectRoot); + } finally { + restoreGeminiHome(previousGeminiHome); + } +} + +function buildMessages(userTurns: string[]) { + const baseTime = new Date(Date.now() - 6 * 60 * 60 * 1000).toISOString(); + return userTurns.flatMap((text, index) => [ + { + id: `u${index + 1}`, + timestamp: baseTime, + type: 'user', + content: [{ text }], + }, + { + id: `a${index + 1}`, + timestamp: baseTime, + type: 'gemini', + content: [{ text: `Acknowledged: ${index + 1}` }], + }, + ]); +} + +async function seedSessions( + rig: TestRig, + sessions: SeedSession[], +): Promise { + await withRigStorage(rig, async (storage, 
projectRoot) => { + const chatsDir = path.join(storage.getProjectTempDir(), 'chats'); + await fsp.mkdir(chatsDir, { recursive: true }); + + for (const session of sessions) { + const timestamp = new Date( + Date.now() - session.timestampOffsetMinutes * 60 * 1000, + ) + .toISOString() + .slice(0, 16) + .replace(/:/g, '-'); + const filename = `${SESSION_FILE_PREFIX}${timestamp}-${session.sessionId.slice(0, 8)}.json`; + const conversation = { + sessionId: session.sessionId, + projectHash: getProjectHash(projectRoot), + summary: session.summary, + startTime: new Date(Date.now() - 7 * 60 * 60 * 1000).toISOString(), + lastUpdated: new Date(Date.now() - 4 * 60 * 60 * 1000).toISOString(), + messages: buildMessages(session.userTurns), + }; + + await fsp.writeFile( + path.join(chatsDir, filename), + JSON.stringify(conversation, null, 2), + ); + } + }); +} + +async function waitForExtractionState(rig: TestRig): Promise<{ + state: { runs: Array<{ sessionIds: string[]; skillsCreated: string[] }> }; + skillsDir: string; +}> { + return withRigStorage(rig, async (storage, projectRoot) => { + // The headless CLI eval finishes and exits before its fire-and-forget + // memory task can complete, so invoke the real memory service directly. 
+ const previousCwd = process.cwd(); + let config: Awaited> | undefined; + + process.chdir(projectRoot); + + try { + resetSettingsCacheForTesting(); + const settings = loadSettings(projectRoot); + config = await loadCliConfig( + settings.merged, + `skill-extraction-eval-${randomUUID().slice(0, 8)}`, + MEMORY_EXTRACTION_ARGV, + { cwd: projectRoot }, + ); + await config.initialize(); + + const authType = await validateNonInteractiveAuth( + settings.merged.security.auth.selectedType, + settings.merged.security.auth.useExternal, + config, + settings, + ); + await config.refreshAuth(authType); + await startMemoryService(config); + } finally { + process.chdir(previousCwd); + resetSettingsCacheForTesting(); + await config?.dispose(); + } + + const statePath = path.join( + storage.getProjectMemoryTempDir(), + '.extraction-state.json', + ); + const skillsDir = storage.getProjectSkillsMemoryDir(); + + const raw = await fsp.readFile(statePath, 'utf-8'); + const state = JSON.parse(raw) as { + runs?: Array<{ sessionIds?: string[]; skillsCreated?: string[] }>; + }; + if (!Array.isArray(state.runs) || state.runs.length === 0) { + throw new Error( + 'Skill extraction finished without writing any run state', + ); + } + + return { + state: { + runs: state.runs.map((run) => ({ + sessionIds: Array.isArray(run.sessionIds) ? run.sessionIds : [], + skillsCreated: Array.isArray(run.skillsCreated) + ? 
run.skillsCreated + : [], + })), + }, + skillsDir, + }; + }); +} + +async function readSkillBodies(skillsDir: string): Promise { + try { + const entries = await fsp.readdir(skillsDir, { withFileTypes: true }); + const skillDirs = entries.filter((entry) => entry.isDirectory()); + const bodies = await Promise.all( + skillDirs.map((entry) => + fsp.readFile(path.join(skillsDir, entry.name, 'SKILL.md'), 'utf-8'), + ), + ); + return bodies; + } catch { + return []; + } +} + +describe('Skill Extraction', () => { + evalTest('USUALLY_PASSES', { + suiteName: 'skill-extraction', + suiteType: 'behavioral', + name: 'ignores one-off incidents even when session summaries look similar', + files: WORKSPACE_FILES, + timeout: 180000, + params: { + settings: { + experimental: { + memoryManager: true, + }, + }, + }, + setup: async (rig) => { + await seedSessions(rig, [ + { + sessionId: 'incident-login-redirect', + summary: 'Debug login redirect loop in staging', + timestampOffsetMinutes: 420, + userTurns: [ + 'We only need a one-off fix for incident INC-4412 on branch hotfix/login-loop.', + 'The exact failing string is ERR_REDIRECT_4412 and this workaround is incident-specific.', + 'Patch packages/auth/src/redirect.ts just for this branch and do not generalize it.', + 'The thing that worked was deleting the stale staging cookie before retrying.', + 'This is not a normal workflow and should not become a reusable instruction.', + 'It only reproduced against the 2026-04-08 staging rollout.', + 'After the cookie clear, the branch-specific redirect logic passed.', + 'Do not turn this incident writeup into a standing process.', + 'Yes, the hotfix worked for this exact redirect-loop incident.', + 'Close out INC-4412 once the staging login succeeds again.', + ], + }, + { + sessionId: 'incident-login-timeout', + summary: 'Debug login callback timeout in staging', + timestampOffsetMinutes: 360, + userTurns: [ + 'This is another one-off staging incident, this time TICKET-991 for callback 
timeout.', + 'The exact failing string is ERR_CALLBACK_TIMEOUT_991 and it is unrelated to the redirect loop.', + 'The temporary fix was rotating the staging secret and deleting a bad feature-flag row.', + 'Do not write a generic login-debugging playbook from this.', + 'This only applied to the callback timeout during the April rollout.', + 'The successful fix was specific to the stale secret in staging.', + 'It does not define a durable repo workflow for future tasks.', + 'After rotating the secret, the callback timeout stopped reproducing.', + 'Treat this as incident response only, not a reusable skill.', + 'Once staging passed again, we closed TICKET-991.', + ], + }, + ]); + }, + prompt: + 'Read the local workspace files and summarize this repository in two short sentences.', + assert: async (rig, result) => { + assertModelHasOutput(result); + + const { state, skillsDir } = await waitForExtractionState(rig); + const skillBodies = await readSkillBodies(skillsDir); + + expect(state.runs).toHaveLength(1); + expect(state.runs[0].sessionIds).toHaveLength(2); + expect(state.runs[0].skillsCreated).toEqual([]); + expect(skillBodies).toEqual([]); + }, + }); + + evalTest('USUALLY_PASSES', { + suiteName: 'skill-extraction', + suiteType: 'behavioral', + name: 'extracts a repeated project-specific workflow into a skill', + files: WORKSPACE_FILES, + timeout: 180000, + params: { + settings: { + experimental: { + memoryManager: true, + }, + }, + }, + setup: async (rig) => { + await seedSessions(rig, [ + { + sessionId: 'settings-docs-regen-1', + summary: 'Update settings docs after adding a config option', + timestampOffsetMinutes: 420, + userTurns: [ + 'When we add a new config option, we have to regenerate the settings docs in a specific order.', + 'The sequence that worked was npm run predocs:settings, npm run schema:settings, then npm run docs:settings.', + 'Do not hand-edit generated settings docs.', + 'If predocs is skipped, the generated schema docs miss the new 
defaults.', + 'Update the source first, then run that generation sequence.', + 'After regenerating, verify the schema output and docs changed together.', + 'We used this same sequence the last time we touched settings docs.', + 'That ordered workflow passed and produced the expected generated files.', + 'Please keep the exact command order because reversing it breaks the output.', + 'Yes, the generated settings docs were correct after those three commands.', + ], + }, + { + sessionId: 'settings-docs-regen-2', + summary: 'Regenerate settings schema docs for another new setting', + timestampOffsetMinutes: 360, + userTurns: [ + 'We are touching another setting, so follow the same settings-doc regeneration workflow again.', + 'Run npm run predocs:settings before npm run schema:settings and npm run docs:settings.', + 'The project keeps generated settings docs in sync through those commands, not manual edits.', + 'Skipping predocs caused stale defaults in the generated output before.', + 'Change the source, then execute the same three commands in order.', + 'Verify both the schema artifact and docs update together after regeneration.', + 'This is the recurring workflow we use whenever a setting changes.', + 'The exact order worked again on this second settings update.', + 'Please preserve that ordering constraint for future settings changes.', + 'Confirmed: the settings docs regenerated correctly with the same command sequence.', + ], + }, + ]); + }, + prompt: + 'Read the local workspace files and summarize this repository in two short sentences.', + assert: async (rig, result) => { + assertModelHasOutput(result); + + const { state, skillsDir } = await waitForExtractionState(rig); + const skillBodies = await readSkillBodies(skillsDir); + const combinedSkills = skillBodies.join('\n\n'); + + expect(state.runs).toHaveLength(1); + expect(state.runs[0].sessionIds).toHaveLength(2); + expect(state.runs[0].skillsCreated.length).toBeGreaterThanOrEqual(1); + 
expect(skillBodies.length).toBeGreaterThanOrEqual(1); + expect(combinedSkills).toContain('npm run predocs:settings'); + expect(combinedSkills).toContain('npm run schema:settings'); + expect(combinedSkills).toContain('npm run docs:settings'); + expect(combinedSkills).toMatch(/When to Use/i); + expect(combinedSkills).toMatch(/Verification/i); + }, + }); +}); diff --git a/packages/core/src/agents/skill-extraction-agent.test.ts b/packages/core/src/agents/skill-extraction-agent.test.ts new file mode 100644 index 00000000000..a67c7db270e --- /dev/null +++ b/packages/core/src/agents/skill-extraction-agent.test.ts @@ -0,0 +1,90 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect, it } from 'vitest'; +import { SkillExtractionAgent } from './skill-extraction-agent.js'; +import { + EDIT_TOOL_NAME, + GLOB_TOOL_NAME, + GREP_TOOL_NAME, + LS_TOOL_NAME, + READ_FILE_TOOL_NAME, + WRITE_FILE_TOOL_NAME, +} from '../tools/tool-names.js'; +import { PREVIEW_GEMINI_FLASH_MODEL } from '../config/models.js'; + +describe('SkillExtractionAgent', () => { + const skillsDir = '/tmp/skills'; + const sessionIndex = + '[NEW] Debug login flow (12 user msgs) — /tmp/chats/session-1.json'; + const existingSkillsSummary = + '## Workspace Skills (.gemini/skills — do NOT duplicate)\n- **existing-skill**: Existing description'; + + const agent = SkillExtractionAgent( + skillsDir, + sessionIndex, + existingSkillsSummary, + ); + + it('should expose expected metadata, model, and tools', () => { + expect(agent.kind).toBe('local'); + expect(agent.name).toBe('confucius'); + expect(agent.displayName).toBe('Skill Extractor'); + expect(agent.modelConfig.model).toBe(PREVIEW_GEMINI_FLASH_MODEL); + expect(agent.toolConfig?.tools).toEqual( + expect.arrayContaining([ + READ_FILE_TOOL_NAME, + WRITE_FILE_TOOL_NAME, + EDIT_TOOL_NAME, + LS_TOOL_NAME, + GLOB_TOOL_NAME, + GREP_TOOL_NAME, + ]), + ); + }); + + it('should default to no skill unless 
recurrence and durability are proven', () => { + const prompt = agent.promptConfig.systemPrompt; + + expect(prompt).toContain('Default to NO SKILL.'); + expect(prompt).toContain( + 'strong evidence this will recur for future agents in this repo/workflow', + ); + expect(prompt).toContain('broader than a single incident'); + expect(prompt).toContain('A skill MUST meet ALL of these criteria:'); + expect(prompt).toContain( + 'Future agents in this repo/workflow are likely to need it', + ); + }); + + it('should explicitly reject one-off incidents and single-session preferences', () => { + const prompt = agent.promptConfig.systemPrompt; + + expect(prompt).toContain('Single-session preferences'); + expect(prompt).toContain('One-off incidents'); + expect(prompt).toContain('Output-style preferences'); + expect(prompt).toContain('cannot survive renaming the specific'); + }); + + it('should warn that session summaries are user-intent summaries, not workflow evidence', () => { + const query = agent.promptConfig.query ?? 
''; + + expect(query).toContain(existingSkillsSummary); + expect(query).toContain(sessionIndex); + expect(query).toContain( + 'The summary is a user-intent summary, not a workflow summary.', + ); + expect(query).toContain( + 'The session summaries describe user intent, not workflow details.', + ); + expect(query).toContain( + 'Only write a skill if the evidence shows a durable, recurring workflow', + ); + expect(query).toContain( + 'If recurrence or future reuse is unclear, create no skill and explain why.', + ); + }); +}); diff --git a/packages/core/src/agents/skill-extraction-agent.ts b/packages/core/src/agents/skill-extraction-agent.ts index 2678bd206dc..771c94eb2f2 100644 --- a/packages/core/src/agents/skill-extraction-agent.ts +++ b/packages/core/src/agents/skill-extraction-agent.ts @@ -36,7 +36,7 @@ function buildSystemPrompt(skillsDir: string): string { '- solve similar tasks with fewer tool calls and fewer reasoning tokens', '- reuse proven workflows and verification checklists', '- avoid known failure modes and landmines', - '- anticipate user preferences without being reminded', + '- capture durable workflow constraints that future agents are likely to encounter again', '', '============================================================', 'SAFETY AND HYGIENE (STRICT)', @@ -59,6 +59,10 @@ function buildSystemPrompt(skillsDir: string): string { '1. "Is this something a competent agent would NOT already know?" If no, STOP.', '2. "Does an existing skill (listed below) already cover this?" If yes, STOP.', '3. "Can I write a concrete, step-by-step procedure?" If no, STOP.', + '4. "Is there strong evidence this will recur for future agents in this repo/workflow?" If no, STOP.', + '5. "Is this broader than a single incident (one bug, one ticket, one branch, one date, one exact error)?" 
If no, STOP.', + '', + 'Default to NO SKILL.', '', 'Do NOT create skills for:', '', @@ -67,6 +71,10 @@ function buildSystemPrompt(skillsDir: string): string { '- **Pure Q&A**: The user asked "how does X work?" and got an answer. No procedure.', '- **Brainstorming/design**: Discussion of how to build something, without a validated', ' implementation that produced a reusable procedure.', + '- **Single-session preferences**: User-specific style/output preferences or workflow', + ' preferences mentioned only once.', + '- **One-off incidents**: Debugging or incident response tied to a single bug, ticket,', + ' branch, date, or exact error string.', '- **Anything already covered by an existing skill** (global, workspace, builtin, or', ' previously extracted). Check the "Existing Skills" section carefully.', '', @@ -74,31 +82,40 @@ function buildSystemPrompt(skillsDir: string): string { 'WHAT COUNTS AS A SKILL', '============================================================', '', - 'A skill MUST meet BOTH of these criteria:', + 'A skill MUST meet ALL of these criteria:', '', '1. **Procedural and concrete**: It can be expressed as numbered steps with specific', ' commands, paths, or code patterns. If you can only write vague guidance, it is NOT', ' a skill. "Be careful with X" is advice, not a skill.', '', - '2. **Non-obvious and project-specific**: A competent agent would NOT already know this.', - ' It encodes project-specific knowledge, non-obvious ordering constraints, or', - ' hard-won failure shields that cannot be inferred from the codebase alone.', + '2. **Durable and reusable**: Future agents in this repo/workflow are likely to need it', + ' again. If it only solved one incident, it is NOT a skill.', + '', + '3. **Evidence-backed and project-specific**: It encodes project-specific knowledge,', + ' repeated operational constraints, or hard-won failure shields supported by session', + ' evidence. 
Do not assume something is non-obvious just because it sounds detailed.', '', - 'Confidence tiers (prefer higher tiers):', + 'Confidence tiers:', '', - '**High confidence** — create the skill:', - '- The same workflow appeared in multiple sessions (cross-session repetition)', - '- A multi-step procedure was validated (tests passed, user confirmed success)', + '**High confidence** — create the skill only when recurrence/durability is clear:', + '- The same workflow appeared in multiple sessions (cross-session repetition), OR it is', + ' a stable recurring repo workflow (for example setup/build/test/deploy/release) with a', + ' clear future trigger', + '- The workflow was validated (tests passed, user confirmed success, or the same fix', + ' worked repeatedly)', + '- The skill can be named without referencing a specific incident, bug, branch, or date', '', - '**Medium confidence** — create the skill if it is clearly project-specific:', - '- A project-specific build/test/deploy/release procedure was established', - '- A non-obvious ordering constraint or prerequisite was discovered', - '- A failure mode was hit and a concrete fix was found and verified', + '**Medium confidence** — usually do NOT create the skill yet:', + '- A project-specific procedure appeared once and seems useful, but recurrence is not yet', + ' clear', + '- A verified fix exists, but it is still tied to one incident', + '- A user correction changed the approach once, but durability is uncertain', '', '**Low confidence** — do NOT create the skill:', '- A one-off debugging session with no reusable procedure', '- Generic workflows any agent could figure out from the codebase', '- A code review or investigation with no durable takeaway', + '- Output-style preferences that do not materially change procedure', '', 'Aim for 0-2 skills per run. 
Quality over quantity.', '', @@ -117,8 +134,10 @@ function buildSystemPrompt(skillsDir: string): string { '', 'What to look for:', '', - '- User corrections: "No, do it this way" -> preference signal', + '- User corrections that change procedure in a durable way, especially when repeated', + ' across sessions', '- Repeated patterns across sessions: same commands, same file paths, same workflow', + '- Stable recurring repo lifecycle workflows with clear future triggers', '- Failed attempts followed by successful ones -> failure shield', '- Multi-step procedures that were validated (tests passed, user confirmed)', '- User interruptions: "Stop, you need to X first" -> ordering constraint', @@ -129,6 +148,8 @@ function buildSystemPrompt(skillsDir: string): string { '- Tool outputs that are just data (file contents, search results)', '- Speculative plans that were never executed', "- Temporary context (current branch name, today's date, specific error IDs)", + '- Similar session summaries without matching workflow evidence', + '- One-off artifact names: bug IDs, branch names, timestamps, exact incident strings', '', '============================================================', 'SKILL FORMAT', @@ -214,7 +235,10 @@ function buildSystemPrompt(skillsDir: string): string { '- Keep scopes distinct. Avoid overlapping "do-everything" skills.', '- Every skill MUST have: triggers, procedure, at least one pitfall or verification step.', '- If you cannot write a reliable procedure (too many unknowns), do NOT create the skill.', - '- Do not create skills for generic advice that any competent agent would already know.', + '- If the candidate is tied to one incident or cannot survive renaming the specific', + ' bug/ticket, do NOT create it.', + '- Do not create skills for generic advice, output-style preferences, or ephemeral', + ' choices that any competent agent would already know or adapt to on the fly.', '- Prefer fewer, higher-quality skills. 0-2 skills per run is typical. 
3+ is unusual.', '', '============================================================', @@ -224,17 +248,23 @@ function buildSystemPrompt(skillsDir: string): string { `1. Use list_directory on ${skillsDir} to see existing skills.`, '2. If skills exist, read their SKILL.md files to understand what is already captured.', '3. Scan the session index provided in the query. Look for [NEW] sessions whose summaries', - ' suggest workflows that ALSO appear in other sessions (either [NEW] or [old]).', - '4. Apply the minimum signal gate. If no repeated patterns are visible, report that and finish.', + ' hint at workflows that ALSO appear in other sessions (either [NEW] or [old]) or at a', + ' stable recurring repo workflow. Remember: summary similarity alone is NOT enough.', + '4. Apply the minimum signal gate. If recurrence or durability is not visible, report that', + ' no skill should be created and finish.', '5. For promising patterns, use read_file on the session file paths to inspect the full', - ' conversation. Confirm the workflow was actually repeated and validated.', - '6. For each confirmed skill, verify it meets ALL criteria (repeatable, procedural, high-leverage).', + ' conversation. Confirm the workflow was actually repeated and validated. Read at least', + ' two sessions unless the candidate is clearly a stable recurring repo lifecycle workflow.', + '6. For each candidate, verify it meets ALL criteria. Before writing, make sure you can', + ' state: future trigger, evidence sessions, recurrence signal, validation signal, and', + ' why it is not generic.', '7. Write new SKILL.md files or update existing ones in your directory using write_file.', ' For skills that live OUTSIDE your directory, write a .patch file instead (see UPDATING EXISTING SKILLS).', '8. Write COMPLETE files — never partially update a SKILL.md.', '', 'IMPORTANT: Do NOT read every session. Only read sessions whose summaries suggest a', - 'repeated pattern worth investigating. 
Most runs should read 0-3 sessions and create 0 skills.', + 'repeated pattern or a stable recurring repo workflow worth investigating. Most runs', + 'should read 0-3 sessions and create 0 skills.', 'Do not explore the codebase. Work only with the session index, session files, and the skills directory.', ].join('\n'); } @@ -301,6 +331,9 @@ export const SkillExtractionAgent = ( 'Below is an index of past conversation sessions. Each line shows:', '[NEW] or [old] status, a 1-line summary, message count, and the file path.', '', + 'The summary is a user-intent summary, not a workflow summary.', + 'Matching summary text alone is never enough evidence for a reusable skill.', + '', '[NEW] = not yet processed for skill extraction (focus on these)', '[old] = previously processed (read only if a [NEW] session hints at a repeated pattern)', '', @@ -319,7 +352,7 @@ export const SkillExtractionAgent = ( return { systemPrompt: buildSystemPrompt(skillsDir), - query: `${initialContext}\n\nAnalyze the session index above. Read sessions that suggest repeated workflows using read_file. Extract reusable skills to ${skillsDir}/.`, + query: `${initialContext}\n\nAnalyze the session index above. The session summaries describe user intent, not workflow details. Read sessions that suggest repeated workflows using read_file. Only write a skill if the evidence shows a durable, recurring workflow or a stable recurring repo procedure. 
If recurrence or future reuse is unclear, create no skill and explain why.`, }; }, runConfig: { From 42a4dcbae95636999d8ee3aa1387d33dbcb80b5f Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Mon, 13 Apr 2026 13:28:15 -0700 Subject: [PATCH 2/6] refactor(evals): use vi.stubEnv for GEMINI_CLI_HOME in skill extraction eval --- evals/skill_extraction.eval.ts | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts index 5feaa29a20a..ee4464b7246 100644 --- a/evals/skill_extraction.eval.ts +++ b/evals/skill_extraction.eval.ts @@ -8,7 +8,7 @@ import fs from 'node:fs'; import fsp from 'node:fs/promises'; import path from 'node:path'; import { randomUUID } from 'node:crypto'; -import { describe, expect } from 'vitest'; +import { describe, expect, vi } from 'vitest'; import { Storage, SESSION_FILE_PREFIX, @@ -85,20 +85,11 @@ This workspace exists to exercise background skill extraction from prior chats. `, }; -function restoreGeminiHome(previousValue: string | undefined): void { - if (previousValue === undefined) { - delete process.env['GEMINI_CLI_HOME']; - } else { - process.env['GEMINI_CLI_HOME'] = previousValue; - } -} - async function withRigStorage( rig: TestRig, fn: (storage: Storage, projectRoot: string) => Promise, ): Promise { - const previousGeminiHome = process.env['GEMINI_CLI_HOME']; - process.env['GEMINI_CLI_HOME'] = rig.homeDir!; + vi.stubEnv('GEMINI_CLI_HOME', rig.homeDir!); try { const projectRoot = fs.realpathSync(rig.testDir!); @@ -106,7 +97,7 @@ async function withRigStorage( await storage.initialize(); return await fn(storage, projectRoot); } finally { - restoreGeminiHome(previousGeminiHome); + vi.unstubAllEnvs(); } } From 794d04274a16b8fa1f279263c9d517537ba4c5ce Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Mon, 13 Apr 2026 14:25:54 -0700 Subject: [PATCH 3/6] test(evals): add migration workflow extraction eval case --- evals/skill_extraction.eval.ts | 72 
++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts index ee4464b7246..ef32edcaca2 100644 --- a/evals/skill_extraction.eval.ts +++ b/evals/skill_extraction.eval.ts @@ -372,4 +372,76 @@ describe('Skill Extraction', () => { expect(combinedSkills).toMatch(/Verification/i); }, }); + + evalTest('USUALLY_PASSES', { + suiteName: 'skill-extraction', + suiteType: 'behavioral', + name: 'extracts a repeated multi-step migration workflow with ordering constraints', + files: WORKSPACE_FILES, + timeout: 180000, + params: { + settings: { + experimental: { + memoryManager: true, + }, + }, + }, + setup: async (rig) => { + await seedSessions(rig, [ + { + sessionId: 'db-migration-v12', + summary: 'Run database migration for v12 schema update', + timestampOffsetMinutes: 420, + userTurns: [ + 'Every time we change the database schema we follow a specific migration workflow.', + 'First run npm run db:check to verify no pending migrations conflict.', + 'Then run npm run db:migrate to apply the new migration files.', + 'After migration, always run npm run db:validate to confirm schema integrity.', + 'If db:validate fails, immediately run npm run db:rollback before anything else.', + 'Never skip db:check — last time we did, two migrations collided and corrupted the index.', + 'The ordering is critical: check, migrate, validate. 
Reversing migrate and validate caused silent data loss before.', + 'This v12 migration passed after following that exact sequence.', + 'We use this same three-step workflow every time the schema changes.', + 'Confirmed: db:check, db:migrate, db:validate completed successfully for v12.', + ], + }, + { + sessionId: 'db-migration-v13', + summary: 'Run database migration for v13 schema update', + timestampOffsetMinutes: 360, + userTurns: [ + 'New schema change for v13, following the same database migration workflow as before.', + 'Start with npm run db:check to ensure no conflicting pending migrations.', + 'Then npm run db:migrate to apply the v13 migration files.', + 'Then npm run db:validate to confirm the schema is consistent.', + 'If validation fails, run npm run db:rollback immediately — do not attempt manual fixes.', + 'We learned the hard way that skipping db:check causes index corruption.', + 'The check-migrate-validate order is mandatory for every schema change.', + 'This is the same recurring workflow we used for v12 and earlier migrations.', + 'The v13 migration passed with the same three-step sequence.', + 'Confirmed: the standard db migration workflow succeeded again for v13.', + ], + }, + ]); + }, + prompt: + 'Read the local workspace files and summarize this repository in two short sentences.', + assert: async (rig, result) => { + assertModelHasOutput(result); + + const { state, skillsDir } = await waitForExtractionState(rig); + const skillBodies = await readSkillBodies(skillsDir); + const combinedSkills = skillBodies.join('\n\n'); + + expect(state.runs).toHaveLength(1); + expect(state.runs[0].sessionIds).toHaveLength(2); + expect(state.runs[0].skillsCreated.length).toBeGreaterThanOrEqual(1); + expect(skillBodies.length).toBeGreaterThanOrEqual(1); + expect(combinedSkills).toContain('npm run db:check'); + expect(combinedSkills).toContain('npm run db:migrate'); + expect(combinedSkills).toContain('npm run db:validate'); + 
expect(combinedSkills).toMatch(/rollback/i); + expect(combinedSkills).toMatch(/When to Use/i); + }, + }); }); From ef522bb5d84f62768c70737e56a26c4c856ee0b5 Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Mon, 13 Apr 2026 16:50:52 -0700 Subject: [PATCH 4/6] fix(evals): add return type to buildMessages and simplify Config type annotation --- evals/skill_extraction.eval.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts index ef32edcaca2..b7815134156 100644 --- a/evals/skill_extraction.eval.ts +++ b/evals/skill_extraction.eval.ts @@ -14,6 +14,7 @@ import { SESSION_FILE_PREFIX, getProjectHash, startMemoryService, + Config, } from '@google/gemini-cli-core'; import { loadCliConfig, @@ -33,6 +34,13 @@ interface SeedSession { timestampOffsetMinutes: number; } +interface MessageRecord { + id: string; + timestamp: string; + type: string; + content: Array<{ text: string }>; +} + const MEMORY_EXTRACTION_ARGV: CliArgs = { query: undefined, model: undefined, @@ -101,7 +109,7 @@ async function withRigStorage( } } -function buildMessages(userTurns: string[]) { +function buildMessages(userTurns: string[]): MessageRecord[] { const baseTime = new Date(Date.now() - 6 * 60 * 60 * 1000).toISOString(); return userTurns.flatMap((text, index) => [ { @@ -160,7 +168,7 @@ async function waitForExtractionState(rig: TestRig): Promise<{ // The headless CLI eval finishes and exits before its fire-and-forget // memory task can complete, so invoke the real memory service directly. 
const previousCwd = process.cwd(); - let config: Awaited<ReturnType<typeof loadCliConfig>> | undefined; + let config: Config | undefined; process.chdir(projectRoot); From 26987f747f75dcf6d746f52ba0af6983e8cc5744 Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Mon, 13 Apr 2026 20:53:42 -0700 Subject: [PATCH 5/6] refactor(evals): migrate skill extraction to componentEvalTest Replaces the evalTest approach (full CLI subprocess + loadCliConfig) with componentEvalTest (in-process makeFakeConfig + direct startMemoryService). Key changes: - ComponentRig now creates an isolated homeDir and stubs GEMINI_CLI_HOME after auth to isolate storage paths (sessions, skills, extraction state). - ComponentRig.cleanup() calls config.dispose() and vi.unstubAllEnvs(). - Skill extraction evals pass approvalMode: YOLO to auto-approve tool calls (write_file/read_file) in non-interactive mode. - Removes ~100 lines of boilerplate (withRigStorage, waitForExtractionState, loadCliConfig, loadSettings, process.chdir). --- evals/component-test-helper.ts | 18 ++- evals/skill_extraction.eval.ts | 284 ++++++++++----------------------- 2 files changed, 102 insertions(+), 200 deletions(-) diff --git a/evals/component-test-helper.ts b/evals/component-test-helper.ts index 9be68e6936a..097f6e3d05e 100644 --- a/evals/component-test-helper.ts +++ b/evals/component-test-helper.ts @@ -16,6 +16,7 @@ import fs from 'node:fs'; import path from 'node:path'; import os from 'node:os'; import { randomUUID } from 'node:crypto'; +import { vi } from 'vitest'; import { Config, type ConfigParameters, @@ -52,6 +53,7 @@ export interface ComponentEvalCase extends BaseEvalCase { export class ComponentRig { public config: Config | undefined; public testDir: string; + public homeDir: string; public sessionId: string; constructor( @@ -61,6 +63,9 @@ export class ComponentRig { this.testDir = fs.mkdtempSync( path.join(os.tmpdir(), `gemini-component-rig-${uniqueId.slice(0, 8)}-`), ); + this.homeDir = fs.mkdtempSync( + path.join(os.tmpdir(),
`gemini-component-home-${uniqueId.slice(0, 8)}-`), + ); this.sessionId = `test-session-${uniqueId}`; } @@ -89,12 +94,23 @@ export class ComponentRig { this.config = makeFakeConfig(configParams); await this.config.initialize(); - // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient + // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient. + // This must happen BEFORE stubbing GEMINI_CLI_HOME because OAuth credential + // lookup resolves through homedir() → GEMINI_CLI_HOME. await this.config.refreshAuth(AuthType.USE_GEMINI); + + // Isolate storage paths (session files, skills, extraction state) by + // pointing GEMINI_CLI_HOME at a per-test temp directory. Storage resolves + // global paths through `homedir()` which reads this env var. This is set + // after auth so credential lookup uses the real home directory. + vi.stubEnv('GEMINI_CLI_HOME', this.homeDir); } async cleanup() { + await this.config?.dispose(); + vi.unstubAllEnvs(); fs.rmSync(this.testDir, { recursive: true, force: true }); + fs.rmSync(this.homeDir, { recursive: true, force: true }); } } diff --git a/evals/skill_extraction.eval.ts b/evals/skill_extraction.eval.ts index b7815134156..f22b8696484 100644 --- a/evals/skill_extraction.eval.ts +++ b/evals/skill_extraction.eval.ts @@ -4,28 +4,17 @@ * SPDX-License-Identifier: Apache-2.0 */ -import fs from 'node:fs'; import fsp from 'node:fs/promises'; import path from 'node:path'; -import { randomUUID } from 'node:crypto'; -import { describe, expect, vi } from 'vitest'; +import { describe, expect } from 'vitest'; import { - Storage, + type Config, + ApprovalMode, SESSION_FILE_PREFIX, getProjectHash, startMemoryService, - Config, } from '@google/gemini-cli-core'; -import { - loadCliConfig, - type CliArgs, -} from '../packages/cli/src/config/config.js'; -import { - loadSettings, - resetSettingsCacheForTesting, -} from '../packages/cli/src/config/settings.js'; -import { validateNonInteractiveAuth } from 
'../packages/cli/src/validateNonInterActiveAuth.js'; -import { evalTest, assertModelHasOutput, type TestRig } from './test-helper.js'; +import { componentEvalTest } from './component-test-helper.js'; interface SeedSession { sessionId: string; @@ -41,38 +30,6 @@ interface MessageRecord { content: Array<{ text: string }>; } -const MEMORY_EXTRACTION_ARGV: CliArgs = { - query: undefined, - model: undefined, - sandbox: undefined, - debug: false, - prompt: undefined, - promptInteractive: undefined, - yolo: true, - approvalMode: 'yolo', - policy: undefined, - adminPolicy: undefined, - allowedMcpServerNames: undefined, - allowedTools: undefined, - acp: false, - experimentalAcp: false, - extensions: undefined, - listExtensions: false, - resume: undefined, - listSessions: false, - deleteSession: undefined, - includeDirectories: undefined, - screenReader: false, - useWriteTodos: undefined, - outputFormat: undefined, - fakeResponses: undefined, - recordResponses: undefined, - startupMessages: [], - rawOutput: false, - acceptRawOutputRisk: false, - isCommand: false, -}; - const WORKSPACE_FILES = { 'package.json': JSON.stringify( { @@ -93,22 +50,6 @@ This workspace exists to exercise background skill extraction from prior chats. 
`, }; -async function withRigStorage<T>( - rig: TestRig, - fn: (storage: Storage, projectRoot: string) => Promise<T>, -): Promise<T> { - vi.stubEnv('GEMINI_CLI_HOME', rig.homeDir!); - - try { - const projectRoot = fs.realpathSync(rig.testDir!); - const storage = new Storage(projectRoot); - await storage.initialize(); - return await fn(storage, projectRoot); - } finally { - vi.unstubAllEnvs(); - } -} - function buildMessages(userTurns: string[]): MessageRecord[] { const baseTime = new Date(Date.now() - 6 * 60 * 60 * 1000).toISOString(); return userTurns.flatMap((text, index) => [ @@ -128,103 +69,67 @@ function buildMessages(userTurns: string[]): MessageRecord[] { } async function seedSessions( - rig: TestRig, + config: Config, sessions: SeedSession[], ): Promise<void> { - await withRigStorage(rig, async (storage, projectRoot) => { - const chatsDir = path.join(storage.getProjectTempDir(), 'chats'); - await fsp.mkdir(chatsDir, { recursive: true }); + const chatsDir = path.join(config.storage.getProjectTempDir(), 'chats'); + await fsp.mkdir(chatsDir, { recursive: true }); - for (const session of sessions) { - const timestamp = new Date( - Date.now() - session.timestampOffsetMinutes * 60 * 1000, - ) - .toISOString() - .slice(0, 16) - .replace(/:/g, '-'); - const filename = `${SESSION_FILE_PREFIX}${timestamp}-${session.sessionId.slice(0, 8)}.json`; - const conversation = { - sessionId: session.sessionId, - projectHash: getProjectHash(projectRoot), - summary: session.summary, - startTime: new Date(Date.now() - 7 * 60 * 60 * 1000).toISOString(), - lastUpdated: new Date(Date.now() - 4 * 60 * 60 * 1000).toISOString(), - messages: buildMessages(session.userTurns), - }; + const projectRoot = config.storage.getProjectRoot(); - await fsp.writeFile( - path.join(chatsDir, filename), - JSON.stringify(conversation, null, 2), - ); - } - }); + for (const session of sessions) { + const timestamp = new Date( + Date.now() - session.timestampOffsetMinutes * 60 * 1000, + ) + .toISOString() + .slice(0,
16) + .replace(/:/g, '-'); + const filename = `${SESSION_FILE_PREFIX}${timestamp}-${session.sessionId.slice(0, 8)}.json`; + const conversation = { + sessionId: session.sessionId, + projectHash: getProjectHash(projectRoot), + summary: session.summary, + startTime: new Date(Date.now() - 7 * 60 * 60 * 1000).toISOString(), + lastUpdated: new Date(Date.now() - 4 * 60 * 60 * 1000).toISOString(), + messages: buildMessages(session.userTurns), + }; + + await fsp.writeFile( + path.join(chatsDir, filename), + JSON.stringify(conversation, null, 2), + ); + } } -async function waitForExtractionState(rig: TestRig): Promise<{ +async function runExtractionAndReadState(config: Config): Promise<{ state: { runs: Array<{ sessionIds: string[]; skillsCreated: string[] }> }; skillsDir: string; }> { - return withRigStorage(rig, async (storage, projectRoot) => { - // The headless CLI eval finishes and exits before its fire-and-forget - // memory task can complete, so invoke the real memory service directly. - const previousCwd = process.cwd(); - let config: Config | undefined; - - process.chdir(projectRoot); + await startMemoryService(config); - try { - resetSettingsCacheForTesting(); - const settings = loadSettings(projectRoot); - config = await loadCliConfig( - settings.merged, - `skill-extraction-eval-${randomUUID().slice(0, 8)}`, - MEMORY_EXTRACTION_ARGV, - { cwd: projectRoot }, - ); - await config.initialize(); + const memoryDir = config.storage.getProjectMemoryTempDir(); + const skillsDir = config.storage.getProjectSkillsMemoryDir(); + const statePath = path.join(memoryDir, '.extraction-state.json'); - const authType = await validateNonInteractiveAuth( - settings.merged.security.auth.selectedType, - settings.merged.security.auth.useExternal, - config, - settings, - ); - await config.refreshAuth(authType); - await startMemoryService(config); - } finally { - process.chdir(previousCwd); - resetSettingsCacheForTesting(); - await config?.dispose(); - } - - const statePath = path.join( - 
storage.getProjectMemoryTempDir(), - '.extraction-state.json', - ); - const skillsDir = storage.getProjectSkillsMemoryDir(); - - const raw = await fsp.readFile(statePath, 'utf-8'); - const state = JSON.parse(raw) as { - runs?: Array<{ sessionIds?: string[]; skillsCreated?: string[] }>; - }; - if (!Array.isArray(state.runs) || state.runs.length === 0) { - throw new Error( - 'Skill extraction finished without writing any run state', - ); - } + const raw = await fsp.readFile(statePath, 'utf-8'); + const state = JSON.parse(raw) as { + runs?: Array<{ sessionIds?: string[]; skillsCreated?: string[] }>; + }; + if (!Array.isArray(state.runs) || state.runs.length === 0) { + throw new Error('Skill extraction finished without writing any run state'); + } - return { - state: { - runs: state.runs.map((run) => ({ - sessionIds: Array.isArray(run.sessionIds) ? run.sessionIds : [], - skillsCreated: Array.isArray(run.skillsCreated) - ? run.skillsCreated - : [], - })), - }, - skillsDir, - }; - }); + return { + state: { + runs: state.runs.map((run) => ({ + sessionIds: Array.isArray(run.sessionIds) ? run.sessionIds : [], + skillsCreated: Array.isArray(run.skillsCreated) + ? run.skillsCreated + : [], + })), + }, + skillsDir, + }; } async function readSkillBodies(skillsDir: string): Promise<string[]> { @@ -242,22 +147,27 @@ async function readSkillBodies(skillsDir: string): Promise<string[]> { } } +/** + * Shared configOverrides for all skill extraction component evals. + * - experimentalMemoryManager: enables the memory extraction pipeline. + * - approvalMode: YOLO auto-approves tool calls (write_file, read_file) so the + * background agent can execute without interactive confirmation.
+ */ +const EXTRACTION_CONFIG_OVERRIDES = { + experimentalMemoryManager: true, + approvalMode: ApprovalMode.YOLO, +}; + describe('Skill Extraction', () => { - evalTest('USUALLY_PASSES', { + componentEvalTest('USUALLY_PASSES', { suiteName: 'skill-extraction', - suiteType: 'behavioral', + suiteType: 'component', name: 'ignores one-off incidents even when session summaries look similar', files: WORKSPACE_FILES, timeout: 180000, - params: { - settings: { - experimental: { - memoryManager: true, - }, - }, - }, - setup: async (rig) => { - await seedSessions(rig, [ + configOverrides: EXTRACTION_CONFIG_OVERRIDES, + setup: async (config) => { + await seedSessions(config, [ { sessionId: 'incident-login-redirect', summary: 'Debug login redirect loop in staging', @@ -294,12 +204,8 @@ describe('Skill Extraction', () => { }, ]); }, - prompt: - 'Read the local workspace files and summarize this repository in two short sentences.', - assert: async (rig, result) => { - assertModelHasOutput(result); - - const { state, skillsDir } = await waitForExtractionState(rig); + assert: async (config) => { + const { state, skillsDir } = await runExtractionAndReadState(config); const skillBodies = await readSkillBodies(skillsDir); expect(state.runs).toHaveLength(1); @@ -309,21 +215,15 @@ describe('Skill Extraction', () => { }, }); - evalTest('USUALLY_PASSES', { + componentEvalTest('USUALLY_PASSES', { suiteName: 'skill-extraction', - suiteType: 'behavioral', + suiteType: 'component', name: 'extracts a repeated project-specific workflow into a skill', files: WORKSPACE_FILES, timeout: 180000, - params: { - settings: { - experimental: { - memoryManager: true, - }, - }, - }, - setup: async (rig) => { - await seedSessions(rig, [ + configOverrides: EXTRACTION_CONFIG_OVERRIDES, + setup: async (config) => { + await seedSessions(config, [ { sessionId: 'settings-docs-regen-1', summary: 'Update settings docs after adding a config option', @@ -360,12 +260,8 @@ describe('Skill Extraction', () => { }, ]); }, 
- prompt: - 'Read the local workspace files and summarize this repository in two short sentences.', - assert: async (rig, result) => { - assertModelHasOutput(result); - - const { state, skillsDir } = await waitForExtractionState(rig); + assert: async (config) => { + const { state, skillsDir } = await runExtractionAndReadState(config); const skillBodies = await readSkillBodies(skillsDir); const combinedSkills = skillBodies.join('\n\n'); @@ -381,21 +277,15 @@ describe('Skill Extraction', () => { }, }); - evalTest('USUALLY_PASSES', { + componentEvalTest('USUALLY_PASSES', { suiteName: 'skill-extraction', - suiteType: 'behavioral', + suiteType: 'component', name: 'extracts a repeated multi-step migration workflow with ordering constraints', files: WORKSPACE_FILES, timeout: 180000, - params: { - settings: { - experimental: { - memoryManager: true, - }, - }, - }, - setup: async (rig) => { - await seedSessions(rig, [ + configOverrides: EXTRACTION_CONFIG_OVERRIDES, + setup: async (config) => { + await seedSessions(config, [ { sessionId: 'db-migration-v12', summary: 'Run database migration for v12 schema update', @@ -432,12 +322,8 @@ describe('Skill Extraction', () => { }, ]); }, - prompt: - 'Read the local workspace files and summarize this repository in two short sentences.', - assert: async (rig, result) => { - assertModelHasOutput(result); - - const { state, skillsDir } = await waitForExtractionState(rig); + assert: async (config) => { + const { state, skillsDir } = await runExtractionAndReadState(config); const skillBodies = await readSkillBodies(skillsDir); const combinedSkills = skillBodies.join('\n\n'); From ac0418a462879709555502020021525d1144f9fe Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Wed, 15 Apr 2026 11:33:41 -0700 Subject: [PATCH 6/6] fix(evals): correct suiteType from 'component' to 'component-level' --- evals/skill_extraction.eval.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/evals/skill_extraction.eval.ts 
b/evals/skill_extraction.eval.ts index f22b8696484..4149f29a67a 100644 --- a/evals/skill_extraction.eval.ts +++ b/evals/skill_extraction.eval.ts @@ -161,7 +161,7 @@ const EXTRACTION_CONFIG_OVERRIDES = { describe('Skill Extraction', () => { componentEvalTest('USUALLY_PASSES', { suiteName: 'skill-extraction', - suiteType: 'component', + suiteType: 'component-level', name: 'ignores one-off incidents even when session summaries look similar', files: WORKSPACE_FILES, timeout: 180000, @@ -217,7 +217,7 @@ describe('Skill Extraction', () => { componentEvalTest('USUALLY_PASSES', { suiteName: 'skill-extraction', - suiteType: 'component', + suiteType: 'component-level', name: 'extracts a repeated project-specific workflow into a skill', files: WORKSPACE_FILES, timeout: 180000, @@ -279,7 +279,7 @@ describe('Skill Extraction', () => { componentEvalTest('USUALLY_PASSES', { suiteName: 'skill-extraction', - suiteType: 'component', + suiteType: 'component-level', name: 'extracts a repeated multi-step migration workflow with ordering constraints', files: WORKSPACE_FILES, timeout: 180000,