Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion evals/app-test-helper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,26 @@ import fs from 'node:fs';
import path from 'node:path';
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';

/**
* Config overrides for evals, with tool-restriction fields explicitly
* forbidden. Evals must test against the full, default tool set to ensure
* realistic behavior.
*/
interface EvalConfigOverrides {
/** Restricting tools via excludeTools in evals is forbidden. */
excludeTools?: never;
/** Restricting tools via coreTools in evals is forbidden. */
coreTools?: never;
/** Restricting tools via allowedTools in evals is forbidden. */
allowedTools?: never;
/** Restricting tools via mainAgentTools in evals is forbidden. */
mainAgentTools?: never;
[key: string]: unknown;
}

export interface AppEvalCase {
name: string;
configOverrides?: any;
configOverrides?: EvalConfigOverrides;
prompt: string;
timeout?: number;
files?: Record<string, string>;
Expand Down
4 changes: 0 additions & 4 deletions evals/generalist_delegation.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ describe('generalist_delegation', () => {
experimental: {
enableAgents: true,
},
excludeTools: ['run_shell_command'],
},
files: {
'file1.ts': 'console.log("no semi")',
Expand Down Expand Up @@ -65,7 +64,6 @@ describe('generalist_delegation', () => {
experimental: {
enableAgents: true,
},
excludeTools: ['run_shell_command'],
},
files: {
'src/a.ts': 'export const a = 1;',
Expand Down Expand Up @@ -106,7 +104,6 @@ describe('generalist_delegation', () => {
experimental: {
enableAgents: true,
},
excludeTools: ['run_shell_command'],
},
files: {
'README.md': 'This is a proyect.',
Expand Down Expand Up @@ -141,7 +138,6 @@ describe('generalist_delegation', () => {
experimental: {
enableAgents: true,
},
excludeTools: ['run_shell_command'],
},
files: {
'src/VERSION': '1.2.3',
Expand Down
2 changes: 0 additions & 2 deletions evals/model_steering.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ describe('Model Steering Behavioral Evals', () => {
appEvalTest('USUALLY_PASSES', {
name: 'Corrective Hint: Model switches task based on hint during tool turn',
configOverrides: {
excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
modelSteering: true,
},
files: {
Expand Down Expand Up @@ -55,7 +54,6 @@ describe('Model Steering Behavioral Evals', () => {
appEvalTest('USUALLY_PASSES', {
name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
configOverrides: {
excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
modelSteering: true,
},
files: {},
Expand Down
68 changes: 8 additions & 60 deletions evals/save_memory.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@ describe('save_memory', () => {
const rememberingFavoriteColor = "Agent remembers user's favorite color";
evalTest('ALWAYS_PASSES', {
name: rememberingFavoriteColor,
params: {
settings: { tools: { core: ['save_memory'] } },
},

prompt: `remember that my favorite color is blue.

what is my favorite color? tell me that and surround it with $ symbol`,
Expand All @@ -38,9 +36,7 @@ describe('save_memory', () => {
const rememberingCommandRestrictions = 'Agent remembers command restrictions';
evalTest('USUALLY_PASSES', {
name: rememberingCommandRestrictions,
params: {
settings: { tools: { core: ['save_memory'] } },
},

prompt: `I don't want you to ever run npm commands.`,
assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall('save_memory');
Expand All @@ -59,9 +55,7 @@ describe('save_memory', () => {
const rememberingWorkflow = 'Agent remembers workflow preferences';
evalTest('USUALLY_PASSES', {
name: rememberingWorkflow,
params: {
settings: { tools: { core: ['save_memory'] } },
},

prompt: `I want you to always lint after building.`,
assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall('save_memory');
Expand All @@ -81,9 +75,7 @@ describe('save_memory', () => {
'Agent ignores temporary conversation details';
evalTest('ALWAYS_PASSES', {
name: ignoringTemporaryInformation,
params: {
settings: { tools: { core: ['save_memory'] } },
},

prompt: `I'm going to get a coffee.`,
assert: async (rig, result) => {
await rig.waitForTelemetryReady();
Expand All @@ -106,9 +98,7 @@ describe('save_memory', () => {
const rememberingPetName = "Agent remembers user's pet's name";
evalTest('ALWAYS_PASSES', {
name: rememberingPetName,
params: {
settings: { tools: { core: ['save_memory'] } },
},

prompt: `Please remember that my dog's name is Buddy.`,
assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall('save_memory');
Expand All @@ -127,9 +117,7 @@ describe('save_memory', () => {
const rememberingCommandAlias = 'Agent remembers custom command aliases';
evalTest('ALWAYS_PASSES', {
name: rememberingCommandAlias,
params: {
settings: { tools: { core: ['save_memory'] } },
},

prompt: `When I say 'start server', you should run 'npm run dev'.`,
assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall('save_memory');
Expand All @@ -149,18 +137,6 @@ describe('save_memory', () => {
"Agent ignores workspace's database schema location";
evalTest('USUALLY_PASSES', {
name: ignoringDbSchemaLocation,
params: {
settings: {
tools: {
core: [
'save_memory',
'list_directory',
'read_file',
'run_shell_command',
],
},
},
},
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
assert: async (rig, result) => {
await rig.waitForTelemetryReady();
Expand All @@ -180,9 +156,7 @@ describe('save_memory', () => {
"Agent remembers user's coding style preference";
evalTest('ALWAYS_PASSES', {
name: rememberingCodingStyle,
params: {
settings: { tools: { core: ['save_memory'] } },
},

prompt: `I prefer to use tabs instead of spaces for indentation.`,
assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall('save_memory');
Expand All @@ -202,18 +176,6 @@ describe('save_memory', () => {
'Agent ignores workspace build artifact location';
evalTest('USUALLY_PASSES', {
name: ignoringBuildArtifactLocation,
params: {
settings: {
tools: {
core: [
'save_memory',
'list_directory',
'read_file',
'run_shell_command',
],
},
},
},
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
assert: async (rig, result) => {
await rig.waitForTelemetryReady();
Expand All @@ -232,18 +194,6 @@ describe('save_memory', () => {
const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
evalTest('USUALLY_PASSES', {
name: ignoringMainEntryPoint,
params: {
settings: {
tools: {
core: [
'save_memory',
'list_directory',
'read_file',
'run_shell_command',
],
},
},
},
prompt: `The main entry point for this workspace is \`src/index.js\`.`,
assert: async (rig, result) => {
await rig.waitForTelemetryReady();
Expand All @@ -262,9 +212,7 @@ describe('save_memory', () => {
const rememberingBirthday = "Agent remembers user's birthday";
evalTest('ALWAYS_PASSES', {
name: rememberingBirthday,
params: {
settings: { tools: { core: ['save_memory'] } },
},

prompt: `My birthday is on June 15th.`,
assert: async (rig, result) => {
const wasToolCalled = await rig.waitForToolCall('save_memory');
Expand Down
18 changes: 17 additions & 1 deletion evals/test-helper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,25 @@ export function symlinkNodeModules(testDir: string) {
}
}

/**
* Settings that are forbidden in evals. Evals should never restrict which
* tools are available — they must test against the full, default tool set
* to ensure realistic behavior.
*/
interface ForbiddenToolSettings {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you haven't yet, can you try running it in GitHub actions to make sure the ones that previously passed still pass?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tools?: {
/** Restricting core tools in evals is forbidden. */
Comment thread
SandyTao520 marked this conversation as resolved.
core?: never;
[key: string]: unknown;
};
}
Comment thread
SandyTao520 marked this conversation as resolved.

export interface EvalCase {
name: string;
params?: Record<string, any>;
params?: {
settings?: ForbiddenToolSettings & Record<string, unknown>;
[key: string]: unknown;
};
prompt: string;
timeout?: number;
files?: Record<string, string>;
Expand Down
Loading