From 5746424b18dd00ae34edc97a8ed3877952df183d Mon Sep 17 00:00:00 2001 From: galz10 Date: Mon, 20 Apr 2026 16:44:22 -0700 Subject: [PATCH 01/12] feat(core): enhance shell command validation and add core tools allowlist - Implements recursive validation for shell command substitutions ($(...), `...`) and subshells. - Adds support for settings.tools.core to allow explicit, strict allowlisting of core tools. - Refactors PolicyEngine.check to handle shell wrappers (e.g., bash -c) and complex chained commands. - Introduces comprehensive regression tests for shell safety, pipes, and redirection. - Updates shell-utils to better identify and extract shell structure details using Tree-sitter. --- packages/core/src/policy/config.ts | 52 +++++-- .../src/policy/core-tools-mapping.test.ts | 76 +++++++++ .../core/src/policy/policy-engine.test.ts | 36 ++++- packages/core/src/policy/policy-engine.ts | 147 ++++++++---------- .../policy/shell-safety-regression.test.ts | 134 ++++++++++++++++ packages/core/src/policy/shell-safety.test.ts | 24 +++ .../src/policy/shell-substitution.test.ts | 67 ++++++++ packages/core/src/policy/types.ts | 1 + packages/core/src/tools/ripGrep.test.ts | 6 +- packages/core/src/utils/shell-utils.ts | 8 +- 10 files changed, 458 insertions(+), 93 deletions(-) create mode 100644 packages/core/src/policy/core-tools-mapping.test.ts create mode 100644 packages/core/src/policy/shell-safety-regression.test.ts create mode 100644 packages/core/src/policy/shell-substitution.test.ts diff --git a/packages/core/src/policy/config.ts b/packages/core/src/policy/config.ts index 359054add33..d7b06024a69 100644 --- a/packages/core/src/policy/config.ts +++ b/packages/core/src/policy/config.ts @@ -75,6 +75,7 @@ export const ADMIN_POLICY_TIER = 5; export const MCP_EXCLUDED_PRIORITY = USER_POLICY_TIER + 0.9; export const EXCLUDE_TOOLS_FLAG_PRIORITY = USER_POLICY_TIER + 0.4; export const ALLOWED_TOOLS_FLAG_PRIORITY = USER_POLICY_TIER + 0.3; +export const CORE_TOOLS_FLAG_PRIORITY = USER_POLICY_TIER + 0.25; export const TRUSTED_MCP_SERVER_PRIORITY = USER_POLICY_TIER + 0.2; export const ALLOWED_MCP_SERVER_PRIORITY = USER_POLICY_TIER + 0.1; @@ -434,10 +435,12 @@ export async function createPolicyEngineConfig( } } - // Tools that are explicitly allowed in the settings. - // Priority: ALLOWED_TOOLS_FLAG_PRIORITY (user tier - explicit temporary allows) - if (settings.tools?.allowed) { - for (const tool of settings.tools.allowed) { + const mapToolsToRules = ( + tools: string[], + priority: number, + source: string, + ) => { + for (const tool of tools) { // Check for legacy format: toolName(args) const match = tool.match(/^([a-zA-Z0-9_-]+)\((.*)\)$/); if (match) { @@ -455,9 +458,9 @@ export async function createPolicyEngineConfig( rules.push({ toolName, decision: PolicyDecision.ALLOW, - priority: ALLOWED_TOOLS_FLAG_PRIORITY, + priority, argsPattern: new RegExp(pattern), - source: 'Settings (Tools Allowed)', + source, }); } } @@ -467,8 +470,8 @@ export async function createPolicyEngineConfig( rules.push({ toolName, decision: PolicyDecision.ALLOW, - priority: ALLOWED_TOOLS_FLAG_PRIORITY, - source: 'Settings (Tools Allowed)', + priority, + source, }); } } else { @@ -479,11 +482,40 @@ export async function createPolicyEngineConfig( rules.push({ toolName, decision: PolicyDecision.ALLOW, - priority: ALLOWED_TOOLS_FLAG_PRIORITY, - source: 'Settings (Tools Allowed)', + priority, + source, }); } } + }; + + // Tools that are explicitly allowed in the settings. + // Priority: ALLOWED_TOOLS_FLAG_PRIORITY (user tier - explicit temporary allows) + if (settings.tools?.allowed) { + mapToolsToRules( + settings.tools.allowed, + ALLOWED_TOOLS_FLAG_PRIORITY, + 'Settings (Tools Allowed)', + ); + } + + // Core tools that are restricted in the settings. + // Priority: CORE_TOOLS_FLAG_PRIORITY (user tier - core tool allowlist) + if (settings.tools?.core) { + mapToolsToRules( + settings.tools.core, + CORE_TOOLS_FLAG_PRIORITY, + 'Settings (Core Tools)', + ); + + // If core tools are restricted, we should add a default DENY rule for everything else + // at a slightly lower priority than the explicit allows. + rules.push({ + toolName: '*', + decision: PolicyDecision.DENY, + priority: CORE_TOOLS_FLAG_PRIORITY - 0.01, + source: 'Settings (Core Tools Allowlist Enforcement)', + }); } // MCP servers that are trusted in the settings. diff --git a/packages/core/src/policy/core-tools-mapping.test.ts b/packages/core/src/policy/core-tools-mapping.test.ts new file mode 100644 index 00000000000..95877c6ac44 --- /dev/null +++ b/packages/core/src/policy/core-tools-mapping.test.ts @@ -0,0 +1,76 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect } from 'vitest'; +import { createPolicyEngineConfig } from './config.js'; +import { PolicyEngine } from './policy-engine.js'; +import { PolicyDecision, ApprovalMode } from './types.js'; + +describe('PolicyEngine - Core Tools Mapping', () => { + it('should allow tools explicitly listed in settings.tools.core', async () => { + const settings = { + tools: { + core: ['run_shell_command(ls)', 'run_shell_command(git status)'], + }, + }; + + const config = await createPolicyEngineConfig( + settings, + ApprovalMode.DEFAULT, + undefined, + true, // interactive + ); + + const engine = new PolicyEngine(config); + + // Test simple tool name + const result1 = await engine.check( + { name: 'run_shell_command', args: { command: 'ls' } }, + undefined, + ); + expect(result1.decision).toBe(PolicyDecision.ALLOW); + + // Test tool name with args + const result2 = await engine.check( + { name: 'run_shell_command', args: { command: 'git status' } }, + undefined, + ); + expect(result2.decision).toBe(PolicyDecision.ALLOW); + + // Test tool not in core list + const result3 = await engine.check( + { name: 'run_shell_command', args: { command: 'npm test' } }, + undefined, + ); + // Should be DENIED because of strict allowlist + expect(result3.decision).toBe(PolicyDecision.DENY); + }); + + it('should allow tools in tools.core even if they are restricted by default policies', async () => { + // By default run_shell_command is ASK_USER. + // Putting it in tools.core should make it ALLOW. + const settings = { + tools: { + core: ['run_shell_command'], + }, + }; + + const config = await createPolicyEngineConfig( + settings, + ApprovalMode.DEFAULT, + undefined, + true, + ); + + const engine = new PolicyEngine(config); + + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'any command' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ALLOW); + }); +}); diff --git a/packages/core/src/policy/policy-engine.test.ts b/packages/core/src/policy/policy-engine.test.ts index 5606c49793f..8604a799616 100644 --- a/packages/core/src/policy/policy-engine.test.ts +++ b/packages/core/src/policy/policy-engine.test.ts @@ -43,6 +43,35 @@ vi.mock('../utils/shell-utils.js', async (importOriginal) => { } return [command]; }), + parseCommandDetails: vi.fn().mockImplementation((command: string) => { + // Basic mock implementation for PolicyEngine test needs + const commands = command.includes('&&') + ? command.split('&&').map((c) => c.trim()) + : [command.trim()]; + + // Detect $(...) or `...` and add as sub-commands for recursion tests + const subCommands = [...commands]; + for (const cmd of commands) { + const subMatch = cmd.match(/\$\((.*)\)/) || cmd.match(/`(.*)`/); + if (subMatch?.[1]) { + subCommands.push(subMatch[1].trim()); + } + } + + return { + details: subCommands.map((c, i) => ({ + name: c.split(' ')[0], + text: c, + startIndex: i === 0 ? 0 : -1, // Simple root indication + })), + hasError: false, + }; + }), + stripShellWrapper: vi.fn().mockImplementation((command: string) => { + // Simple mock for stripping wrappers + const match = command.match(/^(?:bash|sh|zsh)\s+-c\s+["'](.*)["']$/i); + return match ? match[1] : command; + }), hasRedirection: vi.fn().mockImplementation( (command: string) => // Simple mock: true if '>' is present, unless it looks like "-> arrow" @@ -1862,7 +1891,6 @@ describe('PolicyEngine', () => { }); it('should return ASK_USER in non-YOLO mode if shell command parsing fails', async () => { - const { splitCommands } = await import('../utils/shell-utils.js'); const rules: PolicyRule[] = [ { toolName: 'run_shell_command', @@ -1877,7 +1905,11 @@ describe('PolicyEngine', () => { }); // Simulate parsing failure - vi.mocked(splitCommands).mockReturnValueOnce([]); + const { parseCommandDetails } = await import('../utils/shell-utils.js'); + vi.mocked(parseCommandDetails).mockReturnValueOnce({ + details: [], + hasError: true, + }); const result = await engine.check( { name: 'run_shell_command', args: { command: 'complex command' } }, diff --git a/packages/core/src/policy/policy-engine.ts b/packages/core/src/policy/policy-engine.ts index a9e049c74d2..7ff27bd91ad 100644 --- a/packages/core/src/policy/policy-engine.ts +++ b/packages/core/src/policy/policy-engine.ts @@ -7,8 +7,10 @@ import { type FunctionCall } from '@google/genai'; import { SHELL_TOOL_NAMES, + REDIRECTION_NAMES, initializeShellParsers, - splitCommands, + parseCommandDetails, + stripShellWrapper, hasRedirection, extractStringFromParseEntry, } from '../utils/shell-utils.js'; @@ -359,7 +361,8 @@ export class PolicyEngine { } await initializeShellParsers(); - const subCommands = splitCommands(command); + const parsed = parseCommandDetails(command); + const subCommands = parsed?.details ?? []; if (subCommands.length === 0) { // If the matched rule says DENY, we should respect it immediately even if parsing fails. @@ -380,115 +383,103 @@ export class PolicyEngine { ); // Parsing logic failed, we can't trust it. Use default decision ASK_USER (or DENY in non-interactive). - // We return the rule that matched so the evaluation loop terminates. return { decision: this.defaultDecision, rule, }; } - // If there are multiple parts, or if we just want to validate the single part against DENY rules - if (subCommands.length > 0) { + debugLogger.debug( + `[PolicyEngine.check] Validating shell command: ${subCommands.length} parts`, + ); + + if (ruleDecision === PolicyDecision.DENY) { + return { decision: PolicyDecision.DENY, rule }; + } + + // Start optimistically. If all parts are ALLOW, the whole is ALLOW. + // We will downgrade if any part is ASK_USER or DENY. + let aggregateDecision = PolicyDecision.ALLOW; + let responsibleRule: PolicyRule | undefined; + + // Check for redirection on the full command string + if (this.shouldDowngradeForRedirection(command, allowRedirection)) { debugLogger.debug( - `[PolicyEngine.check] Validating shell command: ${subCommands.length} parts`, + `[PolicyEngine.check] Downgrading ALLOW to ASK_USER for redirected command: ${command}`, ); + aggregateDecision = PolicyDecision.ASK_USER; + responsibleRule = undefined; // Inherent policy + } - if (ruleDecision === PolicyDecision.DENY) { - return { decision: PolicyDecision.DENY, rule }; + for (const detail of subCommands) { + if (REDIRECTION_NAMES.has(detail.name)) { + continue; } - // Start optimistically. If all parts are ALLOW, the whole is ALLOW. - // We will downgrade if any part is ASK_USER or DENY. - let aggregateDecision = PolicyDecision.ALLOW; - let responsibleRule: PolicyRule | undefined; + const subCmd = detail.text.trim(); + const isAtomic = + subCmd === command || + (detail.startIndex === 0 && detail.text.length === command.length); - // Check for redirection on the full command string - if (this.shouldDowngradeForRedirection(command, allowRedirection)) { - debugLogger.debug( - `[PolicyEngine.check] Downgrading ALLOW to ASK_USER for redirected command: ${command}`, + // Recursive check for shell wrappers (bash -c, etc.) + const stripped = stripShellWrapper(subCmd); + if (stripped !== subCmd) { + const wrapperResult = await this.check( + { name: toolName, args: { command: stripped, dir_path } }, + serverName, + toolAnnotations, + subagent, + true, ); - aggregateDecision = PolicyDecision.ASK_USER; - responsibleRule = undefined; // Inherent policy - } - for (const rawSubCmd of subCommands) { - const subCmd = rawSubCmd.trim(); - // Prevent infinite recursion for the root command - if (subCmd === command) { - if (this.shouldDowngradeForRedirection(subCmd, allowRedirection)) { - debugLogger.debug( - `[PolicyEngine.check] Downgrading ALLOW to ASK_USER for redirected command: ${subCmd}`, - ); - // Redirection always downgrades ALLOW to ASK_USER - if (aggregateDecision === PolicyDecision.ALLOW) { - aggregateDecision = PolicyDecision.ASK_USER; - responsibleRule = undefined; // Inherent policy - } - } else { - // Atomic command matching the rule. - if ( - ruleDecision === PolicyDecision.ASK_USER && - aggregateDecision === PolicyDecision.ALLOW - ) { - aggregateDecision = PolicyDecision.ASK_USER; - responsibleRule = rule; - } - } - continue; + if (wrapperResult.decision === PolicyDecision.DENY) + return wrapperResult; + if (wrapperResult.decision === PolicyDecision.ASK_USER) { + aggregateDecision = PolicyDecision.ASK_USER; + responsibleRule ??= wrapperResult.rule; } + } + if (isAtomic) { + if ( + ruleDecision === PolicyDecision.ASK_USER && + aggregateDecision === PolicyDecision.ALLOW + ) { + aggregateDecision = PolicyDecision.ASK_USER; + responsibleRule = rule; + } + } else { const subResult = await this.check( { name: toolName, args: { command: subCmd, dir_path } }, serverName, toolAnnotations, subagent, + true, ); - // subResult.decision is already filtered through applyNonInteractiveMode by this.check() - const subDecision = subResult.decision; - - // If any part is DENIED, the whole command is DENY - if (subDecision === PolicyDecision.DENY) { - return { - decision: PolicyDecision.DENY, - rule: subResult.rule, - }; - } + if (subResult.decision === PolicyDecision.DENY) return subResult; - // If any part requires ASK_USER, the whole command requires ASK_USER - if (subDecision === PolicyDecision.ASK_USER) { + if (subResult.decision === PolicyDecision.ASK_USER) { aggregateDecision = PolicyDecision.ASK_USER; - if (!responsibleRule) { - responsibleRule = subResult.rule; - } + responsibleRule ??= subResult.rule; } - // Check for redirection in allowed sub-commands + // Downgrade if sub-command has redirection if ( - subDecision === PolicyDecision.ALLOW && + subResult.decision === PolicyDecision.ALLOW && this.shouldDowngradeForRedirection(subCmd, allowRedirection) ) { - debugLogger.debug( - `[PolicyEngine.check] Downgrading ALLOW to ASK_USER for redirected command: ${subCmd}`, - ); if (aggregateDecision === PolicyDecision.ALLOW) { aggregateDecision = PolicyDecision.ASK_USER; responsibleRule = undefined; } } } - - return { - decision: aggregateDecision, - // If we stayed at ALLOW, we return the original rule (if any). - // If we downgraded, we return the responsible rule (or undefined if implicit). - rule: aggregateDecision === ruleDecision ? rule : responsibleRule, - }; } return { - decision: ruleDecision, - rule, + decision: aggregateDecision, + rule: aggregateDecision === ruleDecision ? rule : responsibleRule, }; } @@ -501,6 +492,7 @@ export class PolicyEngine { serverName: string | undefined, toolAnnotations?: Record, subagent?: string, + skipHeuristics = false, ): Promise { // Case 1: Metadata injection is the primary and safest way to identify an MCP server. // If we have explicit `_serverName` metadata (usually injected by tool-registry for active tools), use it. @@ -594,6 +586,7 @@ export class PolicyEngine { let ruleDecision = rule.decision; if ( + !skipHeuristics && isShellCommand && command && !('commandPrefix' in rule) && @@ -615,12 +608,10 @@ export class PolicyEngine { subagent, ); decision = shellResult.decision; - if (shellResult.rule) { - matchedRule = shellResult.rule; - break; - } + matchedRule = shellResult.rule; + break; } else { - decision = rule.decision; + decision = ruleDecision; matchedRule = rule; break; } @@ -643,7 +634,7 @@ export class PolicyEngine { ); if (toolName && SHELL_TOOL_NAMES.includes(toolName)) { let heuristicDecision = this.defaultDecision; - if (command) { + if (!skipHeuristics && command) { heuristicDecision = await this.applyShellHeuristics( command, heuristicDecision, diff --git a/packages/core/src/policy/shell-safety-regression.test.ts b/packages/core/src/policy/shell-safety-regression.test.ts new file mode 100644 index 00000000000..1a1d6089590 --- /dev/null +++ b/packages/core/src/policy/shell-safety-regression.test.ts @@ -0,0 +1,134 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, beforeAll } from 'vitest'; +import { PolicyEngine } from './policy-engine.js'; +import { PolicyDecision, ApprovalMode } from './types.js'; +import { initializeShellParsers } from '../utils/shell-utils.js'; +import { buildArgsPatterns } from './utils.js'; + +describe('PolicyEngine - Shell Safety Regression Suite', () => { + let engine: PolicyEngine; + + beforeAll(async () => { + await initializeShellParsers(); + }); + + const setupEngine = (allowedCommands: string[]) => { + const rules = allowedCommands.map((cmd) => ({ + toolName: 'run_shell_command', + decision: PolicyDecision.ALLOW, + argsPattern: new RegExp(buildArgsPatterns(undefined, cmd)[0]!), + priority: 10, + })); + + return new PolicyEngine({ + rules, + approvalMode: ApprovalMode.DEFAULT, + defaultDecision: PolicyDecision.ASK_USER, + }); + }; + + it('should block unauthorized chained command with &&', async () => { + engine = setupEngine(['echo']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo hi && ls' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ASK_USER); + }); + + it('should allow authorized chained command with &&', async () => { + engine = setupEngine(['echo', 'ls']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo hi && ls' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ALLOW); + }); + + it('should block unauthorized chained command with ||', async () => { + engine = setupEngine(['false']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'false || ls' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ASK_USER); + }); + + it('should block unauthorized chained command with ;', async () => { + engine = setupEngine(['echo']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo hi; ls' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ASK_USER); + }); + + it('should block unauthorized command in pipe |', async () => { + engine = setupEngine(['echo']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo hi | grep "hi"' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ASK_USER); + }); + + it('should allow authorized command in pipe |', async () => { + engine = setupEngine(['echo', 'grep']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo hi | grep "hi"' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ALLOW); + }); + + it('should block unauthorized chained command with &', async () => { + engine = setupEngine(['echo']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo hi & ls' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ASK_USER); + }); + + it('should allow authorized chained command with &', async () => { + engine = setupEngine(['echo', 'ls']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo hi & ls' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ALLOW); + }); + + it('should block unauthorized command in nested substitution', async () => { + engine = setupEngine(['echo', 'cat']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo $(cat $(ls))' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ASK_USER); + }); + + it('should allow authorized command in nested substitution', async () => { + engine = setupEngine(['echo', 'cat', 'ls']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo $(cat $(ls))' } }, + undefined, + ); + expect(result.decision).toBe(PolicyDecision.ALLOW); + }); + + it('should block command redirection if not explicitly allowed', async () => { + engine = setupEngine(['echo']); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo hi > /tmp/test' } }, + undefined, + ); + // Inherent policy: redirection downgrades to ASK_USER + expect(result.decision).toBe(PolicyDecision.ASK_USER); + }); +}); diff --git a/packages/core/src/policy/shell-safety.test.ts b/packages/core/src/policy/shell-safety.test.ts index 340264485ec..51d3d26294a 100644 --- a/packages/core/src/policy/shell-safety.test.ts +++ b/packages/core/src/policy/shell-safety.test.ts @@ -59,6 +59,30 @@ vi.mock('../utils/shell-utils.js', async (importOriginal) => { return { ...actual, initializeShellParsers: vi.fn(), + parseCommandDetails: (command: string) => { + if (Object.prototype.hasOwnProperty.call(commandMap, command)) { + const subcommands = commandMap[command]; + return { + details: subcommands.map((text) => ({ + name: text.split(' ')[0], + text, + startIndex: command.indexOf(text), + })), + hasError: subcommands.length === 0 && command.includes('&&&'), + }; + } + return { + details: [ + { + name: command.split(' ')[0], + text: command, + startIndex: 0, + }, + ], + hasError: false, + }; + }, + stripShellWrapper: (command: string) => command, splitCommands: (command: string) => { if (Object.prototype.hasOwnProperty.call(commandMap, command)) { return commandMap[command]; diff --git a/packages/core/src/policy/shell-substitution.test.ts b/packages/core/src/policy/shell-substitution.test.ts new file mode 100644 index 00000000000..d858b517b78 --- /dev/null +++ b/packages/core/src/policy/shell-substitution.test.ts @@ -0,0 +1,67 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { expect, describe, it, beforeAll } from 'vitest'; +import { PolicyEngine } from './policy-engine.js'; +import { PolicyDecision } from './types.js'; +import { initializeShellParsers } from '../utils/shell-utils.js'; + +describe('PolicyEngine Command Substitution Validation', () => { + beforeAll(async () => { + await initializeShellParsers(); + }); + + const setupEngine = (blockedCmd: string) => + new PolicyEngine({ + defaultDecision: PolicyDecision.ALLOW, + rules: [ + { + toolName: 'run_shell_command', + argsPattern: new RegExp(`"command":"${blockedCmd}"`), + decision: PolicyDecision.DENY, + }, + ], + }); + + it('should block echo $(dangerous_cmd) when dangerous_cmd is explicitly blocked', async () => { + const engine = setupEngine('dangerous_cmd'); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo $(dangerous_cmd)' } }, + 'test-server', + ); + expect(result.decision).toBe(PolicyDecision.DENY); + }); + + it('should block backtick substitution `dangerous_cmd`', async () => { + const engine = setupEngine('dangerous_cmd'); + const result = await engine.check( + { name: 'run_shell_command', args: { command: 'echo `dangerous_cmd`' } }, + 'test-server', + ); + expect(result.decision).toBe(PolicyDecision.DENY); + }); + + it('should block commands inside subshells (dangerous_cmd)', async () => { + const engine = setupEngine('dangerous_cmd'); + const result = await engine.check( + { name: 'run_shell_command', args: { command: '(dangerous_cmd)' } }, + 'test-server', + ); + expect(result.decision).toBe(PolicyDecision.DENY); + }); + + it('should handle nested substitutions deeply', async () => { + const engine = setupEngine('deep_danger'); + const result = await engine.check( + { + name: 'run_shell_command', + args: { command: 'echo $(ls $(deep_danger))' }, + }, + 'test-server', + ); + expect(result.decision).toBe(PolicyDecision.DENY); + }); +}); diff --git a/packages/core/src/policy/types.ts b/packages/core/src/policy/types.ts index b843129c99a..aa5275865c9 100644 --- a/packages/core/src/policy/types.ts +++ b/packages/core/src/policy/types.ts @@ -335,6 +335,7 @@ export interface PolicySettings { allowed?: string[]; }; tools?: { + core?: string[]; exclude?: string[]; allowed?: string[]; }; diff --git a/packages/core/src/tools/ripGrep.test.ts b/packages/core/src/tools/ripGrep.test.ts index 9ad575833a7..bd3cd211899 100644 --- a/packages/core/src/tools/ripGrep.test.ts +++ b/packages/core/src/tools/ripGrep.test.ts @@ -1993,13 +1993,15 @@ describe('getRipgrepPath', () => { vi.mocked(fileExists).mockImplementation( async (checkPath) => checkPath.includes(path.normalize('core/vendor/ripgrep')) && - !checkPath.includes('tools'), + !checkPath.includes(path.join(path.sep, 'tools', path.sep)), ); const resolvedPath = await getRipgrepPath(); expect(resolvedPath).not.toBeNull(); expect(resolvedPath).toContain(path.normalize('core/vendor/ripgrep')); - expect(resolvedPath).not.toContain('tools'); + expect(resolvedPath).not.toContain( + path.join(path.sep, 'tools', path.sep), + ); }); it('should return null if binary is missing from both paths', async () => { diff --git a/packages/core/src/utils/shell-utils.ts b/packages/core/src/utils/shell-utils.ts index 46cffa1d35c..9aaa07ecd95 100644 --- a/packages/core/src/utils/shell-utils.ts +++ b/packages/core/src/utils/shell-utils.ts @@ -240,11 +240,13 @@ foreach ($commandAst in $commandAsts) { 'utf16le', ).toString('base64'); -const REDIRECTION_NAMES = new Set([ +export const REDIRECTION_NAMES = new Set([ 'redirection (<)', 'redirection (>)', 'heredoc (<<)', 'herestring (<<<)', + 'command substitution', + 'subshell', ]); function createParser(): Parser | null { @@ -360,6 +362,10 @@ function extractNameFromNode(node: Node): string | null { return 'heredoc (<<)'; case 'herestring_redirect': return 'herestring (<<<)'; + case 'command_substitution': + return 'command substitution'; + case 'subshell': + return 'subshell'; default: return null; } From 8c7b8373d4738765e0cf5c241aa2710d15919ef2 Mon Sep 17 00:00:00 2001 From: davidapierce Date: Tue, 21 Apr 2026 20:13:06 +0000 Subject: [PATCH 02/12] addressed the failure in shell-substitution.test.ts --- packages/core/src/utils/shell-utils.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/core/src/utils/shell-utils.ts b/packages/core/src/utils/shell-utils.ts index 9aaa07ecd95..0e3605e860e 100644 --- a/packages/core/src/utils/shell-utils.ts +++ b/packages/core/src/utils/shell-utils.ts @@ -246,6 +246,8 @@ export const REDIRECTION_NAMES = new Set([ 'heredoc (<<)', 'herestring (<<<)', 'command substitution', + 'backtick substitution', + 'process substitution', 'subshell', ]); @@ -363,7 +365,10 @@ function extractNameFromNode(node: Node): string | null { case 'herestring_redirect': return 'herestring (<<<)'; case 'command_substitution': + case 'backtick_substitution': return 'command substitution'; + case 'process_substitution': + return 'process substitution'; case 'subshell': return 'subshell'; default: From ffd9f9fa15b0df65a48fa6e668a8ac132300b272 Mon Sep 17 00:00:00 2001 From: davidapierce Date: Tue, 21 Apr 2026 21:33:27 +0000 Subject: [PATCH 03/12] test(core): mock getShellConfiguration in shell substitution tests --- .../core/src/policy/shell-substitution.test.ts | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/packages/core/src/policy/shell-substitution.test.ts b/packages/core/src/policy/shell-substitution.test.ts index d858b517b78..f42e30d2079 100644 --- a/packages/core/src/policy/shell-substitution.test.ts +++ b/packages/core/src/policy/shell-substitution.test.ts @@ -4,11 +4,26 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { expect, describe, it, beforeAll } from 'vitest'; +import { expect, describe, it, beforeAll, vi } from 'vitest'; import { PolicyEngine } from './policy-engine.js'; import { PolicyDecision } from './types.js'; import { initializeShellParsers } from '../utils/shell-utils.js'; +// Mock shell-utils to ensure consistent behavior across platforms (especially Windows CI) +// We want to test PolicyEngine logic with Bash syntax rules. +vi.mock('../utils/shell-utils.js', async (importOriginal) => { + const actual = + await importOriginal(); + return { + ...actual, + getShellConfiguration: () => ({ + executable: 'bash', + argsPrefix: ['-c'], + shell: 'bash', + }), + }; +}); + describe('PolicyEngine Command Substitution Validation', () => { beforeAll(async () => { await initializeShellParsers(); From 579ebe08506ca64a29283299d9d87eb333237122 Mon Sep 17 00:00:00 2001 From: davidapierce Date: Wed, 22 Apr 2026 15:39:33 +0000 Subject: [PATCH 04/12] fix failing unit test for windows. --- .../core/src/policy/shell-substitution.test.ts | 15 +++++++++++++++ packages/core/src/utils/shell-utils.ts | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/packages/core/src/policy/shell-substitution.test.ts b/packages/core/src/policy/shell-substitution.test.ts index f42e30d2079..cc28847233c 100644 --- a/packages/core/src/policy/shell-substitution.test.ts +++ b/packages/core/src/policy/shell-substitution.test.ts @@ -9,6 +9,21 @@ import { PolicyEngine } from './policy-engine.js'; import { PolicyDecision } from './types.js'; import { initializeShellParsers } from '../utils/shell-utils.js'; +// Mock node:os to ensure shell-utils logic always thinks it's on a POSIX-like system. +// This ensures that internal calls to getShellConfiguration() and isWindows() +// within the shell-utils module return 'bash' configuration, even on Windows CI. +vi.mock('node:os', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + default: { + ...actual, + platform: () => 'linux', + }, + platform: () => 'linux', + }; +}); + // Mock shell-utils to ensure consistent behavior across platforms (especially Windows CI) // We want to test PolicyEngine logic with Bash syntax rules. vi.mock('../utils/shell-utils.js', async (importOriginal) => { diff --git a/packages/core/src/utils/shell-utils.ts b/packages/core/src/utils/shell-utils.ts index 0e3605e860e..d2b767788a5 100644 --- a/packages/core/src/utils/shell-utils.ts +++ b/packages/core/src/utils/shell-utils.ts @@ -365,8 +365,9 @@ function extractNameFromNode(node: Node): string | null { case 'herestring_redirect': return 'herestring (<<<)'; case 'command_substitution': - case 'backtick_substitution': return 'command substitution'; + case 'backtick_substitution': + return 'backtick substitution'; case 'process_substitution': return 'process substitution'; case 'subshell': From add751ea25a247ca752551d8fe7b0a3ee74cb556 Mon Sep 17 00:00:00 2001 From: Keith Schaab Date: Thu, 23 Apr 2026 01:55:22 +0000 Subject: [PATCH 05/12] Modifying strategy around planning modes and fixing ask handling --- .../__snapshots__/InputPrompt.test.tsx.snap | 7 +++ packages/core/src/policy/config.ts | 15 ++++++ packages/core/src/policy/policy-engine.ts | 52 +++++++++++-------- 3 files changed, 51 insertions(+), 23 deletions(-) diff --git a/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap b/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap index ab6fe9b9282..26577749ac1 100644 --- a/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap +++ b/packages/cli/src/ui/components/__snapshots__/InputPrompt.test.tsx.snap @@ -168,6 +168,13 @@ exports[`InputPrompt > mouse interaction > should toggle paste expansion on doub " `; +exports[`InputPrompt > mouse interaction > should toggle paste expansion on double-click 4`] = ` +"▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀▀ + > [Pasted Text: 10 lines] +▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄ +" +`; + exports[`InputPrompt > multiline rendering > should correctly render multiline input including blank lines 1`] = ` "──────────────────────────────────────────────────────────────────────────────────────────────────── │ > hello │ diff --git a/packages/core/src/policy/config.ts b/packages/core/src/policy/config.ts index d7b06024a69..fe0184b298c 100644 --- a/packages/core/src/policy/config.ts +++ b/packages/core/src/policy/config.ts @@ -435,10 +435,17 @@ export async function createPolicyEngineConfig( } } + const nonPlanModes = [ + ApprovalMode.DEFAULT, + ApprovalMode.AUTO_EDIT, + ApprovalMode.YOLO, + ]; + const mapToolsToRules = ( tools: string[], priority: number, source: string, + modes?: ApprovalMode[], ) => { for (const tool of tools) { // Check for legacy format: toolName(args) @@ -461,6 +468,7 @@ export async function createPolicyEngineConfig( priority, argsPattern: new RegExp(pattern), source, + modes, }); } } @@ -472,6 +480,7 @@ export async function createPolicyEngineConfig( decision: PolicyDecision.ALLOW, priority, source, + modes, }); } } else { @@ -484,6 +493,7 @@ export async function createPolicyEngineConfig( decision: PolicyDecision.ALLOW, priority, source, + modes, }); } } @@ -496,6 +506,7 @@ export async function createPolicyEngineConfig( settings.tools.allowed, ALLOWED_TOOLS_FLAG_PRIORITY, 'Settings (Tools Allowed)', + nonPlanModes, ); } @@ -506,6 +517,7 @@ export async function createPolicyEngineConfig( settings.tools.core, CORE_TOOLS_FLAG_PRIORITY, 'Settings (Core Tools)', + nonPlanModes, ); // If core tools are restricted, we should add a default DENY rule for everything else @@ -515,6 +527,7 @@ export async function createPolicyEngineConfig( decision: PolicyDecision.DENY, priority: CORE_TOOLS_FLAG_PRIORITY - 0.01, source: 'Settings (Core Tools Allowlist Enforcement)', + modes: nonPlanModes, }); } @@ -533,6 +546,7 @@ export async function createPolicyEngineConfig( decision: PolicyDecision.ALLOW, priority: TRUSTED_MCP_SERVER_PRIORITY, source: 'Settings (MCP Trusted)', + modes: nonPlanModes, }); } } @@ -551,6 +565,7 @@ export async function createPolicyEngineConfig( decision: PolicyDecision.ALLOW, priority: ALLOWED_MCP_SERVER_PRIORITY, source: 'Settings (MCP Allowed)', + modes: nonPlanModes, }); } } diff --git a/packages/core/src/policy/policy-engine.ts b/packages/core/src/policy/policy-engine.ts index 7ff27bd91ad..84109545025 100644 --- a/packages/core/src/policy/policy-engine.ts +++ b/packages/core/src/policy/policy-engine.ts @@ -397,18 +397,24 @@ export class PolicyEngine { return { decision: PolicyDecision.DENY, rule }; } - // Start optimistically. If all parts are ALLOW, the whole is ALLOW. - // We will downgrade if any part is ASK_USER or DENY. - let aggregateDecision = PolicyDecision.ALLOW; - let responsibleRule: PolicyRule | undefined; + // Start with the decision from the rule or heuristics. + // If the tool call was already downgraded (e.g. by heuristics), we start there. + let aggregateDecision = ruleDecision; - // Check for redirection on the full command string + // If heuristics downgraded the decision, we don't blame the rule. + let responsibleRule: PolicyRule | undefined = + rule && ruleDecision === rule.decision ? rule : undefined; + + // Check for redirection on the full command string. + // Redirection always downgrades ALLOW to ASK_USER (it never upgrades). if (this.shouldDowngradeForRedirection(command, allowRedirection)) { - debugLogger.debug( - `[PolicyEngine.check] Downgrading ALLOW to ASK_USER for redirected command: ${command}`, - ); - aggregateDecision = PolicyDecision.ASK_USER; - responsibleRule = undefined; // Inherent policy + if (aggregateDecision === PolicyDecision.ALLOW) { + debugLogger.debug( + `[PolicyEngine.check] Downgrading ALLOW to ASK_USER for redirected command: ${command}`, + ); + aggregateDecision = PolicyDecision.ASK_USER; + responsibleRule = undefined; // Inherent policy + } } for (const detail of subCommands) { @@ -435,20 +441,16 @@ export class PolicyEngine { if (wrapperResult.decision === PolicyDecision.DENY) return wrapperResult; if (wrapperResult.decision === PolicyDecision.ASK_USER) { - aggregateDecision = PolicyDecision.ASK_USER; - responsibleRule ??= wrapperResult.rule; + if (aggregateDecision === PolicyDecision.ALLOW) { + aggregateDecision = PolicyDecision.ASK_USER; + responsibleRule = wrapperResult.rule; + } else { + responsibleRule ??= wrapperResult.rule; + } } } - if (isAtomic) { - if ( - ruleDecision === PolicyDecision.ASK_USER && - aggregateDecision === PolicyDecision.ALLOW - ) { - aggregateDecision = PolicyDecision.ASK_USER; - responsibleRule = rule; - } - } else { + if (!isAtomic) { const subResult = await this.check( { name: toolName, args: { command: subCmd, dir_path } }, serverName, @@ -460,8 +462,12 @@ export class PolicyEngine { if (subResult.decision === PolicyDecision.DENY) return subResult; if (subResult.decision === PolicyDecision.ASK_USER) { - aggregateDecision = PolicyDecision.ASK_USER; - responsibleRule ??= subResult.rule; + if (aggregateDecision === PolicyDecision.ALLOW) { + aggregateDecision = PolicyDecision.ASK_USER; + responsibleRule = subResult.rule; + } else { + responsibleRule ??= subResult.rule; + } } // Downgrade if sub-command has redirection From 5331853d36c587b6fd65501f121d17fe87aeff73 Mon Sep 17 00:00:00 2001 From: davidapierce Date: Thu, 23 Apr 2026 16:06:09 +0000 Subject: [PATCH 06/12] Add a deny policy for tool calls looser than explicetly allowed tool calls. --- packages/core/src/policy/config.ts | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/packages/core/src/policy/config.ts b/packages/core/src/policy/config.ts index fe0184b298c..a980b250e48 100644 --- a/packages/core/src/policy/config.ts +++ b/packages/core/src/policy/config.ts @@ -446,7 +446,9 @@ export async function createPolicyEngineConfig( priority: number, source: string, modes?: ApprovalMode[], + addDefaultDenyForTools = false, ) => { + const toolsWithNarrowing = new Set(); for (const tool of tools) { // Check for legacy format: toolName(args) const match = tool.match(/^([a-zA-Z0-9_-]+)\((.*)\)$/); @@ -459,6 +461,7 @@ export async function createPolicyEngineConfig( // Treat args as a command prefix for shell tool if (toolName === SHELL_TOOL_NAME) { + toolsWithNarrowing.add(toolName); const patterns = buildArgsPatterns(undefined, args); for (const pattern of patterns) { if (pattern) { @@ -497,6 +500,18 @@ export async function createPolicyEngineConfig( }); } } + + if (addDefaultDenyForTools) { + for (const toolName of toolsWithNarrowing) { + rules.push({ + toolName, + decision: PolicyDecision.DENY, + priority: priority - 0.01, + source: `${source} (Narrowing Enforcement)`, + modes, + }); + } + } }; // Tools that are explicitly allowed in the settings. @@ -506,7 +521,8 @@ export async function createPolicyEngineConfig( settings.tools.allowed, ALLOWED_TOOLS_FLAG_PRIORITY, 'Settings (Tools Allowed)', - nonPlanModes, + undefined, + true, ); } From 1734a03f2abbcc0520d96a32670317d3e70338c5 Mon Sep 17 00:00:00 2001 From: davidapierce Date: Thu, 23 Apr 2026 17:08:27 +0000 Subject: [PATCH 07/12] set aggregateDecision to ASK_USER if any result dictates asking the user. --- packages/core/src/policy/policy-engine.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/core/src/policy/policy-engine.ts b/packages/core/src/policy/policy-engine.ts index 84109545025..e0e3c61215c 100644 --- a/packages/core/src/policy/policy-engine.ts +++ b/packages/core/src/policy/policy-engine.ts @@ -442,11 +442,11 @@ export class PolicyEngine { return wrapperResult; if (wrapperResult.decision === PolicyDecision.ASK_USER) { if (aggregateDecision === PolicyDecision.ALLOW) { - aggregateDecision = PolicyDecision.ASK_USER; responsibleRule = wrapperResult.rule; } else { responsibleRule ??= wrapperResult.rule; } + aggregateDecision = PolicyDecision.ASK_USER; } } @@ -463,11 +463,11 @@ export class PolicyEngine { if (subResult.decision === PolicyDecision.ASK_USER) { if (aggregateDecision === PolicyDecision.ALLOW) { - aggregateDecision = PolicyDecision.ASK_USER; responsibleRule = subResult.rule; } else { responsibleRule ??= subResult.rule; } + aggregateDecision = PolicyDecision.ASK_USER; } // Downgrade if sub-command has redirection From ad56ab5069c238d03f15745f22685c2c172bb785 Mon Sep 17 00:00:00 2001 From: davidapierce Date: Thu, 23 Apr 2026 17:46:29 +0000 Subject: [PATCH 08/12] respect confirmation required setting when set by users in updated policy config --- packages/cli/src/config/settingsSchema.ts | 13 +++++++++++++ packages/core/src/policy/config.ts | 14 ++++++++++++++ packages/core/src/policy/types.ts | 1 + 3 files changed, 28 insertions(+) diff --git a/packages/cli/src/config/settingsSchema.ts b/packages/cli/src/config/settingsSchema.ts index f5da86b60af..05d4cfae7f2 100644 --- a/packages/cli/src/config/settingsSchema.ts +++ b/packages/cli/src/config/settingsSchema.ts @@ -1667,6 +1667,19 @@ const SETTINGS_SCHEMA = { showInDialog: false, items: { type: 'string' }, }, + confirmationRequired: { + type: 'array', + label: 'Confirmation Required', + category: 'Advanced', + requiresRestart: true, + default: undefined as string[] | undefined, + description: oneLine` + Tool names that always require user confirmation. + Takes precedence over allowed tools and core tool allowlists. + `, + showInDialog: false, + items: { type: 'string' }, + }, exclude: { type: 'array', label: 'Exclude Tools', diff --git a/packages/core/src/policy/config.ts b/packages/core/src/policy/config.ts index a980b250e48..6d978479cb9 100644 --- a/packages/core/src/policy/config.ts +++ b/packages/core/src/policy/config.ts @@ -74,6 +74,7 @@ export const ADMIN_POLICY_TIER = 5; export const MCP_EXCLUDED_PRIORITY = USER_POLICY_TIER + 0.9; export const EXCLUDE_TOOLS_FLAG_PRIORITY = USER_POLICY_TIER + 0.4; +export const CONFIRMATION_REQUIRED_PRIORITY = USER_POLICY_TIER + 0.35; export const ALLOWED_TOOLS_FLAG_PRIORITY = USER_POLICY_TIER + 0.3; export const CORE_TOOLS_FLAG_PRIORITY = USER_POLICY_TIER + 0.25; export const TRUSTED_MCP_SERVER_PRIORITY = USER_POLICY_TIER + 0.2; @@ -526,6 +527,19 @@ export async function createPolicyEngineConfig( ); } + // Tools that explicitly require confirmation in the settings. + // Priority: CONFIRMATION_REQUIRED_PRIORITY (overrides allowed and core) + if (settings.tools?.confirmationRequired) { + for (const tool of settings.tools.confirmationRequired) { + rules.push({ + toolName: SHELL_TOOL_NAMES.includes(tool) ? SHELL_TOOL_NAME : tool, + decision: PolicyDecision.ASK_USER, + priority: CONFIRMATION_REQUIRED_PRIORITY, + source: 'Settings (Confirmation Required)', + }); + } + } + // Core tools that are restricted in the settings. // Priority: CORE_TOOLS_FLAG_PRIORITY (user tier - core tool allowlist) if (settings.tools?.core) { diff --git a/packages/core/src/policy/types.ts b/packages/core/src/policy/types.ts index aa5275865c9..672e3f84169 100644 --- a/packages/core/src/policy/types.ts +++ b/packages/core/src/policy/types.ts @@ -338,6 +338,7 @@ export interface PolicySettings { core?: string[]; exclude?: string[]; allowed?: string[]; + confirmationRequired?: string[]; }; mcpServers?: Record; // User provided policies that will replace the USER level policies in ~/.gemini/policies From 4c6ec402e66d066151b8238be31eb4962c3b29a6 Mon Sep 17 00:00:00 2001 From: davidapierce Date: Thu, 23 Apr 2026 18:05:34 +0000 Subject: [PATCH 09/12] regenerate settings schema and update configuration.md to include tools confirmation setting --- docs/reference/configuration.md | 6 ++++++ schemas/settings.schema.json | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 56dd6b4b5d9..46b2ff980ec 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -1467,6 +1467,12 @@ their corresponding top-level category object in your `settings.json` file. - **Default:** `undefined` - **Requires restart:** Yes +- **`tools.confirmationRequired`** (array): + - **Description:** Tool names that always require user confirmation. Takes + precedence over allowed tools and core tool allowlists. + - **Default:** `undefined` + - **Requires restart:** Yes + - **`tools.exclude`** (array): - **Description:** Tool names to exclude from discovery. - **Default:** `undefined` diff --git a/schemas/settings.schema.json b/schemas/settings.schema.json index e7b362fc4e3..0e4d0050378 100644 --- a/schemas/settings.schema.json +++ b/schemas/settings.schema.json @@ -2531,6 +2531,15 @@ "type": "string" } }, + "confirmationRequired": { + "title": "Confirmation Required", + "description": "Tool names that always require user confirmation. Takes precedence over allowed tools and core tool allowlists.", + "markdownDescription": "Tool names that always require user confirmation. Takes precedence over allowed tools and core tool allowlists.\n\n- Category: `Advanced`\n- Requires restart: `yes`", + "type": "array", + "items": { + "type": "string" + } + }, "exclude": { "title": "Exclude Tools", "description": "Tool names to exclude from discovery.", From 2e21f857e3729a8014c58086697041802c2fb711 Mon Sep 17 00:00:00 2001 From: davidapierce Date: Thu, 23 Apr 2026 18:56:10 +0000 Subject: [PATCH 10/12] add trusted workspace setting to evals --- .github/workflows/eval-pr.yml | 1 + .github/workflows/eval.yml | 1 + .github/workflows/evals-nightly.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml index 3e6784960ca..84d2dca5945 100644 --- a/.github/workflows/eval-pr.yml +++ b/.github/workflows/eval-pr.yml @@ -153,6 +153,7 @@ jobs: GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' MODEL_LIST: '${{ env.MODEL_LIST }}' + GEMINI_CLI_TRUST_WORKSPACE: true run: | # Run the regression check loop. The script saves the report to a file. node scripts/run_eval_regression.js diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 23dc1cfdfbd..e2a83eac706 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -30,6 +30,7 @@ jobs: GEMINI_API_KEY: '${{ secrets.EVAL_GEMINI_API_KEY }}' GCLI_LOCAL_FILE_TELEMETRY: 'True' EVAL_GCS_BUCKET: '${{ vars.EVAL_GCS_ARTIFACTS_BUCKET }}' + GEMINI_CLI_TRUST_WORKSPACE: true steps: - name: 'Authenticate to Google Cloud' id: 'auth' diff --git a/.github/workflows/evals-nightly.yml b/.github/workflows/evals-nightly.yml index fbb770ac847..99f983b8a68 100644 --- a/.github/workflows/evals-nightly.yml +++ b/.github/workflows/evals-nightly.yml @@ -68,6 +68,7 @@ jobs: GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' GEMINI_MODEL: '${{ matrix.model }}' RUN_EVALS: 'true' + GEMINI_CLI_TRUST_WORKSPACE: true EVAL_SUITE_TYPE: "${{ github.event.inputs.suite_type || 'behavioral' }}" EVAL_SUITE_NAME: '${{ github.event.inputs.suite_name }}' TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}' From bf768cb7a69b970e6dd4ab9e65cb4fff1980e2b3 Mon Sep 17 00:00:00 2001 From: Emily Hedlund Date: Thu, 23 Apr 2026 15:25:37 -0400 Subject: [PATCH 11/12] set GEMINI_CLI_TRUST_WORKSPACE in evals test helper and revert yml changes --- .github/workflows/eval-pr.yml | 1 - .github/workflows/eval.yml | 1 - .github/workflows/evals-nightly.yml | 1 - evals/test-helper.ts | 1 + 4 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml index 84d2dca5945..3e6784960ca 100644 --- a/.github/workflows/eval-pr.yml +++ b/.github/workflows/eval-pr.yml @@ -153,7 +153,6 @@ jobs: GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' MODEL_LIST: '${{ env.MODEL_LIST }}' - GEMINI_CLI_TRUST_WORKSPACE: true run: | # Run the regression check loop. The script saves the report to a file. node scripts/run_eval_regression.js diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index e2a83eac706..23dc1cfdfbd 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -30,7 +30,6 @@ jobs: GEMINI_API_KEY: '${{ secrets.EVAL_GEMINI_API_KEY }}' GCLI_LOCAL_FILE_TELEMETRY: 'True' EVAL_GCS_BUCKET: '${{ vars.EVAL_GCS_ARTIFACTS_BUCKET }}' - GEMINI_CLI_TRUST_WORKSPACE: true steps: - name: 'Authenticate to Google Cloud' id: 'auth' diff --git a/.github/workflows/evals-nightly.yml b/.github/workflows/evals-nightly.yml index 99f983b8a68..fbb770ac847 100644 --- a/.github/workflows/evals-nightly.yml +++ b/.github/workflows/evals-nightly.yml @@ -68,7 +68,6 @@ jobs: GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' GEMINI_MODEL: '${{ matrix.model }}' RUN_EVALS: 'true' - GEMINI_CLI_TRUST_WORKSPACE: true EVAL_SUITE_TYPE: "${{ github.event.inputs.suite_type || 'behavioral' }}" EVAL_SUITE_NAME: '${{ github.event.inputs.suite_name }}' TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}' diff --git a/evals/test-helper.ts b/evals/test-helper.ts index 7369a6919c9..af6bade2015 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -172,6 +172,7 @@ export async function internalEvalTest(evalCase: EvalCase) { timeout: evalCase.timeout, env: { GEMINI_CLI_ACTIVITY_LOG_TARGET: activityLogFile, + GEMINI_CLI_TRUST_WORKSPACE: 'true', }, }); From 173d4d6af040185756f4db722c1a88c5194285ab Mon Sep 17 00:00:00 2001 From: davidapierce Date: Thu, 23 Apr 2026 19:57:45 +0000 Subject: [PATCH 12/12] disable the memoryV2 experiment in eval tests since they don't play nice with default read write permissions. --- evals/save_memory.eval.ts | 55 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/evals/save_memory.eval.ts b/evals/save_memory.eval.ts index 8680f8eba8c..b31167fb4a0 100644 --- a/evals/save_memory.eval.ts +++ b/evals/save_memory.eval.ts @@ -18,6 +18,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: rememberingFavoriteColor, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `remember that my favorite color is blue. @@ -40,6 +45,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: rememberingCommandRestrictions, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `I don't want you to ever run npm commands.`, assert: async (rig, result) => { @@ -61,6 +71,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: rememberingWorkflow, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `I want you to always lint after building.`, assert: async (rig, result) => { @@ -83,6 +98,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: ignoringTemporaryInformation, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `I'm going to get a coffee.`, assert: async (rig, result) => { @@ -108,6 +128,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: rememberingPetName, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `Please remember that my dog's name is Buddy.`, assert: async (rig, result) => { @@ -129,6 +154,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: rememberingCommandAlias, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `When I say 'start server', you should run 'npm run dev'.`, assert: async (rig, result) => { @@ -151,6 +181,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: savingDbSchemaLocationAsProjectMemory, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`, assert: async (rig, result) => { const wasToolCalled = await rig.waitForToolCall( @@ -180,6 +215,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: rememberingCodingStyle, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `I prefer to use tabs instead of spaces for indentation.`, assert: async (rig, result) => { @@ -202,6 +242,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: savingBuildArtifactLocationAsProjectMemory, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`, assert: async (rig, result) => { const wasToolCalled = await rig.waitForToolCall( @@ -231,6 +276,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: savingMainEntryPointAsProjectMemory, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `The main entry point for this workspace is \`src/index.js\`.`, assert: async (rig, result) => { const wasToolCalled = await rig.waitForToolCall( @@ -259,6 +309,11 @@ describe('save_memory', () => { suiteName: 'default', suiteType: 'behavioral', name: rememberingBirthday, + params: { + settings: { + experimental: { memoryV2: false }, + }, + }, prompt: `My birthday is on June 15th.`, assert: async (rig, result) => {