From 737ce0de1ae3c03c6f6ecc82afb8ffa5ae3b7c84 Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Tue, 21 Apr 2026 11:25:01 -0700 Subject: [PATCH 1/5] test(e2e): default integration tests to Flash Preview Switch the default model injected by TestRig from gemini-3-pro-preview to gemini-3-flash-preview to cut PR-check latency and Pro-tier quota usage. Tests that need a specific model already pin it via --model. --- packages/test-utils/src/test-rig.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/test-utils/src/test-rig.ts b/packages/test-utils/src/test-rig.ts index 9374b573ac7..f057ba94074 100644 --- a/packages/test-utils/src/test-rig.ts +++ b/packages/test-utils/src/test-rig.ts @@ -11,7 +11,10 @@ import { join, dirname } from 'node:path'; import { fileURLToPath } from 'node:url'; import { env } from 'node:process'; import { setTimeout as sleep } from 'node:timers/promises'; -import { PREVIEW_GEMINI_MODEL, GEMINI_DIR } from '@google/gemini-cli-core'; +import { + PREVIEW_GEMINI_FLASH_MODEL, + GEMINI_DIR, +} from '@google/gemini-cli-core'; export { GEMINI_DIR }; import * as pty from '@lydell/node-pty'; import stripAnsi from 'strip-ansi'; @@ -475,7 +478,7 @@ export class TestRig { ...(env['GEMINI_TEST_TYPE'] === 'integration' ? { model: { - name: PREVIEW_GEMINI_MODEL, + name: PREVIEW_GEMINI_FLASH_MODEL, }, } : {}), From 8e8a223a2fded8030213e55b089c4e8ed144d7ae Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Tue, 21 Apr 2026 11:40:47 -0700 Subject: [PATCH 2/5] test(e2e): tolerate trailing newline in path-with-spaces test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The byte-exact assertion was over-specifying model output formatting. Trim before comparing so models that idiomatically append a trailing newline (e.g. gemini-3-flash-preview) also pass. This test exercises path-with-spaces handling, not whitespace fidelity — the sibling 'write a hello world message' test in the same file already uses a tolerant .includes() check. --- integration-tests/file-system.test.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/integration-tests/file-system.test.ts b/integration-tests/file-system.test.ts index 80552cfd68f..aa50000ef61 100644 --- a/integration-tests/file-system.test.ts +++ b/integration-tests/file-system.test.ts @@ -134,7 +134,9 @@ describe('file-system', () => { ).toBeTruthy(); const newFileContent = rig.readFile(fileName); - expect(newFileContent).toBe('hello'); + // Trim to tolerate models that idiomatically append a trailing newline. + // This test is about path-with-spaces handling, not whitespace fidelity. + expect(newFileContent.trim()).toBe('hello'); }); it('should perform a read-then-write sequence', async () => { From b6b6bcb6ad20fc94c6ff681e8daa68bdc32e1b5c Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Tue, 21 Apr 2026 11:58:50 -0700 Subject: [PATCH 3/5] test(e2e): disambiguate plan-mode write prompts as Directives Both 'should allow write_file to the plans directory' tests used a prompt ('Create a file called X in the plans directory.') that straddles the Inquiry/Directive line in the Plan Mode system prompt. Pro Preview happened to interpret it as a Directive and wrote the file; Flash Preview is more prompt-compliant and follows the 'STOP and wait for confirmation' rule for Simple Tasks, never invoking write_file. Make the prompt unambiguously a Directive by specifying content and explicitly opting out of the consult-and-stop step. Aligns with Rule 4 of the Plan Mode prompt without weakening the system instructions. --- integration-tests/plan-mode.test.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/integration-tests/plan-mode.test.ts b/integration-tests/plan-mode.test.ts index 6f90c60fec7..41de123cf7a 100644 --- a/integration-tests/plan-mode.test.ts +++ b/integration-tests/plan-mode.test.ts @@ -81,7 +81,10 @@ describe('Plan Mode', () => { await rig.run({ approvalMode: 'plan', - args: 'Create a file called plan.md in the plans directory.', + args: + 'Create a file called plan.md in the plans directory with the ' + + 'content "# Plan". Treat this as a Directive and write the file ' + + 'immediately without proposing strategy or asking for confirmation.', }); const toolLogs = rig.readToolLogs(); @@ -194,7 +197,11 @@ describe('Plan Mode', () => { await rig.run({ approvalMode: 'plan', - args: 'Create a file called plan-no-session.md in the plans directory.', + args: + 'Create a file called plan-no-session.md in the plans directory ' + + 'with the content "# Plan". Treat this as a Directive and write ' + + 'the file immediately without proposing strategy or asking for ' + + 'confirmation.', }); const toolLogs = rig.readToolLogs(); From 6ea01ad5af7936ab7c680b720646736800d08b81 Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Tue, 21 Apr 2026 14:22:00 -0700 Subject: [PATCH 4/5] test(e2e): skip chat-compression interactive test on macOS The 'should trigger chat compression with /compress command' test is chronically flaky on macOS due to a pty/screen-buffer rendering issue: the captured output contains the CLI's startup escape sequences (`q4;?m...true color warning`) instead of the streamed model output, causing `expectText('THE_END.')` to time out. Reproducible across unrelated runs on `main` (24740161950, 24739323404) and the merge-queue gate for #25753 (24743605639); not specific to any model. Skip on darwin until the underlying pty rendering issue is fixed; Linux+Windows coverage remains intact. --- .../context-compress-interactive.test.ts | 72 +++++++++++-------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/integration-tests/context-compress-interactive.test.ts b/integration-tests/context-compress-interactive.test.ts index c7e04c6c239..5248b826ddf 100644 --- a/integration-tests/context-compress-interactive.test.ts +++ b/integration-tests/context-compress-interactive.test.ts @@ -8,6 +8,13 @@ import { expect, describe, it, beforeEach, afterEach } from 'vitest'; import { TestRig } from './test-helper.js'; import { join } from 'node:path'; +// Skip on macOS: the interactive pty captures the CLI's startup escape +// sequences (`q4;?m...true color warning`) instead of the streamed model +// output, causing `expectText('THE_END.')` to time out. Reproducible on +// vanilla `main` runs (e.g. 24740161950, 24739323404) and the merge-queue +// gate for #25753 (24743605639); not specific to any model. +const skipOnDarwin = process.platform === 'darwin'; + describe('Interactive Mode', () => { let rig: TestRig; @@ -19,37 +26,40 @@ describe('Interactive Mode', () => { await rig.cleanup(); }); - it('should trigger chat compression with /compress command', async () => { - await rig.setup('interactive-compress-success', { - fakeResponsesPath: join( - import.meta.dirname, - 'context-compress-interactive.compress.responses', - ), - }); - - const run = await rig.runInteractive(); - - await run.sendKeys( - 'Write a 200 word story about a robot. The story MUST end with the text THE_END followed by a period.', - ); - await run.type('\r'); - - // Wait for the specific end marker. - await run.expectText('THE_END.', 30000); - - await run.type('/compress'); - await run.type('\r'); - - const foundEvent = await rig.waitForTelemetryEvent( - 'chat_compression', - 25000, - ); - expect(foundEvent, 'chat_compression telemetry event was not found').toBe( - true, - ); - - await run.expectText('Chat history compressed', 5000); - }); + it.skipIf(skipOnDarwin)( + 'should trigger chat compression with /compress command', + async () => { + await rig.setup('interactive-compress-success', { + fakeResponsesPath: join( + import.meta.dirname, + 'context-compress-interactive.compress.responses', + ), + }); + + const run = await rig.runInteractive(); + + await run.sendKeys( + 'Write a 200 word story about a robot. The story MUST end with the text THE_END followed by a period.', + ); + await run.type('\r'); + + // Wait for the specific end marker. + await run.expectText('THE_END.', 30000); + + await run.type('/compress'); + await run.type('\r'); + + const foundEvent = await rig.waitForTelemetryEvent( + 'chat_compression', + 25000, + ); + expect(foundEvent, 'chat_compression telemetry event was not found').toBe( + true, + ); + + await run.expectText('Chat history compressed', 5000); + }, + ); // TODO: Context compression is broken and doesn't include the system // instructions or tool counts, so it thinks compression is beneficial when From c14d699e833819cf3be243b8d5e318b76962dcbf Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Tue, 21 Apr 2026 15:08:36 -0700 Subject: [PATCH 5/5] test(e2e): skip the entire Interactive Mode describe on macOS Promotes the per-test skip to a describe-level skip after the merge-queue gate hit a different test in the same file (24747624513 failed on 'should handle /compress command on empty history' instead of the previously-flaky 'should trigger chat compression with /compress command'). Every it() in this describe uses runInteractive + expectText against a pty buffer that, on macOS GitHub runners, captures the CLI's startup escape sequences rather than the streamed output. The flake is platform-level, not per-test. --- .../context-compress-interactive.test.ts | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/integration-tests/context-compress-interactive.test.ts b/integration-tests/context-compress-interactive.test.ts index 5248b826ddf..fbe2359aa2a 100644 --- a/integration-tests/context-compress-interactive.test.ts +++ b/integration-tests/context-compress-interactive.test.ts @@ -8,14 +8,16 @@ import { expect, describe, it, beforeEach, afterEach } from 'vitest'; import { TestRig } from './test-helper.js'; import { join } from 'node:path'; -// Skip on macOS: the interactive pty captures the CLI's startup escape -// sequences (`q4;?m...true color warning`) instead of the streamed model -// output, causing `expectText('THE_END.')` to time out. Reproducible on -// vanilla `main` runs (e.g. 24740161950, 24739323404) and the merge-queue -// gate for #25753 (24743605639); not specific to any model. +// Skip on macOS: every interactive test in this file is chronically flaky +// because the captured pty buffer contains the CLI's startup escape +// sequences (`q4;?m...true color warning`) instead of the streamed output, +// causing `expectText(...)` to time out. Reproducible across unrelated +// runs on `main` (24740161950, 24739323404) and on consecutive merge-queue +// gates for #25753 (24743605639, 24747624513) — different tests in the +// same describe fail on different runs. Not specific to any model. const skipOnDarwin = process.platform === 'darwin'; -describe('Interactive Mode', () => { +describe.skipIf(skipOnDarwin)('Interactive Mode', () => { let rig: TestRig; beforeEach(() => { @@ -26,40 +28,37 @@ describe('Interactive Mode', () => { await rig.cleanup(); }); - it.skipIf(skipOnDarwin)( - 'should trigger chat compression with /compress command', - async () => { - await rig.setup('interactive-compress-success', { - fakeResponsesPath: join( - import.meta.dirname, - 'context-compress-interactive.compress.responses', - ), - }); - - const run = await rig.runInteractive(); - - await run.sendKeys( - 'Write a 200 word story about a robot. The story MUST end with the text THE_END followed by a period.', - ); - await run.type('\r'); - - // Wait for the specific end marker. - await run.expectText('THE_END.', 30000); - - await run.type('/compress'); - await run.type('\r'); - - const foundEvent = await rig.waitForTelemetryEvent( - 'chat_compression', - 25000, - ); - expect(foundEvent, 'chat_compression telemetry event was not found').toBe( - true, - ); - - await run.expectText('Chat history compressed', 5000); - }, - ); + it('should trigger chat compression with /compress command', async () => { + await rig.setup('interactive-compress-success', { + fakeResponsesPath: join( + import.meta.dirname, + 'context-compress-interactive.compress.responses', + ), + }); + + const run = await rig.runInteractive(); + + await run.sendKeys( + 'Write a 200 word story about a robot. The story MUST end with the text THE_END followed by a period.', + ); + await run.type('\r'); + + // Wait for the specific end marker. + await run.expectText('THE_END.', 30000); + + await run.type('/compress'); + await run.type('\r'); + + const foundEvent = await rig.waitForTelemetryEvent( + 'chat_compression', + 25000, + ); + expect(foundEvent, 'chat_compression telemetry event was not found').toBe( + true, + ); + + await run.expectText('Chat history compressed', 5000); + }); // TODO: Context compression is broken and doesn't include the system // instructions or tool counts, so it thinks compression is beneficial when