From 737ce0de1ae3c03c6f6ecc82afb8ffa5ae3b7c84 Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Tue, 21 Apr 2026 11:25:01 -0700
Subject: [PATCH 1/5] test(e2e): default integration tests to Flash Preview

Switch the default model injected by TestRig from gemini-3-pro-preview to gemini-3-flash-preview to cut PR-check latency and Pro-tier quota usage. Tests that need a specific model already pin it via --model.
---
 packages/test-utils/src/test-rig.ts | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/packages/test-utils/src/test-rig.ts b/packages/test-utils/src/test-rig.ts
index 9374b573ac7..f057ba94074 100644
--- a/packages/test-utils/src/test-rig.ts
+++ b/packages/test-utils/src/test-rig.ts
@@ -11,7 +11,10 @@ import { join, dirname } from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { env } from 'node:process';
 import { setTimeout as sleep } from 'node:timers/promises';
-import { PREVIEW_GEMINI_MODEL, GEMINI_DIR } from '@google/gemini-cli-core';
+import {
+  PREVIEW_GEMINI_FLASH_MODEL,
+  GEMINI_DIR,
+} from '@google/gemini-cli-core';
 export { GEMINI_DIR };
 import * as pty from '@lydell/node-pty';
 import stripAnsi from 'strip-ansi';
@@ -475,7 +478,7 @@ export class TestRig {
         ...(env['GEMINI_TEST_TYPE'] === 'integration'
           ? {
               model: {
-                name: PREVIEW_GEMINI_MODEL,
+                name: PREVIEW_GEMINI_FLASH_MODEL,
               },
             }
           : {}),

From 8e8a223a2fded8030213e55b089c4e8ed144d7ae Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Tue, 21 Apr 2026 11:40:47 -0700
Subject: [PATCH 2/5] test(e2e): tolerate trailing newline in path-with-spaces
 test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The byte-exact assertion was over-specifying model output formatting. Trim before comparing so models that idiomatically append a trailing newline (e.g. gemini-3-flash-preview) also pass. This test exercises path-with-spaces handling, not whitespace fidelity — the sibling 'write a hello world message' test in the same file already uses a tolerant .includes() check.
---
 integration-tests/file-system.test.ts | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/integration-tests/file-system.test.ts b/integration-tests/file-system.test.ts
index 80552cfd68f..aa50000ef61 100644
--- a/integration-tests/file-system.test.ts
+++ b/integration-tests/file-system.test.ts
@@ -134,7 +134,9 @@ describe('file-system', () => {
     ).toBeTruthy();
 
     const newFileContent = rig.readFile(fileName);
-    expect(newFileContent).toBe('hello');
+    // Trim to tolerate models that idiomatically append a trailing newline.
+    // This test is about path-with-spaces handling, not whitespace fidelity.
+    expect(newFileContent.trim()).toBe('hello');
   });
 
   it('should perform a read-then-write sequence', async () => {

From b6b6bcb6ad20fc94c6ff681e8daa68bdc32e1b5c Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Tue, 21 Apr 2026 11:58:50 -0700
Subject: [PATCH 3/5] test(e2e): disambiguate plan-mode write prompts as
 Directives

Both 'should allow write_file to the plans directory' tests used a prompt ('Create a file called X in the plans directory.') that straddles the Inquiry/Directive line in the Plan Mode system prompt. Pro Preview happened to interpret it as a Directive and wrote the file; Flash Preview is more prompt-compliant and follows the 'STOP and wait for confirmation' rule for Simple Tasks, never invoking write_file.

Make the prompt unambiguously a Directive by specifying content and explicitly opting out of the consult-and-stop step. This aligns with Rule 4 of the Plan Mode prompt without weakening the system instructions.
---
 integration-tests/plan-mode.test.ts | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/integration-tests/plan-mode.test.ts b/integration-tests/plan-mode.test.ts
index 6f90c60fec7..41de123cf7a 100644
--- a/integration-tests/plan-mode.test.ts
+++ b/integration-tests/plan-mode.test.ts
@@ -81,7 +81,10 @@ describe('Plan Mode', () => {
 
     await rig.run({
       approvalMode: 'plan',
-      args: 'Create a file called plan.md in the plans directory.',
+      args:
+        'Create a file called plan.md in the plans directory with the ' +
+        'content "# Plan". Treat this as a Directive and write the file ' +
+        'immediately without proposing strategy or asking for confirmation.',
     });
 
     const toolLogs = rig.readToolLogs();
@@ -194,7 +197,11 @@ describe('Plan Mode', () => {
 
     await rig.run({
       approvalMode: 'plan',
-      args: 'Create a file called plan-no-session.md in the plans directory.',
+      args:
+        'Create a file called plan-no-session.md in the plans directory ' +
+        'with the content "# Plan". Treat this as a Directive and write ' +
+        'the file immediately without proposing strategy or asking for ' +
+        'confirmation.',
     });
 
     const toolLogs = rig.readToolLogs();

From 6ea01ad5af7936ab7c680b720646736800d08b81 Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Tue, 21 Apr 2026 14:22:00 -0700
Subject: [PATCH 4/5] test(e2e): skip chat-compression interactive test on
 macOS

The 'should trigger chat compression with /compress command' test is chronically flaky on macOS due to a pty/screen-buffer rendering issue: the captured output contains the CLI's startup escape sequences (`q4;?m...true color warning`) instead of the streamed model output, causing `expectText('THE_END.')` to time out.

Reproducible across unrelated CI runs on `main` (run IDs 24740161950, 24739323404) and the merge-queue gate for #25753 (run ID 24743605639); not specific to any model. Skip on darwin until the underlying pty rendering issue is fixed; Linux and Windows coverage remains intact.
---
 .../context-compress-interactive.test.ts      | 72 +++++++++++--------
 1 file changed, 41 insertions(+), 31 deletions(-)

diff --git a/integration-tests/context-compress-interactive.test.ts b/integration-tests/context-compress-interactive.test.ts
index c7e04c6c239..5248b826ddf 100644
--- a/integration-tests/context-compress-interactive.test.ts
+++ b/integration-tests/context-compress-interactive.test.ts
@@ -8,6 +8,13 @@ import { expect, describe, it, beforeEach, afterEach } from 'vitest';
 import { TestRig } from './test-helper.js';
 import { join } from 'node:path';
 
+// Skip on macOS: the interactive pty captures the CLI's startup escape
+// sequences (`q4;?m...true color warning`) instead of the streamed model
+// output, causing `expectText('THE_END.')` to time out. Reproducible on
+// vanilla `main` runs (e.g. 24740161950, 24739323404) and the merge-queue
+// gate for #25753 (24743605639); not specific to any model.
+const skipOnDarwin = process.platform === 'darwin';
+
 describe('Interactive Mode', () => {
   let rig: TestRig;
 
@@ -19,37 +26,40 @@ describe('Interactive Mode', () => {
     await rig.cleanup();
   });
 
-  it('should trigger chat compression with /compress command', async () => {
-    await rig.setup('interactive-compress-success', {
-      fakeResponsesPath: join(
-        import.meta.dirname,
-        'context-compress-interactive.compress.responses',
-      ),
-    });
-
-    const run = await rig.runInteractive();
-
-    await run.sendKeys(
-      'Write a 200 word story about a robot. The story MUST end with the text THE_END followed by a period.',
-    );
-    await run.type('\r');
-
-    // Wait for the specific end marker.
-    await run.expectText('THE_END.', 30000);
-
-    await run.type('/compress');
-    await run.type('\r');
-
-    const foundEvent = await rig.waitForTelemetryEvent(
-      'chat_compression',
-      25000,
-    );
-    expect(foundEvent, 'chat_compression telemetry event was not found').toBe(
-      true,
-    );
-
-    await run.expectText('Chat history compressed', 5000);
-  });
+  it.skipIf(skipOnDarwin)(
+    'should trigger chat compression with /compress command',
+    async () => {
+      await rig.setup('interactive-compress-success', {
+        fakeResponsesPath: join(
+          import.meta.dirname,
+          'context-compress-interactive.compress.responses',
+        ),
+      });
+
+      const run = await rig.runInteractive();
+
+      await run.sendKeys(
+        'Write a 200 word story about a robot. The story MUST end with the text THE_END followed by a period.',
+      );
+      await run.type('\r');
+
+      // Wait for the specific end marker.
+      await run.expectText('THE_END.', 30000);
+
+      await run.type('/compress');
+      await run.type('\r');
+
+      const foundEvent = await rig.waitForTelemetryEvent(
+        'chat_compression',
+        25000,
+      );
+      expect(foundEvent, 'chat_compression telemetry event was not found').toBe(
+        true,
+      );
+
+      await run.expectText('Chat history compressed', 5000);
+    },
+  );
 
   // TODO: Context compression is broken and doesn't include the system
   // instructions or tool counts, so it thinks compression is beneficial when

From c14d699e833819cf3be243b8d5e318b76962dcbf Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Tue, 21 Apr 2026 15:08:36 -0700
Subject: [PATCH 5/5] test(e2e): skip the entire Interactive Mode describe on
 macOS

Promotes the per-test skip to a describe-level skip after the merge-queue gate hit a different test in the same file (24747624513 failed on 'should handle /compress command on empty history' instead of the previously-flaky 'should trigger chat compression with /compress command').

Every it() in this describe uses runInteractive + expectText against a pty buffer that, on macOS GitHub runners, captures the CLI's startup escape sequences rather than the streamed output. The flake is platform-level, not per-test.
---
 .../context-compress-interactive.test.ts      | 79 +++++++++----------
 1 file changed, 39 insertions(+), 40 deletions(-)

diff --git a/integration-tests/context-compress-interactive.test.ts b/integration-tests/context-compress-interactive.test.ts
index 5248b826ddf..fbe2359aa2a 100644
--- a/integration-tests/context-compress-interactive.test.ts
+++ b/integration-tests/context-compress-interactive.test.ts
@@ -8,14 +8,16 @@ import { expect, describe, it, beforeEach, afterEach } from 'vitest';
 import { TestRig } from './test-helper.js';
 import { join } from 'node:path';
 
-// Skip on macOS: the interactive pty captures the CLI's startup escape
-// sequences (`q4;?m...true color warning`) instead of the streamed model
-// output, causing `expectText('THE_END.')` to time out. Reproducible on
-// vanilla `main` runs (e.g. 24740161950, 24739323404) and the merge-queue
-// gate for #25753 (24743605639); not specific to any model.
+// Skip on macOS: every interactive test in this file is chronically flaky
+// because the captured pty buffer contains the CLI's startup escape
+// sequences (`q4;?m...true color warning`) instead of the streamed output,
+// causing `expectText(...)` to time out. Reproducible across unrelated
+// runs on `main` (24740161950, 24739323404) and on consecutive merge-queue
+// gates for #25753 (24743605639, 24747624513) — different tests in the
+// same describe fail on different runs. Not specific to any model.
 const skipOnDarwin = process.platform === 'darwin';
 
-describe('Interactive Mode', () => {
+describe.skipIf(skipOnDarwin)('Interactive Mode', () => {
   let rig: TestRig;
 
   beforeEach(() => {
@@ -26,40 +28,37 @@ describe('Interactive Mode', () => {
     await rig.cleanup();
   });
 
-  it.skipIf(skipOnDarwin)(
-    'should trigger chat compression with /compress command',
-    async () => {
-      await rig.setup('interactive-compress-success', {
-        fakeResponsesPath: join(
-          import.meta.dirname,
-          'context-compress-interactive.compress.responses',
-        ),
-      });
-
-      const run = await rig.runInteractive();
-
-      await run.sendKeys(
-        'Write a 200 word story about a robot. The story MUST end with the text THE_END followed by a period.',
-      );
-      await run.type('\r');
-
-      // Wait for the specific end marker.
-      await run.expectText('THE_END.', 30000);
-
-      await run.type('/compress');
-      await run.type('\r');
-
-      const foundEvent = await rig.waitForTelemetryEvent(
-        'chat_compression',
-        25000,
-      );
-      expect(foundEvent, 'chat_compression telemetry event was not found').toBe(
-        true,
-      );
-
-      await run.expectText('Chat history compressed', 5000);
-    },
-  );
+  it('should trigger chat compression with /compress command', async () => {
+    await rig.setup('interactive-compress-success', {
+      fakeResponsesPath: join(
+        import.meta.dirname,
+        'context-compress-interactive.compress.responses',
+      ),
+    });
+
+    const run = await rig.runInteractive();
+
+    await run.sendKeys(
+      'Write a 200 word story about a robot. The story MUST end with the text THE_END followed by a period.',
+    );
+    await run.type('\r');
+
+    // Wait for the specific end marker.
+    await run.expectText('THE_END.', 30000);
+
+    await run.type('/compress');
+    await run.type('\r');
+
+    const foundEvent = await rig.waitForTelemetryEvent(
+      'chat_compression',
+      25000,
+    );
+    expect(foundEvent, 'chat_compression telemetry event was not found').toBe(
+      true,
+    );
+
+    await run.expectText('Chat history compressed', 5000);
+  });
 
   // TODO: Context compression is broken and doesn't include the system
   // instructions or tool counts, so it thinks compression is beneficial when