From 99c7108bb08e52b6550743dbe721a579af58c1f8 Mon Sep 17 00:00:00 2001
From: Jacob MacDonald <jakemac@google.com>
Date: Tue, 14 Oct 2025 11:36:49 -0700
Subject: [PATCH] fix integration test static errors, and run_shell_command
 tests to actually be testing what they intend (#11050)

---
 .../context-compress-interactive.test.ts      |   5 +-
 integration-tests/globalSetup.ts              |  16 +-
 integration-tests/run_shell_command.test.ts   | 236 +++++++++++-------
 integration-tests/stdin-context.test.ts       |   4 +-
 integration-tests/test-helper.ts              |  26 +-
 integration-tests/write_file.test.ts          |   5 +-
 6 files changed, 188 insertions(+), 104 deletions(-)
diff --git a/integration-tests/context-compress-interactive.test.ts b/integration-tests/context-compress-interactive.test.ts
index 98b1aa3a7b..51043e1265 100644
--- a/integration-tests/context-compress-interactive.test.ts
+++ b/integration-tests/context-compress-interactive.test.ts
@@ -18,13 +18,14 @@ describe.skip('Interactive Mode', () => {
     await rig.cleanup();
   });
 
-  it('should trigger chat compression with /compress command', async () => {
+  // TODO(#11062): Make this test reliable by not using the actual Gemini model
+  it.skip('should trigger chat compression with /compress command', async () => {
     await rig.setup('interactive-compress-test');
 
     const run = await rig.runInteractive();
 
     const longPrompt =
-      'Dont do anything except returning a 1000 token long paragragh with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
+      'Dont do anything except returning a 1000 token long paragraph with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
 
     await run.type(longPrompt);
     await run.type('\r');
diff --git a/integration-tests/globalSetup.ts b/integration-tests/globalSetup.ts
index 180728e09e..5ef7081e07 100644
--- a/integration-tests/globalSetup.ts
+++ b/integration-tests/globalSetup.ts
@@ -5,8 +5,8 @@
  */
 
 // Unset NO_COLOR environment variable to ensure consistent theme behavior between local and CI test runs
-if (process.env.NO_COLOR !== undefined) {
-  delete process.env.NO_COLOR;
+if (process.env['NO_COLOR'] !== undefined) {
+  delete process.env['NO_COLOR'];
 }
 
 import {
@@ -60,21 +60,21 @@ export async function setup() {
     console.error('Error cleaning up old test runs:', e);
   }
 
-  process.env.INTEGRATION_TEST_FILE_DIR = runDir;
-  process.env.GEMINI_CLI_INTEGRATION_TEST = 'true';
-  process.env.TELEMETRY_LOG_FILE = join(runDir, 'telemetry.log');
+  process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
+  process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
+  process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
 
-  if (process.env.KEEP_OUTPUT) {
+  if (process.env['KEEP_OUTPUT']) {
     console.log(`Keeping output for test run in: ${runDir}`);
   }
-  process.env.VERBOSE = process.env.VERBOSE ?? 'false';
+  process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';
 
   console.log(`\nIntegration test output directory: ${runDir}`);
 }
 
 export async function teardown() {
   // Cleanup the test run directory unless KEEP_OUTPUT is set
-  if (process.env.KEEP_OUTPUT !== 'true' && runDir) {
+  if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
     await rm(runDir, { recursive: true, force: true });
   }
 
diff --git a/integration-tests/run_shell_command.test.ts b/integration-tests/run_shell_command.test.ts
index 6d573f01fe..c0b06bdb2a 100644
--- a/integration-tests/run_shell_command.test.ts
+++ b/integration-tests/run_shell_command.test.ts
@@ -95,65 +95,12 @@ describe('run_shell_command', () => {
     const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
 
     // Provide the prompt via stdin to simulate non-interactive mode
-    const result = await rig.run({
-      stdin: prompt,
-      args: [`--allowed-tools=run_shell_command(${tool})`],
-    });
-
-    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
-
-    if (!foundToolCall) {
-      printDebugInfo(rig, result, {
-        'Found tool call': foundToolCall,
-      });
-    }
-
-    expect(
-      foundToolCall,
-      'Expected to find a run_shell_command tool call',
-    ).toBeTruthy();
-  });
-
-  it('should succeed with no parens in non-interactive mode', async () => {
-    const rig = new TestRig();
-    await rig.setup('should succeed with no parens in non-interactive mode');
-
-    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
-    const { tool } = getLineCountCommand();
-    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
-
-    const result = await rig.run({
-      stdin: prompt,
-      args: ['--allowed-tools=run_shell_command'],
-    });
-
-    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
-
-    if (!foundToolCall) {
-      printDebugInfo(rig, result, {
-        'Found tool call': foundToolCall,
-      });
-    }
-
-    expect(
-      foundToolCall,
-      'Expected to find a run_shell_command tool call',
-    ).toBeTruthy();
-  });
-
-  it('should succeed with --yolo mode', async () => {
-    const rig = new TestRig();
-    await rig.setup('should succeed with --yolo mode');
-
-    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
-    const { tool } = getLineCountCommand();
-    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
-
     const result = await rig.run(
       {
-        prompt: prompt,
+        stdin: prompt,
+        yolo: false,
       },
-      '--yolo',
+      `--allowed-tools=run_shell_command(${tool})`,
     );
 
     const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -168,6 +115,84 @@ describe('run_shell_command', () => {
       foundToolCall,
       'Expected to find a run_shell_command tool call',
     ).toBeTruthy();
+
+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
+  });
+
+  it('should succeed with no parens in non-interactive mode', async () => {
+    const rig = new TestRig();
+    await rig.setup('should succeed with no parens in non-interactive mode');
+
+    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
+    const { tool } = getLineCountCommand();
+    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
+
+    const result = await rig.run(
+      {
+        stdin: prompt,
+        yolo: false,
+      },
+      '--allowed-tools=run_shell_command',
+    );
+
+    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
+
+    if (!foundToolCall) {
+      printDebugInfo(rig, result, {
+        'Found tool call': foundToolCall,
+      });
+    }
+
+    expect(
+      foundToolCall,
+      'Expected to find a run_shell_command tool call',
+    ).toBeTruthy();
+
+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
+  });
+
+  it('should succeed with --yolo mode', async () => {
+    const rig = new TestRig();
+    await rig.setup('should succeed with --yolo mode');
+
+    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
+    const { tool } = getLineCountCommand();
+    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
+
+    const result = await rig.run({
+      prompt: prompt,
+      yolo: true,
+    });
+
+    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
+
+    if (!foundToolCall) {
+      printDebugInfo(rig, result, {
+        'Found tool call': foundToolCall,
+      });
+    }
+
+    expect(
+      foundToolCall,
+      'Expected to find a run_shell_command tool call',
+    ).toBeTruthy();
+
+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
   });
 
   it('should work with ShellTool alias', async () => {
@@ -178,10 +203,13 @@ describe('run_shell_command', () => {
     const { tool } = getLineCountCommand();
     const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
 
-    const result = await rig.run({
-      stdin: prompt,
-      args: [`--allowed-tools=ShellTool(${tool})`],
-    });
+    const result = await rig.run(
+      {
+        stdin: prompt,
+        yolo: false,
+      },
+      `--allowed-tools=ShellTool(${tool})`,
+    );
 
     const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
 
@@ -195,37 +223,65 @@ describe('run_shell_command', () => {
       foundToolCall,
       'Expected to find a run_shell_command tool call',
     ).toBeTruthy();
+
+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
   });
 
-  it('should combine multiple --allowed-tools flags', async () => {
+  // TODO(#11062): Un-skip this once we can make it reliable by using hard coded
+  // model responses.
+  it.skip('should combine multiple --allowed-tools flags', async () => {
     const rig = new TestRig();
     await rig.setup('should combine multiple --allowed-tools flags');
 
     const { tool } = getLineCountCommand();
     const prompt =
-      `use both ${tool} and ls to count the number of lines in ` +
-      `files in this directory`;
+      `use both ${tool} and ls to count the number of lines in files in this ` +
+      `directory. Do not pipe these commands into each other, run them separately.`;
 
-    const result = await rig.run({
-      stdin: prompt,
-      args: [
-        `--allowed-tools=run_shell_command(${tool})`,
-        '--allowed-tools=run_shell_command(ls)',
-      ],
-    });
+    const result = await rig.run(
+      {
+        stdin: prompt,
+        yolo: false,
+      },
+      `--allowed-tools=run_shell_command(${tool})`,
+      '--allowed-tools=run_shell_command(ls)',
+    );
 
-    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
+    for (const expected in ['ls', tool]) {
+      const foundToolCall = await rig.waitForToolCall(
+        'run_shell_command',
+        15000,
+        (args) => args.toLowerCase().includes(`"command": "${expected}`),
+      );
 
-    if (!foundToolCall) {
-      printDebugInfo(rig, result, {
-        'Found tool call': foundToolCall,
-      });
+      if (!foundToolCall) {
+        printDebugInfo(rig, result, {
+          'Found tool call': foundToolCall,
+        });
+      }
+
+      expect(
+        foundToolCall,
+        `Expected to find a run_shell_command tool call to "${expected}",` +
+          ` got ${rig.readToolLogs().join('\n')}`,
+      ).toBeTruthy();
     }
 
-    expect(
-      foundToolCall,
-      'Expected to find a run_shell_command tool call',
-    ).toBeTruthy();
+    const toolLogs = rig
+      .readToolLogs()
+      .filter((toolCall) => toolCall.toolRequest.name === 'run_shell_command');
+    expect(toolLogs.length, toolLogs.join('\n')).toBeGreaterThanOrEqual(2);
+    for (const toolLog of toolLogs) {
+      expect(
+        toolLog.toolRequest.success,
+        `Expected tool call ${toolLog} to succeed`,
+      ).toBe(true);
+    }
   });
 
   it('should allow all with "ShellTool" and other specific tools', async () => {
@@ -237,13 +293,14 @@ describe('run_shell_command', () => {
     const { tool } = getLineCountCommand();
     const prompt = `Please run the command "echo test-allow-all" and show me the output`;
 
-    const result = await rig.run({
-      stdin: prompt,
-      args: [
-        `--allowed-tools=run_shell_command(${tool})`,
-        '--allowed-tools=run_shell_command',
-      ],
-    });
+    const result = await rig.run(
+      {
+        stdin: prompt,
+        yolo: false,
+      },
+      `--allowed-tools=run_shell_command(${tool})`,
+      '--allowed-tools=run_shell_command',
+    );
 
     const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
 
@@ -259,6 +316,13 @@ describe('run_shell_command', () => {
       'Expected to find a run_shell_command tool call',
     ).toBeTruthy();
 
+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
+
     // Validate model output - will throw if no output, warn if missing expected content
     validateModelOutput(
       result,
diff --git a/integration-tests/stdin-context.test.ts b/integration-tests/stdin-context.test.ts
index 3ec6810002..66c3db2e4e 100644
--- a/integration-tests/stdin-context.test.ts
+++ b/integration-tests/stdin-context.test.ts
@@ -20,9 +20,9 @@ describe.skip('stdin context', () => {
 
     await rig.waitForTelemetryEvent('api_request');
     const lastRequest = rig.readLastApiRequest();
-    expect(lastRequest).not.toBeNull();
 
-    const historyString = lastRequest.attributes.request_text;
+    expect(lastRequest?.attributes?.request_text).toBeDefined();
+    const historyString = lastRequest!.attributes!.request_text!;
 
     // TODO: This test currently fails in sandbox mode (Docker/Podman) because
     // stdin content is not properly forwarded to the container when used
diff --git a/integration-tests/test-helper.ts b/integration-tests/test-helper.ts
index 0fe1714d44..375f12545a 100644
--- a/integration-tests/test-helper.ts
+++ b/integration-tests/test-helper.ts
@@ -158,6 +158,7 @@ interface ParsedLog {
     function_args?: string;
     success?: boolean;
     duration_ms?: number;
+    request_text?: string;
   };
   scopeMetrics?: {
     metrics: {
@@ -315,10 +316,19 @@ export class TestRig {
   run(
     promptOrOptions:
       | string
-      | { prompt?: string; stdin?: string; stdinDoesNotEnd?: boolean },
+      | {
+          prompt?: string;
+          stdin?: string;
+          stdinDoesNotEnd?: boolean;
+          yolo?: boolean;
+        },
     ...args: string[]
   ): Promise<string> {
-    const { command, initialArgs } = this._getCommandAndArgs(['--yolo']);
+    const yolo =
+      typeof promptOrOptions === 'string' || promptOrOptions.yolo !== false;
+    const { command, initialArgs } = this._getCommandAndArgs(
+      yolo ? ['--yolo'] : [],
+    );
     const commandArgs = [...initialArgs];
     const execOptions: {
       cwd: string;
@@ -566,7 +576,11 @@ export class TestRig {
     );
   }
 
-  async waitForToolCall(toolName: string, timeout?: number) {
+  async waitForToolCall(
+    toolName: string,
+    timeout?: number,
+    matchArgs?: (args: string) => boolean,
+  ) {
     // Use environment-specific timeout
     if (!timeout) {
       timeout = getDefaultTimeout();
@@ -578,7 +592,11 @@ export class TestRig {
     return poll(
       () => {
         const toolLogs = this.readToolLogs();
-        return toolLogs.some((log) => log.toolRequest.name === toolName);
+        return toolLogs.some(
+          (log) =>
+            log.toolRequest.name === toolName &&
+            (matchArgs?.call(this, log.toolRequest.args) ?? true),
+        );
       },
       timeout,
       100,
diff --git a/integration-tests/write_file.test.ts b/integration-tests/write_file.test.ts
index 7dd6445d96..a53ef9407e 100644
--- a/integration-tests/write_file.test.ts
+++ b/integration-tests/write_file.test.ts
@@ -28,13 +28,14 @@ describe('write_file', () => {
     }
 
     const allTools = rig.readToolLogs();
-    expect(foundToolCall, 'Expected to find a write_file tool call').toBeTruthy(
+    expect(
+      foundToolCall,
       createToolCallErrorMessage(
         'write_file',
         allTools.map((t) => t.toolRequest.name),
         result,
       ),
-    );
+    ).toBeTruthy();
 
     // Validate model output - will throw if no output, warn if missing expected content
     validateModelOutput(result, 'dad.txt', 'Write file test');