fix integration test static errors, and make run_shell_command tests actually test what they intend (#11050)

2026-05-12 21:03:05 -07:00 · 2025-10-14 11:36:49 -07:00
parent 49b66733c2
commit 99c7108bb0
6 changed files with 188 additions and 104 deletions
@@ -18,13 +18,14 @@ describe.skip('Interactive Mode', () => {
    await rig.cleanup();
  });

-  it('should trigger chat compression with /compress command', async () => {
+  // TODO(#11062): Make this test reliable by not using the actual Gemini model
+  it.skip('should trigger chat compression with /compress command', async () => {
    await rig.setup('interactive-compress-test');

    const run = await rig.runInteractive();

    const longPrompt =
-      'Dont do anything except returning a 1000 token long paragragh with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
+      'Dont do anything except returning a 1000 token long paragraph with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';

    await run.type(longPrompt);
    await run.type('\r');
@@ -5,8 +5,8 @@
 */

 // Unset NO_COLOR environment variable to ensure consistent theme behavior between local and CI test runs
-if (process.env.NO_COLOR !== undefined) {
-  delete process.env.NO_COLOR;
+if (process.env['NO_COLOR'] !== undefined) {
+  delete process.env['NO_COLOR'];
 }

 import {
@@ -60,21 +60,21 @@ export async function setup() {
    console.error('Error cleaning up old test runs:', e);
  }

-  process.env.INTEGRATION_TEST_FILE_DIR = runDir;
-  process.env.GEMINI_CLI_INTEGRATION_TEST = 'true';
-  process.env.TELEMETRY_LOG_FILE = join(runDir, 'telemetry.log');
+  process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
+  process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
+  process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');

-  if (process.env.KEEP_OUTPUT) {
+  if (process.env['KEEP_OUTPUT']) {
    console.log(`Keeping output for test run in: ${runDir}`);
  }
-  process.env.VERBOSE = process.env.VERBOSE ?? 'false';
+  process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';

  console.log(`\nIntegration test output directory: ${runDir}`);
 }

 export async function teardown() {
  // Cleanup the test run directory unless KEEP_OUTPUT is set
-  if (process.env.KEEP_OUTPUT !== 'true' && runDir) {
+  if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
    await rm(runDir, { recursive: true, force: true });
  }

@@ -95,65 +95,12 @@ describe('run_shell_command', () => {
    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;

    // Provide the prompt via stdin to simulate non-interactive mode
-    const result = await rig.run({
-      stdin: prompt,
-      args: [`--allowed-tools=run_shell_command(${tool})`],
-    });
-
-    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
-
-    if (!foundToolCall) {
-      printDebugInfo(rig, result, {
-        'Found tool call': foundToolCall,
-      });
-    }
-
-    expect(
-      foundToolCall,
-      'Expected to find a run_shell_command tool call',
-    ).toBeTruthy();
-  });
-
-  it('should succeed with no parens in non-interactive mode', async () => {
-    const rig = new TestRig();
-    await rig.setup('should succeed with no parens in non-interactive mode');
-
-    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
-    const { tool } = getLineCountCommand();
-    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
-
-    const result = await rig.run({
-      stdin: prompt,
-      args: ['--allowed-tools=run_shell_command'],
-    });
-
-    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
-
-    if (!foundToolCall) {
-      printDebugInfo(rig, result, {
-        'Found tool call': foundToolCall,
-      });
-    }
-
-    expect(
-      foundToolCall,
-      'Expected to find a run_shell_command tool call',
-    ).toBeTruthy();
-  });
-
-  it('should succeed with --yolo mode', async () => {
-    const rig = new TestRig();
-    await rig.setup('should succeed with --yolo mode');
-
-    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
-    const { tool } = getLineCountCommand();
-    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
-
    const result = await rig.run(
      {
-        prompt: prompt,
+        stdin: prompt,
+        yolo: false,
      },
-      '--yolo',
+      `--allowed-tools=run_shell_command(${tool})`,
    );

    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -168,6 +115,84 @@ describe('run_shell_command', () => {
      foundToolCall,
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();
+
+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
+  });
+
+  it('should succeed with no parens in non-interactive mode', async () => {
+    const rig = new TestRig();
+    await rig.setup('should succeed with no parens in non-interactive mode');
+
+    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
+    const { tool } = getLineCountCommand();
+    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
+
+    const result = await rig.run(
+      {
+        stdin: prompt,
+        yolo: false,
+      },
+      '--allowed-tools=run_shell_command',
+    );
+
+    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
+
+    if (!foundToolCall) {
+      printDebugInfo(rig, result, {
+        'Found tool call': foundToolCall,
+      });
+    }
+
+    expect(
+      foundToolCall,
+      'Expected to find a run_shell_command tool call',
+    ).toBeTruthy();
+
+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
+  });
+
+  it('should succeed with --yolo mode', async () => {
+    const rig = new TestRig();
+    await rig.setup('should succeed with --yolo mode');
+
+    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
+    const { tool } = getLineCountCommand();
+    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
+
+    const result = await rig.run({
+      prompt: prompt,
+      yolo: true,
+    });
+
+    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
+
+    if (!foundToolCall) {
+      printDebugInfo(rig, result, {
+        'Found tool call': foundToolCall,
+      });
+    }
+
+    expect(
+      foundToolCall,
+      'Expected to find a run_shell_command tool call',
+    ).toBeTruthy();
+
+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
  });

  it('should work with ShellTool alias', async () => {
@@ -178,10 +203,13 @@ describe('run_shell_command', () => {
    const { tool } = getLineCountCommand();
    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;

-    const result = await rig.run({
-      stdin: prompt,
-      args: [`--allowed-tools=ShellTool(${tool})`],
-    });
+    const result = await rig.run(
+      {
+        stdin: prompt,
+        yolo: false,
+      },
+      `--allowed-tools=ShellTool(${tool})`,
+    );

    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);

@@ -195,37 +223,65 @@ describe('run_shell_command', () => {
      foundToolCall,
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();
+
+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
  });

-  it('should combine multiple --allowed-tools flags', async () => {
+  // TODO(#11062): Un-skip this once we can make it reliable by using hard coded
+  // model responses.
+  it.skip('should combine multiple --allowed-tools flags', async () => {
    const rig = new TestRig();
    await rig.setup('should combine multiple --allowed-tools flags');

    const { tool } = getLineCountCommand();
    const prompt =
-      `use both ${tool} and ls to count the number of lines in ` +
-      `files in this directory`;
+      `use both ${tool} and ls to count the number of lines in files in this ` +
+      `directory. Do not pipe these commands into each other, run them separately.`;

-    const result = await rig.run({
-      stdin: prompt,
-      args: [
-        `--allowed-tools=run_shell_command(${tool})`,
-        '--allowed-tools=run_shell_command(ls)',
-      ],
-    });
+    const result = await rig.run(
+      {
+        stdin: prompt,
+        yolo: false,
+      },
+      `--allowed-tools=run_shell_command(${tool})`,
+      '--allowed-tools=run_shell_command(ls)',
+    );

-    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
+    for (const expected of ['ls', tool]) { // values, not indices
+      const foundToolCall = await rig.waitForToolCall(
+        'run_shell_command',
+        15000,
+        (args) => args.toLowerCase().includes(`"command": "${expected}`),
+      );

-    if (!foundToolCall) {
-      printDebugInfo(rig, result, {
-        'Found tool call': foundToolCall,
-      });
+      if (!foundToolCall) {
+        printDebugInfo(rig, result, {
+          'Found tool call': foundToolCall,
+        });
+      }
+
+      expect(
+        foundToolCall,
+        `Expected to find a run_shell_command tool call to "${expected}",` +
+          ` got ${JSON.stringify(rig.readToolLogs())}`,
+      ).toBeTruthy();
    }

-    expect(
-      foundToolCall,
-      'Expected to find a run_shell_command tool call',
-    ).toBeTruthy();
+    const toolLogs = rig
+      .readToolLogs()
+      .filter((toolCall) => toolCall.toolRequest.name === 'run_shell_command');
+    expect(toolLogs.length, JSON.stringify(toolLogs)).toBeGreaterThanOrEqual(2);
+    for (const toolLog of toolLogs) {
+      expect(
+        toolLog.toolRequest.success,
+        `Expected tool call ${JSON.stringify(toolLog)} to succeed`,
+      ).toBe(true);
+    }
  });

  it('should allow all with "ShellTool" and other specific tools', async () => {
@@ -237,13 +293,14 @@ describe('run_shell_command', () => {
    const { tool } = getLineCountCommand();
    const prompt = `Please run the command "echo test-allow-all" and show me the output`;

-    const result = await rig.run({
-      stdin: prompt,
-      args: [
-        `--allowed-tools=run_shell_command(${tool})`,
-        '--allowed-tools=run_shell_command',
-      ],
-    });
+    const result = await rig.run(
+      {
+        stdin: prompt,
+        yolo: false,
+      },
+      `--allowed-tools=run_shell_command(${tool})`,
+      '--allowed-tools=run_shell_command',
+    );

    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);

@@ -259,6 +316,13 @@ describe('run_shell_command', () => {
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();

+    const toolCall = rig
+      .readToolLogs()
+      .filter(
+        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
+      )[0];
+    expect(toolCall.toolRequest.success).toBe(true);
+
    // Validate model output - will throw if no output, warn if missing expected content
    validateModelOutput(
      result,
@@ -20,9 +20,9 @@ describe.skip('stdin context', () => {

    await rig.waitForTelemetryEvent('api_request');
    const lastRequest = rig.readLastApiRequest();
-    expect(lastRequest).not.toBeNull();

-    const historyString = lastRequest.attributes.request_text;
+    expect(lastRequest?.attributes?.request_text).toBeDefined();
+    const historyString = lastRequest!.attributes!.request_text!;

    // TODO: This test currently fails in sandbox mode (Docker/Podman) because
    // stdin content is not properly forwarded to the container when used
@@ -158,6 +158,7 @@ interface ParsedLog {
    function_args?: string;
    success?: boolean;
    duration_ms?: number;
+    request_text?: string;
  };
  scopeMetrics?: {
    metrics: {
@@ -315,10 +316,19 @@ export class TestRig {
  run(
    promptOrOptions:
      | string
-      | { prompt?: string; stdin?: string; stdinDoesNotEnd?: boolean },
+      | {
+          prompt?: string;
+          stdin?: string;
+          stdinDoesNotEnd?: boolean;
+          yolo?: boolean;
+        },
    ...args: string[]
  ): Promise<string> {
-    const { command, initialArgs } = this._getCommandAndArgs(['--yolo']);
+    const yolo =
+      typeof promptOrOptions === 'string' || promptOrOptions.yolo !== false;
+    const { command, initialArgs } = this._getCommandAndArgs(
+      yolo ? ['--yolo'] : [],
+    );
    const commandArgs = [...initialArgs];
    const execOptions: {
      cwd: string;
@@ -566,7 +576,11 @@ export class TestRig {
    );
  }

-  async waitForToolCall(toolName: string, timeout?: number) {
+  async waitForToolCall(
+    toolName: string,
+    timeout?: number,
+    matchArgs?: (args: string) => boolean,
+  ) {
    // Use environment-specific timeout
    if (!timeout) {
      timeout = getDefaultTimeout();
@@ -578,7 +592,11 @@ export class TestRig {
    return poll(
      () => {
        const toolLogs = this.readToolLogs();
-        return toolLogs.some((log) => log.toolRequest.name === toolName);
+        return toolLogs.some(
+          (log) =>
+            log.toolRequest.name === toolName &&
+            (matchArgs?.call(this, log.toolRequest.args) ?? true),
+        );
      },
      timeout,
      100,
@@ -28,13 +28,14 @@ describe('write_file', () => {
    }

    const allTools = rig.readToolLogs();
-    expect(foundToolCall, 'Expected to find a write_file tool call').toBeTruthy(
+    expect(
+      foundToolCall,
      createToolCallErrorMessage(
        'write_file',
        allTools.map((t) => t.toolRequest.name),
        result,
      ),
-    );
+    ).toBeTruthy();

    // Validate model output - will throw if no output, warn if missing expected content
    validateModelOutput(result, 'dad.txt', 'Write file test');