fix integration test static errors, and run_shell_command tests to actually be testing what they intend (#11050)

2026-05-13 05:12:55 -07:00 · 2025-10-14 11:36:49 -07:00
parent 49b66733c2
commit 99c7108bb0
6 changed files with 188 additions and 104 deletions
@@ -18,13 +18,14 @@ describe.skip('Interactive Mode', () => {
    await rig.cleanup();
  });
-  it('should trigger chat compression with /compress command', async () => {
+  // TODO(#11062): Make this test reliable by not using the actual Gemini model
  it.skip('should trigger chat compression with /compress command', async () => {
    await rig.setup('interactive-compress-test');
    const run = await rig.runInteractive();
    const longPrompt =
-      'Dont do anything except returning a 1000 token long paragragh with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
+      'Dont do anything except returning a 1000 token long paragraph with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
    await run.type(longPrompt);
    await run.type('\r');
@@ -5,8 +5,8 @@
 */
 // Unset NO_COLOR environment variable to ensure consistent theme behavior between local and CI test runs
-if (process.env.NO_COLOR !== undefined) {
+if (process.env['NO_COLOR'] !== undefined) {
-  delete process.env.NO_COLOR;
+  delete process.env['NO_COLOR'];
 }
 import {
@@ -60,21 +60,21 @@ export async function setup() {
    console.error('Error cleaning up old test runs:', e);
  }
-  process.env.INTEGRATION_TEST_FILE_DIR = runDir;
+  process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
-  process.env.GEMINI_CLI_INTEGRATION_TEST = 'true';
+  process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
-  process.env.TELEMETRY_LOG_FILE = join(runDir, 'telemetry.log');
+  process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
-  if (process.env.KEEP_OUTPUT) {
+  if (process.env['KEEP_OUTPUT']) {
    console.log(`Keeping output for test run in: ${runDir}`);
  }
-  process.env.VERBOSE = process.env.VERBOSE ?? 'false';
+  process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';
  console.log(`\nIntegration test output directory: ${runDir}`);
 }
 export async function teardown() {
  // Cleanup the test run directory unless KEEP_OUTPUT is set
-  if (process.env.KEEP_OUTPUT !== 'true' && runDir) {
+  if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
    await rm(runDir, { recursive: true, force: true });
  }
@@ -95,65 +95,12 @@ describe('run_shell_command', () => {
    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
    // Provide the prompt via stdin to simulate non-interactive mode
    const result = await rig.run({
      stdin: prompt,
      args: [`--allowed-tools=run_shell_command(${tool})`],
    });
    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
    if (!foundToolCall) {
      printDebugInfo(rig, result, {
        'Found tool call': foundToolCall,
      });
    }
    expect(
      foundToolCall,
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();
  });
  it('should succeed with no parens in non-interactive mode', async () => {
    const rig = new TestRig();
    await rig.setup('should succeed with no parens in non-interactive mode');
    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
    const { tool } = getLineCountCommand();
    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
    const result = await rig.run({
      stdin: prompt,
      args: ['--allowed-tools=run_shell_command'],
    });
    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
    if (!foundToolCall) {
      printDebugInfo(rig, result, {
        'Found tool call': foundToolCall,
      });
    }
    expect(
      foundToolCall,
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();
  });
  it('should succeed with --yolo mode', async () => {
    const rig = new TestRig();
    await rig.setup('should succeed with --yolo mode');
    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
    const { tool } = getLineCountCommand();
    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
    const result = await rig.run(
      {
-        prompt: prompt,
+        stdin: prompt,
        yolo: false,
      },
-      '--yolo',
+      `--allowed-tools=run_shell_command(${tool})`,
    );
    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -168,6 +115,84 @@ describe('run_shell_command', () => {
      foundToolCall,
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();
    const toolCall = rig
      .readToolLogs()
      .filter(
        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
      )[0];
    expect(toolCall.toolRequest.success).toBe(true);
  });
  it('should succeed with no parens in non-interactive mode', async () => {
    const rig = new TestRig();
    await rig.setup('should succeed with no parens in non-interactive mode');
    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
    const { tool } = getLineCountCommand();
    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
    const result = await rig.run(
      {
        stdin: prompt,
        yolo: false,
      },
      '--allowed-tools=run_shell_command',
    );
    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
    if (!foundToolCall) {
      printDebugInfo(rig, result, {
        'Found tool call': foundToolCall,
      });
    }
    expect(
      foundToolCall,
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();
    const toolCall = rig
      .readToolLogs()
      .filter(
        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
      )[0];
    expect(toolCall.toolRequest.success).toBe(true);
  });
  it('should succeed with --yolo mode', async () => {
    const rig = new TestRig();
    await rig.setup('should succeed with --yolo mode');
    const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
    const { tool } = getLineCountCommand();
    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
    const result = await rig.run({
      prompt: prompt,
      yolo: true,
    });
    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
    if (!foundToolCall) {
      printDebugInfo(rig, result, {
        'Found tool call': foundToolCall,
      });
    }
    expect(
      foundToolCall,
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();
    const toolCall = rig
      .readToolLogs()
      .filter(
        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
      )[0];
    expect(toolCall.toolRequest.success).toBe(true);
  });
  it('should work with ShellTool alias', async () => {
@@ -178,10 +203,13 @@ describe('run_shell_command', () => {
    const { tool } = getLineCountCommand();
    const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
-    const result = await rig.run({
+    const result = await rig.run(
-      stdin: prompt,
+      {
-      args: [`--allowed-tools=ShellTool(${tool})`],
+        stdin: prompt,
-    });
+        yolo: false,
      },
      `--allowed-tools=ShellTool(${tool})`,
    );
    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -195,37 +223,65 @@ describe('run_shell_command', () => {
      foundToolCall,
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();
    const toolCall = rig
      .readToolLogs()
      .filter(
        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
      )[0];
    expect(toolCall.toolRequest.success).toBe(true);
  });
-  it('should combine multiple --allowed-tools flags', async () => {
+  // TODO(#11062): Un-skip this once we can make it reliable by using hard coded
  // model responses.
  it.skip('should combine multiple --allowed-tools flags', async () => {
    const rig = new TestRig();
    await rig.setup('should combine multiple --allowed-tools flags');
    const { tool } = getLineCountCommand();
    const prompt =
-      `use both ${tool} and ls to count the number of lines in ` +
+      `use both ${tool} and ls to count the number of lines in files in this ` +
-      `files in this directory`;
+      `directory. Do not pipe these commands into each other, run them separately.`;
-    const result = await rig.run({
+    const result = await rig.run(
-      stdin: prompt,
+      {
-      args: [
+        stdin: prompt,
-        `--allowed-tools=run_shell_command(${tool})`,
+        yolo: false,
-        '--allowed-tools=run_shell_command(ls)',
+      },
-      ],
+      `--allowed-tools=run_shell_command(${tool})`,
-    });
+      '--allowed-tools=run_shell_command(ls)',
    );
-    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
+    for (const expected in ['ls', tool]) {
      const foundToolCall = await rig.waitForToolCall(
        'run_shell_command',
        15000,
        (args) => args.toLowerCase().includes(`"command": "${expected}`),
      );
-    if (!foundToolCall) {
+      if (!foundToolCall) {
-      printDebugInfo(rig, result, {
+        printDebugInfo(rig, result, {
-        'Found tool call': foundToolCall,
+          'Found tool call': foundToolCall,
-      });
+        });
      }
      expect(
        foundToolCall,
        `Expected to find a run_shell_command tool call to "${expected}",` +
          ` got ${rig.readToolLogs().join('\n')}`,
      ).toBeTruthy();
    }
-    expect(
+    const toolLogs = rig
-      foundToolCall,
+      .readToolLogs()
-      'Expected to find a run_shell_command tool call',
+      .filter((toolCall) => toolCall.toolRequest.name === 'run_shell_command');
-    ).toBeTruthy();
+    expect(toolLogs.length, toolLogs.join('\n')).toBeGreaterThanOrEqual(2);
    for (const toolLog of toolLogs) {
      expect(
        toolLog.toolRequest.success,
        `Expected tool call ${toolLog} to succeed`,
      ).toBe(true);
    }
  });
  it('should allow all with "ShellTool" and other specific tools', async () => {
@@ -237,13 +293,14 @@ describe('run_shell_command', () => {
    const { tool } = getLineCountCommand();
    const prompt = `Please run the command "echo test-allow-all" and show me the output`;
-    const result = await rig.run({
+    const result = await rig.run(
-      stdin: prompt,
+      {
-      args: [
+        stdin: prompt,
-        `--allowed-tools=run_shell_command(${tool})`,
+        yolo: false,
-        '--allowed-tools=run_shell_command',
+      },
-      ],
+      `--allowed-tools=run_shell_command(${tool})`,
-    });
+      '--allowed-tools=run_shell_command',
    );
    const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -259,6 +316,13 @@ describe('run_shell_command', () => {
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();
    const toolCall = rig
      .readToolLogs()
      .filter(
        (toolCall) => toolCall.toolRequest.name === 'run_shell_command',
      )[0];
    expect(toolCall.toolRequest.success).toBe(true);
    // Validate model output - will throw if no output, warn if missing expected content
    validateModelOutput(
      result,
@@ -20,9 +20,9 @@ describe.skip('stdin context', () => {
    await rig.waitForTelemetryEvent('api_request');
    const lastRequest = rig.readLastApiRequest();
    expect(lastRequest).not.toBeNull();
-    const historyString = lastRequest.attributes.request_text;
+    expect(lastRequest?.attributes?.request_text).toBeDefined();
    const historyString = lastRequest!.attributes!.request_text!;
    // TODO: This test currently fails in sandbox mode (Docker/Podman) because
    // stdin content is not properly forwarded to the container when used
@@ -158,6 +158,7 @@ interface ParsedLog {
    function_args?: string;
    success?: boolean;
    duration_ms?: number;
    request_text?: string;
  };
  scopeMetrics?: {
    metrics: {
@@ -315,10 +316,19 @@ export class TestRig {
  run(
    promptOrOptions:
      | string
-      | { prompt?: string; stdin?: string; stdinDoesNotEnd?: boolean },
+      | {
          prompt?: string;
          stdin?: string;
          stdinDoesNotEnd?: boolean;
          yolo?: boolean;
        },
    ...args: string[]
  ): Promise<string> {
-    const { command, initialArgs } = this._getCommandAndArgs(['--yolo']);
+    const yolo =
      typeof promptOrOptions === 'string' || promptOrOptions.yolo !== false;
    const { command, initialArgs } = this._getCommandAndArgs(
      yolo ? ['--yolo'] : [],
    );
    const commandArgs = [...initialArgs];
    const execOptions: {
      cwd: string;
@@ -566,7 +576,11 @@ export class TestRig {
    );
  }
-  async waitForToolCall(toolName: string, timeout?: number) {
+  async waitForToolCall(
    toolName: string,
    timeout?: number,
    matchArgs?: (args: string) => boolean,
  ) {
    // Use environment-specific timeout
    if (!timeout) {
      timeout = getDefaultTimeout();
@@ -578,7 +592,11 @@ export class TestRig {
    return poll(
      () => {
        const toolLogs = this.readToolLogs();
-        return toolLogs.some((log) => log.toolRequest.name === toolName);
+        return toolLogs.some(
          (log) =>
            log.toolRequest.name === toolName &&
            (matchArgs?.call(this, log.toolRequest.args) ?? true),
        );
      },
      timeout,
      100,
@@ -28,13 +28,14 @@ describe('write_file', () => {
    }
    const allTools = rig.readToolLogs();
-    expect(foundToolCall, 'Expected to find a write_file tool call').toBeTruthy(
+    expect(
      foundToolCall,
      createToolCallErrorMessage(
        'write_file',
        allTools.map((t) => t.toolRequest.name),
        result,
      ),
-    );
+    ).toBeTruthy();
    // Validate model output - will throw if no output, warn if missing expected content
    validateModelOutput(result, 'dad.txt', 'Write file test');