From 99c7108bb08e52b6550743dbe721a579af58c1f8 Mon Sep 17 00:00:00 2001 From: Jacob MacDonald Date: Tue, 14 Oct 2025 11:36:49 -0700 Subject: [PATCH] fix integration test static errors, and run_shell_command tests to actually be testing what they intend (#11050) --- .../context-compress-interactive.test.ts | 5 +- integration-tests/globalSetup.ts | 16 +- integration-tests/run_shell_command.test.ts | 236 +++++++++++------- integration-tests/stdin-context.test.ts | 4 +- integration-tests/test-helper.ts | 26 +- integration-tests/write_file.test.ts | 5 +- 6 files changed, 188 insertions(+), 104 deletions(-) diff --git a/integration-tests/context-compress-interactive.test.ts b/integration-tests/context-compress-interactive.test.ts index 98b1aa3a7b..51043e1265 100644 --- a/integration-tests/context-compress-interactive.test.ts +++ b/integration-tests/context-compress-interactive.test.ts @@ -18,13 +18,14 @@ describe.skip('Interactive Mode', () => { await rig.cleanup(); }); - it('should trigger chat compression with /compress command', async () => { + // TODO(#11062): Make this test reliable by not using the actual Gemini model + it.skip('should trigger chat compression with /compress command', async () => { await rig.setup('interactive-compress-test'); const run = await rig.runInteractive(); const longPrompt = - 'Dont do anything except returning a 1000 token long paragragh with the at the end to indicate end of response. This is a moderately long sentence.'; + 'Dont do anything except returning a 1000 token long paragraph with the at the end to indicate end of response. This is a moderately long sentence.'; await run.type(longPrompt); await run.type('\r'); diff --git a/integration-tests/globalSetup.ts b/integration-tests/globalSetup.ts index 180728e09e..5ef7081e07 100644 --- a/integration-tests/globalSetup.ts +++ b/integration-tests/globalSetup.ts @@ -5,8 +5,8 @@ */ // Unset NO_COLOR environment variable to ensure consistent theme behavior between local and CI test runs -if (process.env.NO_COLOR !== undefined) { - delete process.env.NO_COLOR; +if (process.env['NO_COLOR'] !== undefined) { + delete process.env['NO_COLOR']; } import { @@ -60,21 +60,21 @@ export async function setup() { console.error('Error cleaning up old test runs:', e); } - process.env.INTEGRATION_TEST_FILE_DIR = runDir; - process.env.GEMINI_CLI_INTEGRATION_TEST = 'true'; - process.env.TELEMETRY_LOG_FILE = join(runDir, 'telemetry.log'); + process.env['INTEGRATION_TEST_FILE_DIR'] = runDir; + process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true'; + process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log'); - if (process.env.KEEP_OUTPUT) { + if (process.env['KEEP_OUTPUT']) { console.log(`Keeping output for test run in: ${runDir}`); } - process.env.VERBOSE = process.env.VERBOSE ?? 'false'; + process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false'; console.log(`\nIntegration test output directory: ${runDir}`); } export async function teardown() { // Cleanup the test run directory unless KEEP_OUTPUT is set - if (process.env.KEEP_OUTPUT !== 'true' && runDir) { + if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) { await rm(runDir, { recursive: true, force: true }); } diff --git a/integration-tests/run_shell_command.test.ts b/integration-tests/run_shell_command.test.ts index 6d573f01fe..c0b06bdb2a 100644 --- a/integration-tests/run_shell_command.test.ts +++ b/integration-tests/run_shell_command.test.ts @@ -95,65 +95,12 @@ describe('run_shell_command', () => { const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`; // Provide the prompt via stdin to simulate non-interactive mode - const result = await rig.run({ - stdin: prompt, - args: [`--allowed-tools=run_shell_command(${tool})`], - }); - - const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); - - if (!foundToolCall) { - printDebugInfo(rig, result, { - 'Found tool call': foundToolCall, - }); - } - - expect( - foundToolCall, - 'Expected to find a run_shell_command tool call', - ).toBeTruthy(); - }); - - it('should succeed with no parens in non-interactive mode', async () => { - const rig = new TestRig(); - await rig.setup('should succeed with no parens in non-interactive mode'); - - const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n'); - const { tool } = getLineCountCommand(); - const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`; - - const result = await rig.run({ - stdin: prompt, - args: ['--allowed-tools=run_shell_command'], - }); - - const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); - - if (!foundToolCall) { - printDebugInfo(rig, result, { - 'Found tool call': foundToolCall, - }); - } - - expect( - foundToolCall, - 'Expected to find a run_shell_command tool call', - ).toBeTruthy(); - }); - - it('should succeed with --yolo mode', async () => { - const rig = new TestRig(); - await rig.setup('should succeed with --yolo mode'); - - const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n'); - const { tool } = getLineCountCommand(); - const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`; - const result = await rig.run( { - prompt: prompt, + stdin: prompt, + yolo: false, }, - '--yolo', + `--allowed-tools=run_shell_command(${tool})`, ); const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); @@ -168,6 +115,84 @@ describe('run_shell_command', () => { foundToolCall, 'Expected to find a run_shell_command tool call', ).toBeTruthy(); + + const toolCall = rig + .readToolLogs() + .filter( + (toolCall) => toolCall.toolRequest.name === 'run_shell_command', + )[0]; + expect(toolCall.toolRequest.success).toBe(true); + }); + + it('should succeed with no parens in non-interactive mode', async () => { + const rig = new TestRig(); + await rig.setup('should succeed with no parens in non-interactive mode'); + + const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n'); + const { tool } = getLineCountCommand(); + const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`; + + const result = await rig.run( + { + stdin: prompt, + yolo: false, + }, + '--allowed-tools=run_shell_command', + ); + + const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); + + if (!foundToolCall) { + printDebugInfo(rig, result, { + 'Found tool call': foundToolCall, + }); + } + + expect( + foundToolCall, + 'Expected to find a run_shell_command tool call', + ).toBeTruthy(); + + const toolCall = rig + .readToolLogs() + .filter( + (toolCall) => toolCall.toolRequest.name === 'run_shell_command', + )[0]; + expect(toolCall.toolRequest.success).toBe(true); + }); + + it('should succeed with --yolo mode', async () => { + const rig = new TestRig(); + await rig.setup('should succeed with --yolo mode'); + + const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n'); + const { tool } = getLineCountCommand(); + const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`; + + const result = await rig.run({ + prompt: prompt, + yolo: true, + }); + + const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); + + if (!foundToolCall) { + printDebugInfo(rig, result, { + 'Found tool call': foundToolCall, + }); + } + + expect( + foundToolCall, + 'Expected to find a run_shell_command tool call', + ).toBeTruthy(); + + const toolCall = rig + .readToolLogs() + .filter( + (toolCall) => toolCall.toolRequest.name === 'run_shell_command', + )[0]; + expect(toolCall.toolRequest.success).toBe(true); }); it('should work with ShellTool alias', async () => { @@ -178,10 +203,13 @@ describe('run_shell_command', () => { const { tool } = getLineCountCommand(); const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`; - const result = await rig.run({ - stdin: prompt, - args: [`--allowed-tools=ShellTool(${tool})`], - }); + const result = await rig.run( + { + stdin: prompt, + yolo: false, + }, + `--allowed-tools=ShellTool(${tool})`, + ); const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); @@ -195,37 +223,65 @@ describe('run_shell_command', () => { foundToolCall, 'Expected to find a run_shell_command tool call', ).toBeTruthy(); + + const toolCall = rig + .readToolLogs() + .filter( + (toolCall) => toolCall.toolRequest.name === 'run_shell_command', + )[0]; + expect(toolCall.toolRequest.success).toBe(true); }); - it('should combine multiple --allowed-tools flags', async () => { + // TODO(#11062): Un-skip this once we can make it reliable by using hard coded + // model responses. + it.skip('should combine multiple --allowed-tools flags', async () => { const rig = new TestRig(); await rig.setup('should combine multiple --allowed-tools flags'); const { tool } = getLineCountCommand(); const prompt = - `use both ${tool} and ls to count the number of lines in ` + - `files in this directory`; + `use both ${tool} and ls to count the number of lines in files in this ` + + `directory. Do not pipe these commands into each other, run them separately.`; - const result = await rig.run({ - stdin: prompt, - args: [ - `--allowed-tools=run_shell_command(${tool})`, - '--allowed-tools=run_shell_command(ls)', - ], - }); + const result = await rig.run( + { + stdin: prompt, + yolo: false, + }, + `--allowed-tools=run_shell_command(${tool})`, + '--allowed-tools=run_shell_command(ls)', + ); - const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); + for (const expected in ['ls', tool]) { + const foundToolCall = await rig.waitForToolCall( + 'run_shell_command', + 15000, + (args) => args.toLowerCase().includes(`"command": "${expected}`), + ); - if (!foundToolCall) { - printDebugInfo(rig, result, { - 'Found tool call': foundToolCall, - }); + if (!foundToolCall) { + printDebugInfo(rig, result, { + 'Found tool call': foundToolCall, + }); + } + + expect( + foundToolCall, + `Expected to find a run_shell_command tool call to "${expected}",` + + ` got ${rig.readToolLogs().join('\n')}`, + ).toBeTruthy(); } - expect( - foundToolCall, - 'Expected to find a run_shell_command tool call', - ).toBeTruthy(); + const toolLogs = rig + .readToolLogs() + .filter((toolCall) => toolCall.toolRequest.name === 'run_shell_command'); + expect(toolLogs.length, toolLogs.join('\n')).toBeGreaterThanOrEqual(2); + for (const toolLog of toolLogs) { + expect( + toolLog.toolRequest.success, + `Expected tool call ${toolLog} to succeed`, + ).toBe(true); + } }); it('should allow all with "ShellTool" and other specific tools', async () => { @@ -237,13 +293,14 @@ describe('run_shell_command', () => { const { tool } = getLineCountCommand(); const prompt = `Please run the command "echo test-allow-all" and show me the output`; - const result = await rig.run({ - stdin: prompt, - args: [ - `--allowed-tools=run_shell_command(${tool})`, - '--allowed-tools=run_shell_command', - ], - }); + const result = await rig.run( + { + stdin: prompt, + yolo: false, + }, + `--allowed-tools=run_shell_command(${tool})`, + '--allowed-tools=run_shell_command', + ); const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); @@ -259,6 +316,13 @@ describe('run_shell_command', () => { 'Expected to find a run_shell_command tool call', ).toBeTruthy(); + const toolCall = rig + .readToolLogs() + .filter( + (toolCall) => toolCall.toolRequest.name === 'run_shell_command', + )[0]; + expect(toolCall.toolRequest.success).toBe(true); + // Validate model output - will throw if no output, warn if missing expected content validateModelOutput( result, diff --git a/integration-tests/stdin-context.test.ts b/integration-tests/stdin-context.test.ts index 3ec6810002..66c3db2e4e 100644 --- a/integration-tests/stdin-context.test.ts +++ b/integration-tests/stdin-context.test.ts @@ -20,9 +20,9 @@ describe.skip('stdin context', () => { await rig.waitForTelemetryEvent('api_request'); const lastRequest = rig.readLastApiRequest(); - expect(lastRequest).not.toBeNull(); - const historyString = lastRequest.attributes.request_text; + expect(lastRequest?.attributes?.request_text).toBeDefined(); + const historyString = lastRequest!.attributes!.request_text!; // TODO: This test currently fails in sandbox mode (Docker/Podman) because // stdin content is not properly forwarded to the container when used diff --git a/integration-tests/test-helper.ts b/integration-tests/test-helper.ts index 0fe1714d44..375f12545a 100644 --- a/integration-tests/test-helper.ts +++ b/integration-tests/test-helper.ts @@ -158,6 +158,7 @@ interface ParsedLog { function_args?: string; success?: boolean; duration_ms?: number; + request_text?: string; }; scopeMetrics?: { metrics: { @@ -315,10 +316,19 @@ export class TestRig { run( promptOrOptions: | string - | { prompt?: string; stdin?: string; stdinDoesNotEnd?: boolean }, + | { + prompt?: string; + stdin?: string; + stdinDoesNotEnd?: boolean; + yolo?: boolean; + }, ...args: string[] ): Promise { - const { command, initialArgs } = this._getCommandAndArgs(['--yolo']); + const yolo = + typeof promptOrOptions === 'string' || promptOrOptions.yolo !== false; + const { command, initialArgs } = this._getCommandAndArgs( + yolo ? ['--yolo'] : [], + ); const commandArgs = [...initialArgs]; const execOptions: { cwd: string; @@ -566,7 +576,11 @@ export class TestRig { ); } - async waitForToolCall(toolName: string, timeout?: number) { + async waitForToolCall( + toolName: string, + timeout?: number, + matchArgs?: (args: string) => boolean, + ) { // Use environment-specific timeout if (!timeout) { timeout = getDefaultTimeout(); @@ -578,7 +592,11 @@ export class TestRig { return poll( () => { const toolLogs = this.readToolLogs(); - return toolLogs.some((log) => log.toolRequest.name === toolName); + return toolLogs.some( + (log) => + log.toolRequest.name === toolName && + (matchArgs?.call(this, log.toolRequest.args) ?? true), + ); }, timeout, 100, diff --git a/integration-tests/write_file.test.ts b/integration-tests/write_file.test.ts index 7dd6445d96..a53ef9407e 100644 --- a/integration-tests/write_file.test.ts +++ b/integration-tests/write_file.test.ts @@ -28,13 +28,14 @@ describe('write_file', () => { } const allTools = rig.readToolLogs(); - expect(foundToolCall, 'Expected to find a write_file tool call').toBeTruthy( + expect( + foundToolCall, createToolCallErrorMessage( 'write_file', allTools.map((t) => t.toolRequest.name), result, ), - ); + ).toBeTruthy(); // Validate model output - will throw if no output, warn if missing expected content validateModelOutput(result, 'dad.txt', 'Write file test');