fix integration test static errors, and make the run_shell_command tests actually test what they intend (#11050)

This commit is contained in:
Jacob MacDonald
2025-10-14 11:36:49 -07:00
committed by GitHub
parent 49b66733c2
commit 99c7108bb0
6 changed files with 188 additions and 104 deletions
@@ -18,13 +18,14 @@ describe.skip('Interactive Mode', () => {
await rig.cleanup(); await rig.cleanup();
}); });
it('should trigger chat compression with /compress command', async () => { // TODO(#11062): Make this test reliable by not using the actual Gemini model
it.skip('should trigger chat compression with /compress command', async () => {
await rig.setup('interactive-compress-test'); await rig.setup('interactive-compress-test');
const run = await rig.runInteractive(); const run = await rig.runInteractive();
const longPrompt = const longPrompt =
'Dont do anything except returning a 1000 token long paragragh with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.'; 'Dont do anything except returning a 1000 token long paragraph with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
await run.type(longPrompt); await run.type(longPrompt);
await run.type('\r'); await run.type('\r');
+8 -8
View File
@@ -5,8 +5,8 @@
*/ */
// Unset NO_COLOR environment variable to ensure consistent theme behavior between local and CI test runs // Unset NO_COLOR environment variable to ensure consistent theme behavior between local and CI test runs
if (process.env.NO_COLOR !== undefined) { if (process.env['NO_COLOR'] !== undefined) {
delete process.env.NO_COLOR; delete process.env['NO_COLOR'];
} }
import { import {
@@ -60,21 +60,21 @@ export async function setup() {
console.error('Error cleaning up old test runs:', e); console.error('Error cleaning up old test runs:', e);
} }
process.env.INTEGRATION_TEST_FILE_DIR = runDir; process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
process.env.GEMINI_CLI_INTEGRATION_TEST = 'true'; process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
process.env.TELEMETRY_LOG_FILE = join(runDir, 'telemetry.log'); process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
if (process.env.KEEP_OUTPUT) { if (process.env['KEEP_OUTPUT']) {
console.log(`Keeping output for test run in: ${runDir}`); console.log(`Keeping output for test run in: ${runDir}`);
} }
process.env.VERBOSE = process.env.VERBOSE ?? 'false'; process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';
console.log(`\nIntegration test output directory: ${runDir}`); console.log(`\nIntegration test output directory: ${runDir}`);
} }
export async function teardown() { export async function teardown() {
// Cleanup the test run directory unless KEEP_OUTPUT is set // Cleanup the test run directory unless KEEP_OUTPUT is set
if (process.env.KEEP_OUTPUT !== 'true' && runDir) { if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
await rm(runDir, { recursive: true, force: true }); await rm(runDir, { recursive: true, force: true });
} }
+150 -86
View File
@@ -95,65 +95,12 @@ describe('run_shell_command', () => {
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`; const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
// Provide the prompt via stdin to simulate non-interactive mode // Provide the prompt via stdin to simulate non-interactive mode
const result = await rig.run({
stdin: prompt,
args: [`--allowed-tools=run_shell_command(${tool})`],
});
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
}
expect(
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
});
it('should succeed with no parens in non-interactive mode', async () => {
const rig = new TestRig();
await rig.setup('should succeed with no parens in non-interactive mode');
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run({
stdin: prompt,
args: ['--allowed-tools=run_shell_command'],
});
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
}
expect(
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
});
it('should succeed with --yolo mode', async () => {
const rig = new TestRig();
await rig.setup('should succeed with --yolo mode');
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run( const result = await rig.run(
{ {
prompt: prompt, stdin: prompt,
yolo: false,
}, },
'--yolo', `--allowed-tools=run_shell_command(${tool})`,
); );
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -168,6 +115,84 @@ describe('run_shell_command', () => {
foundToolCall, foundToolCall,
'Expected to find a run_shell_command tool call', 'Expected to find a run_shell_command tool call',
).toBeTruthy(); ).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
});
it('should succeed with no parens in non-interactive mode', async () => {
const rig = new TestRig();
await rig.setup('should succeed with no parens in non-interactive mode');
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run(
{
stdin: prompt,
yolo: false,
},
'--allowed-tools=run_shell_command',
);
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
}
expect(
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
});
it('should succeed with --yolo mode', async () => {
const rig = new TestRig();
await rig.setup('should succeed with --yolo mode');
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run({
prompt: prompt,
yolo: true,
});
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
}
expect(
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
}); });
it('should work with ShellTool alias', async () => { it('should work with ShellTool alias', async () => {
@@ -178,10 +203,13 @@ describe('run_shell_command', () => {
const { tool } = getLineCountCommand(); const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`; const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run({ const result = await rig.run(
stdin: prompt, {
args: [`--allowed-tools=ShellTool(${tool})`], stdin: prompt,
}); yolo: false,
},
`--allowed-tools=ShellTool(${tool})`,
);
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -195,37 +223,65 @@ describe('run_shell_command', () => {
foundToolCall, foundToolCall,
'Expected to find a run_shell_command tool call', 'Expected to find a run_shell_command tool call',
).toBeTruthy(); ).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
}); });
it('should combine multiple --allowed-tools flags', async () => { // TODO(#11062): Un-skip this once we can make it reliable by using hard coded
// model responses.
it.skip('should combine multiple --allowed-tools flags', async () => {
const rig = new TestRig(); const rig = new TestRig();
await rig.setup('should combine multiple --allowed-tools flags'); await rig.setup('should combine multiple --allowed-tools flags');
const { tool } = getLineCountCommand(); const { tool } = getLineCountCommand();
const prompt = const prompt =
`use both ${tool} and ls to count the number of lines in ` + `use both ${tool} and ls to count the number of lines in files in this ` +
`files in this directory`; `directory. Do not pipe these commands into each other, run them separately.`;
const result = await rig.run({ const result = await rig.run(
stdin: prompt, {
args: [ stdin: prompt,
`--allowed-tools=run_shell_command(${tool})`, yolo: false,
'--allowed-tools=run_shell_command(ls)', },
], `--allowed-tools=run_shell_command(${tool})`,
}); '--allowed-tools=run_shell_command(ls)',
);
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); for (const expected of ['ls', tool]) {
const foundToolCall = await rig.waitForToolCall(
'run_shell_command',
15000,
(args) => args.toLowerCase().includes(`"command": "${expected}`),
);
if (!foundToolCall) { if (!foundToolCall) {
printDebugInfo(rig, result, { printDebugInfo(rig, result, {
'Found tool call': foundToolCall, 'Found tool call': foundToolCall,
}); });
}
expect(
foundToolCall,
`Expected to find a run_shell_command tool call to "${expected}",` +
` got ${rig.readToolLogs().join('\n')}`,
).toBeTruthy();
} }
expect( const toolLogs = rig
foundToolCall, .readToolLogs()
'Expected to find a run_shell_command tool call', .filter((toolCall) => toolCall.toolRequest.name === 'run_shell_command');
).toBeTruthy(); expect(toolLogs.length, toolLogs.join('\n')).toBeGreaterThanOrEqual(2);
for (const toolLog of toolLogs) {
expect(
toolLog.toolRequest.success,
`Expected tool call ${toolLog} to succeed`,
).toBe(true);
}
}); });
it('should allow all with "ShellTool" and other specific tools', async () => { it('should allow all with "ShellTool" and other specific tools', async () => {
@@ -237,13 +293,14 @@ describe('run_shell_command', () => {
const { tool } = getLineCountCommand(); const { tool } = getLineCountCommand();
const prompt = `Please run the command "echo test-allow-all" and show me the output`; const prompt = `Please run the command "echo test-allow-all" and show me the output`;
const result = await rig.run({ const result = await rig.run(
stdin: prompt, {
args: [ stdin: prompt,
`--allowed-tools=run_shell_command(${tool})`, yolo: false,
'--allowed-tools=run_shell_command', },
], `--allowed-tools=run_shell_command(${tool})`,
}); '--allowed-tools=run_shell_command',
);
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000); const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -259,6 +316,13 @@ describe('run_shell_command', () => {
'Expected to find a run_shell_command tool call', 'Expected to find a run_shell_command tool call',
).toBeTruthy(); ).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
// Validate model output - will throw if no output, warn if missing expected content // Validate model output - will throw if no output, warn if missing expected content
validateModelOutput( validateModelOutput(
result, result,
+2 -2
View File
@@ -20,9 +20,9 @@ describe.skip('stdin context', () => {
await rig.waitForTelemetryEvent('api_request'); await rig.waitForTelemetryEvent('api_request');
const lastRequest = rig.readLastApiRequest(); const lastRequest = rig.readLastApiRequest();
expect(lastRequest).not.toBeNull();
const historyString = lastRequest.attributes.request_text; expect(lastRequest?.attributes?.request_text).toBeDefined();
const historyString = lastRequest!.attributes!.request_text!;
// TODO: This test currently fails in sandbox mode (Docker/Podman) because // TODO: This test currently fails in sandbox mode (Docker/Podman) because
// stdin content is not properly forwarded to the container when used // stdin content is not properly forwarded to the container when used
+22 -4
View File
@@ -158,6 +158,7 @@ interface ParsedLog {
function_args?: string; function_args?: string;
success?: boolean; success?: boolean;
duration_ms?: number; duration_ms?: number;
request_text?: string;
}; };
scopeMetrics?: { scopeMetrics?: {
metrics: { metrics: {
@@ -315,10 +316,19 @@ export class TestRig {
run( run(
promptOrOptions: promptOrOptions:
| string | string
| { prompt?: string; stdin?: string; stdinDoesNotEnd?: boolean }, | {
prompt?: string;
stdin?: string;
stdinDoesNotEnd?: boolean;
yolo?: boolean;
},
...args: string[] ...args: string[]
): Promise<string> { ): Promise<string> {
const { command, initialArgs } = this._getCommandAndArgs(['--yolo']); const yolo =
typeof promptOrOptions === 'string' || promptOrOptions.yolo !== false;
const { command, initialArgs } = this._getCommandAndArgs(
yolo ? ['--yolo'] : [],
);
const commandArgs = [...initialArgs]; const commandArgs = [...initialArgs];
const execOptions: { const execOptions: {
cwd: string; cwd: string;
@@ -566,7 +576,11 @@ export class TestRig {
); );
} }
async waitForToolCall(toolName: string, timeout?: number) { async waitForToolCall(
toolName: string,
timeout?: number,
matchArgs?: (args: string) => boolean,
) {
// Use environment-specific timeout // Use environment-specific timeout
if (!timeout) { if (!timeout) {
timeout = getDefaultTimeout(); timeout = getDefaultTimeout();
@@ -578,7 +592,11 @@ export class TestRig {
return poll( return poll(
() => { () => {
const toolLogs = this.readToolLogs(); const toolLogs = this.readToolLogs();
return toolLogs.some((log) => log.toolRequest.name === toolName); return toolLogs.some(
(log) =>
log.toolRequest.name === toolName &&
(matchArgs?.call(this, log.toolRequest.args) ?? true),
);
}, },
timeout, timeout,
100, 100,
+3 -2
View File
@@ -28,13 +28,14 @@ describe('write_file', () => {
} }
const allTools = rig.readToolLogs(); const allTools = rig.readToolLogs();
expect(foundToolCall, 'Expected to find a write_file tool call').toBeTruthy( expect(
foundToolCall,
createToolCallErrorMessage( createToolCallErrorMessage(
'write_file', 'write_file',
allTools.map((t) => t.toolRequest.name), allTools.map((t) => t.toolRequest.name),
result, result,
), ),
); ).toBeTruthy();
// Validate model output - will throw if no output, warn if missing expected content // Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, 'dad.txt', 'Write file test'); validateModelOutput(result, 'dad.txt', 'Write file test');