fix integration test static errors, and run_shell_command tests to actually be testing what they intend (#11050)

This commit is contained in:
Jacob MacDonald
2025-10-14 11:36:49 -07:00
committed by GitHub
parent 49b66733c2
commit 99c7108bb0
6 changed files with 188 additions and 104 deletions

View File

@@ -18,13 +18,14 @@ describe.skip('Interactive Mode', () => {
await rig.cleanup();
});
it('should trigger chat compression with /compress command', async () => {
// TODO(#11062): Make this test reliable by not using the actual Gemini model
it.skip('should trigger chat compression with /compress command', async () => {
await rig.setup('interactive-compress-test');
const run = await rig.runInteractive();
const longPrompt =
'Dont do anything except returning a 1000 token long paragragh with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
'Dont do anything except returning a 1000 token long paragraph with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
await run.type(longPrompt);
await run.type('\r');

View File

@@ -5,8 +5,8 @@
*/
// Unset NO_COLOR environment variable to ensure consistent theme behavior between local and CI test runs
if (process.env.NO_COLOR !== undefined) {
delete process.env.NO_COLOR;
if (process.env['NO_COLOR'] !== undefined) {
delete process.env['NO_COLOR'];
}
import {
@@ -60,21 +60,21 @@ export async function setup() {
console.error('Error cleaning up old test runs:', e);
}
process.env.INTEGRATION_TEST_FILE_DIR = runDir;
process.env.GEMINI_CLI_INTEGRATION_TEST = 'true';
process.env.TELEMETRY_LOG_FILE = join(runDir, 'telemetry.log');
process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
if (process.env.KEEP_OUTPUT) {
if (process.env['KEEP_OUTPUT']) {
console.log(`Keeping output for test run in: ${runDir}`);
}
process.env.VERBOSE = process.env.VERBOSE ?? 'false';
process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';
console.log(`\nIntegration test output directory: ${runDir}`);
}
export async function teardown() {
// Cleanup the test run directory unless KEEP_OUTPUT is set
if (process.env.KEEP_OUTPUT !== 'true' && runDir) {
if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
await rm(runDir, { recursive: true, force: true });
}

View File

@@ -95,65 +95,12 @@ describe('run_shell_command', () => {
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
// Provide the prompt via stdin to simulate non-interactive mode
const result = await rig.run({
stdin: prompt,
args: [`--allowed-tools=run_shell_command(${tool})`],
});
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
}
expect(
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
});
it('should succeed with no parens in non-interactive mode', async () => {
const rig = new TestRig();
await rig.setup('should succeed with no parens in non-interactive mode');
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run({
stdin: prompt,
args: ['--allowed-tools=run_shell_command'],
});
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
}
expect(
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
});
it('should succeed with --yolo mode', async () => {
const rig = new TestRig();
await rig.setup('should succeed with --yolo mode');
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run(
{
prompt: prompt,
stdin: prompt,
yolo: false,
},
'--yolo',
`--allowed-tools=run_shell_command(${tool})`,
);
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -168,6 +115,84 @@ describe('run_shell_command', () => {
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
});
it('should succeed with no parens in non-interactive mode', async () => {
const rig = new TestRig();
await rig.setup('should succeed with no parens in non-interactive mode');
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run(
{
stdin: prompt,
yolo: false,
},
'--allowed-tools=run_shell_command',
);
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
}
expect(
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
});
it('should succeed with --yolo mode', async () => {
const rig = new TestRig();
await rig.setup('should succeed with --yolo mode');
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run({
prompt: prompt,
yolo: true,
});
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
}
expect(
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
});
it('should work with ShellTool alias', async () => {
@@ -178,10 +203,13 @@ describe('run_shell_command', () => {
const { tool } = getLineCountCommand();
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
const result = await rig.run({
stdin: prompt,
args: [`--allowed-tools=ShellTool(${tool})`],
});
const result = await rig.run(
{
stdin: prompt,
yolo: false,
},
`--allowed-tools=ShellTool(${tool})`,
);
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -195,37 +223,65 @@ describe('run_shell_command', () => {
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
});
it('should combine multiple --allowed-tools flags', async () => {
// TODO(#11062): Un-skip this once we can make it reliable by using hard coded
// model responses.
it.skip('should combine multiple --allowed-tools flags', async () => {
const rig = new TestRig();
await rig.setup('should combine multiple --allowed-tools flags');
const { tool } = getLineCountCommand();
const prompt =
`use both ${tool} and ls to count the number of lines in ` +
`files in this directory`;
`use both ${tool} and ls to count the number of lines in files in this ` +
`directory. Do not pipe these commands into each other, run them separately.`;
const result = await rig.run({
stdin: prompt,
args: [
`--allowed-tools=run_shell_command(${tool})`,
'--allowed-tools=run_shell_command(ls)',
],
});
const result = await rig.run(
{
stdin: prompt,
yolo: false,
},
`--allowed-tools=run_shell_command(${tool})`,
'--allowed-tools=run_shell_command(ls)',
);
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
for (const expected in ['ls', tool]) {
const foundToolCall = await rig.waitForToolCall(
'run_shell_command',
15000,
(args) => args.toLowerCase().includes(`"command": "${expected}`),
);
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
if (!foundToolCall) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
});
}
expect(
foundToolCall,
`Expected to find a run_shell_command tool call to "${expected}",` +
` got ${rig.readToolLogs().join('\n')}`,
).toBeTruthy();
}
expect(
foundToolCall,
'Expected to find a run_shell_command tool call',
).toBeTruthy();
const toolLogs = rig
.readToolLogs()
.filter((toolCall) => toolCall.toolRequest.name === 'run_shell_command');
expect(toolLogs.length, toolLogs.join('\n')).toBeGreaterThanOrEqual(2);
for (const toolLog of toolLogs) {
expect(
toolLog.toolRequest.success,
`Expected tool call ${toolLog} to succeed`,
).toBe(true);
}
});
it('should allow all with "ShellTool" and other specific tools', async () => {
@@ -237,13 +293,14 @@ describe('run_shell_command', () => {
const { tool } = getLineCountCommand();
const prompt = `Please run the command "echo test-allow-all" and show me the output`;
const result = await rig.run({
stdin: prompt,
args: [
`--allowed-tools=run_shell_command(${tool})`,
'--allowed-tools=run_shell_command',
],
});
const result = await rig.run(
{
stdin: prompt,
yolo: false,
},
`--allowed-tools=run_shell_command(${tool})`,
'--allowed-tools=run_shell_command',
);
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
@@ -259,6 +316,13 @@ describe('run_shell_command', () => {
'Expected to find a run_shell_command tool call',
).toBeTruthy();
const toolCall = rig
.readToolLogs()
.filter(
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
)[0];
expect(toolCall.toolRequest.success).toBe(true);
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(
result,

View File

@@ -20,9 +20,9 @@ describe.skip('stdin context', () => {
await rig.waitForTelemetryEvent('api_request');
const lastRequest = rig.readLastApiRequest();
expect(lastRequest).not.toBeNull();
const historyString = lastRequest.attributes.request_text;
expect(lastRequest?.attributes?.request_text).toBeDefined();
const historyString = lastRequest!.attributes!.request_text!;
// TODO: This test currently fails in sandbox mode (Docker/Podman) because
// stdin content is not properly forwarded to the container when used

View File

@@ -158,6 +158,7 @@ interface ParsedLog {
function_args?: string;
success?: boolean;
duration_ms?: number;
request_text?: string;
};
scopeMetrics?: {
metrics: {
@@ -315,10 +316,19 @@ export class TestRig {
run(
promptOrOptions:
| string
| { prompt?: string; stdin?: string; stdinDoesNotEnd?: boolean },
| {
prompt?: string;
stdin?: string;
stdinDoesNotEnd?: boolean;
yolo?: boolean;
},
...args: string[]
): Promise<string> {
const { command, initialArgs } = this._getCommandAndArgs(['--yolo']);
const yolo =
typeof promptOrOptions === 'string' || promptOrOptions.yolo !== false;
const { command, initialArgs } = this._getCommandAndArgs(
yolo ? ['--yolo'] : [],
);
const commandArgs = [...initialArgs];
const execOptions: {
cwd: string;
@@ -566,7 +576,11 @@ export class TestRig {
);
}
async waitForToolCall(toolName: string, timeout?: number) {
async waitForToolCall(
toolName: string,
timeout?: number,
matchArgs?: (args: string) => boolean,
) {
// Use environment-specific timeout
if (!timeout) {
timeout = getDefaultTimeout();
@@ -578,7 +592,11 @@ export class TestRig {
return poll(
() => {
const toolLogs = this.readToolLogs();
return toolLogs.some((log) => log.toolRequest.name === toolName);
return toolLogs.some(
(log) =>
log.toolRequest.name === toolName &&
(matchArgs?.call(this, log.toolRequest.args) ?? true),
);
},
timeout,
100,

View File

@@ -28,13 +28,14 @@ describe('write_file', () => {
}
const allTools = rig.readToolLogs();
expect(foundToolCall, 'Expected to find a write_file tool call').toBeTruthy(
expect(
foundToolCall,
createToolCallErrorMessage(
'write_file',
allTools.map((t) => t.toolRequest.name),
result,
),
);
).toBeTruthy();
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, 'dad.txt', 'Write file test');