mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-18 10:01:29 -07:00
fix integration test static errors, and run_shell_command tests to actually be testing what they intend (#11050)
This commit is contained in:
@@ -18,13 +18,14 @@ describe.skip('Interactive Mode', () => {
|
||||
await rig.cleanup();
|
||||
});
|
||||
|
||||
it('should trigger chat compression with /compress command', async () => {
|
||||
// TODO(#11062): Make this test reliable by not using the actual Gemini model
|
||||
it.skip('should trigger chat compression with /compress command', async () => {
|
||||
await rig.setup('interactive-compress-test');
|
||||
|
||||
const run = await rig.runInteractive();
|
||||
|
||||
const longPrompt =
|
||||
'Dont do anything except returning a 1000 token long paragragh with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
|
||||
'Dont do anything except returning a 1000 token long paragraph with the <name of the scientist who discovered theory of relativity> at the end to indicate end of response. This is a moderately long sentence.';
|
||||
|
||||
await run.type(longPrompt);
|
||||
await run.type('\r');
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
*/
|
||||
|
||||
// Unset NO_COLOR environment variable to ensure consistent theme behavior between local and CI test runs
|
||||
if (process.env.NO_COLOR !== undefined) {
|
||||
delete process.env.NO_COLOR;
|
||||
if (process.env['NO_COLOR'] !== undefined) {
|
||||
delete process.env['NO_COLOR'];
|
||||
}
|
||||
|
||||
import {
|
||||
@@ -60,21 +60,21 @@ export async function setup() {
|
||||
console.error('Error cleaning up old test runs:', e);
|
||||
}
|
||||
|
||||
process.env.INTEGRATION_TEST_FILE_DIR = runDir;
|
||||
process.env.GEMINI_CLI_INTEGRATION_TEST = 'true';
|
||||
process.env.TELEMETRY_LOG_FILE = join(runDir, 'telemetry.log');
|
||||
process.env['INTEGRATION_TEST_FILE_DIR'] = runDir;
|
||||
process.env['GEMINI_CLI_INTEGRATION_TEST'] = 'true';
|
||||
process.env['TELEMETRY_LOG_FILE'] = join(runDir, 'telemetry.log');
|
||||
|
||||
if (process.env.KEEP_OUTPUT) {
|
||||
if (process.env['KEEP_OUTPUT']) {
|
||||
console.log(`Keeping output for test run in: ${runDir}`);
|
||||
}
|
||||
process.env.VERBOSE = process.env.VERBOSE ?? 'false';
|
||||
process.env['VERBOSE'] = process.env['VERBOSE'] ?? 'false';
|
||||
|
||||
console.log(`\nIntegration test output directory: ${runDir}`);
|
||||
}
|
||||
|
||||
export async function teardown() {
|
||||
// Cleanup the test run directory unless KEEP_OUTPUT is set
|
||||
if (process.env.KEEP_OUTPUT !== 'true' && runDir) {
|
||||
if (process.env['KEEP_OUTPUT'] !== 'true' && runDir) {
|
||||
await rm(runDir, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
|
||||
@@ -95,65 +95,12 @@ describe('run_shell_command', () => {
|
||||
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
|
||||
|
||||
// Provide the prompt via stdin to simulate non-interactive mode
|
||||
const result = await rig.run({
|
||||
stdin: prompt,
|
||||
args: [`--allowed-tools=run_shell_command(${tool})`],
|
||||
});
|
||||
|
||||
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
|
||||
|
||||
if (!foundToolCall) {
|
||||
printDebugInfo(rig, result, {
|
||||
'Found tool call': foundToolCall,
|
||||
});
|
||||
}
|
||||
|
||||
expect(
|
||||
foundToolCall,
|
||||
'Expected to find a run_shell_command tool call',
|
||||
).toBeTruthy();
|
||||
});
|
||||
|
||||
it('should succeed with no parens in non-interactive mode', async () => {
|
||||
const rig = new TestRig();
|
||||
await rig.setup('should succeed with no parens in non-interactive mode');
|
||||
|
||||
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
|
||||
const { tool } = getLineCountCommand();
|
||||
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
|
||||
|
||||
const result = await rig.run({
|
||||
stdin: prompt,
|
||||
args: ['--allowed-tools=run_shell_command'],
|
||||
});
|
||||
|
||||
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
|
||||
|
||||
if (!foundToolCall) {
|
||||
printDebugInfo(rig, result, {
|
||||
'Found tool call': foundToolCall,
|
||||
});
|
||||
}
|
||||
|
||||
expect(
|
||||
foundToolCall,
|
||||
'Expected to find a run_shell_command tool call',
|
||||
).toBeTruthy();
|
||||
});
|
||||
|
||||
it('should succeed with --yolo mode', async () => {
|
||||
const rig = new TestRig();
|
||||
await rig.setup('should succeed with --yolo mode');
|
||||
|
||||
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
|
||||
const { tool } = getLineCountCommand();
|
||||
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
|
||||
|
||||
const result = await rig.run(
|
||||
{
|
||||
prompt: prompt,
|
||||
stdin: prompt,
|
||||
yolo: false,
|
||||
},
|
||||
'--yolo',
|
||||
`--allowed-tools=run_shell_command(${tool})`,
|
||||
);
|
||||
|
||||
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
|
||||
@@ -168,6 +115,84 @@ describe('run_shell_command', () => {
|
||||
foundToolCall,
|
||||
'Expected to find a run_shell_command tool call',
|
||||
).toBeTruthy();
|
||||
|
||||
const toolCall = rig
|
||||
.readToolLogs()
|
||||
.filter(
|
||||
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
|
||||
)[0];
|
||||
expect(toolCall.toolRequest.success).toBe(true);
|
||||
});
|
||||
|
||||
it('should succeed with no parens in non-interactive mode', async () => {
|
||||
const rig = new TestRig();
|
||||
await rig.setup('should succeed with no parens in non-interactive mode');
|
||||
|
||||
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
|
||||
const { tool } = getLineCountCommand();
|
||||
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
|
||||
|
||||
const result = await rig.run(
|
||||
{
|
||||
stdin: prompt,
|
||||
yolo: false,
|
||||
},
|
||||
'--allowed-tools=run_shell_command',
|
||||
);
|
||||
|
||||
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
|
||||
|
||||
if (!foundToolCall) {
|
||||
printDebugInfo(rig, result, {
|
||||
'Found tool call': foundToolCall,
|
||||
});
|
||||
}
|
||||
|
||||
expect(
|
||||
foundToolCall,
|
||||
'Expected to find a run_shell_command tool call',
|
||||
).toBeTruthy();
|
||||
|
||||
const toolCall = rig
|
||||
.readToolLogs()
|
||||
.filter(
|
||||
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
|
||||
)[0];
|
||||
expect(toolCall.toolRequest.success).toBe(true);
|
||||
});
|
||||
|
||||
it('should succeed with --yolo mode', async () => {
|
||||
const rig = new TestRig();
|
||||
await rig.setup('should succeed with --yolo mode');
|
||||
|
||||
const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
|
||||
const { tool } = getLineCountCommand();
|
||||
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
|
||||
|
||||
const result = await rig.run({
|
||||
prompt: prompt,
|
||||
yolo: true,
|
||||
});
|
||||
|
||||
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
|
||||
|
||||
if (!foundToolCall) {
|
||||
printDebugInfo(rig, result, {
|
||||
'Found tool call': foundToolCall,
|
||||
});
|
||||
}
|
||||
|
||||
expect(
|
||||
foundToolCall,
|
||||
'Expected to find a run_shell_command tool call',
|
||||
).toBeTruthy();
|
||||
|
||||
const toolCall = rig
|
||||
.readToolLogs()
|
||||
.filter(
|
||||
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
|
||||
)[0];
|
||||
expect(toolCall.toolRequest.success).toBe(true);
|
||||
});
|
||||
|
||||
it('should work with ShellTool alias', async () => {
|
||||
@@ -178,10 +203,13 @@ describe('run_shell_command', () => {
|
||||
const { tool } = getLineCountCommand();
|
||||
const prompt = `use ${tool} to tell me how many lines there are in ${testFile}`;
|
||||
|
||||
const result = await rig.run({
|
||||
stdin: prompt,
|
||||
args: [`--allowed-tools=ShellTool(${tool})`],
|
||||
});
|
||||
const result = await rig.run(
|
||||
{
|
||||
stdin: prompt,
|
||||
yolo: false,
|
||||
},
|
||||
`--allowed-tools=ShellTool(${tool})`,
|
||||
);
|
||||
|
||||
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
|
||||
|
||||
@@ -195,37 +223,65 @@ describe('run_shell_command', () => {
|
||||
foundToolCall,
|
||||
'Expected to find a run_shell_command tool call',
|
||||
).toBeTruthy();
|
||||
|
||||
const toolCall = rig
|
||||
.readToolLogs()
|
||||
.filter(
|
||||
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
|
||||
)[0];
|
||||
expect(toolCall.toolRequest.success).toBe(true);
|
||||
});
|
||||
|
||||
it('should combine multiple --allowed-tools flags', async () => {
|
||||
// TODO(#11062): Un-skip this once we can make it reliable by using hard coded
|
||||
// model responses.
|
||||
it.skip('should combine multiple --allowed-tools flags', async () => {
|
||||
const rig = new TestRig();
|
||||
await rig.setup('should combine multiple --allowed-tools flags');
|
||||
|
||||
const { tool } = getLineCountCommand();
|
||||
const prompt =
|
||||
`use both ${tool} and ls to count the number of lines in ` +
|
||||
`files in this directory`;
|
||||
`use both ${tool} and ls to count the number of lines in files in this ` +
|
||||
`directory. Do not pipe these commands into each other, run them separately.`;
|
||||
|
||||
const result = await rig.run({
|
||||
stdin: prompt,
|
||||
args: [
|
||||
`--allowed-tools=run_shell_command(${tool})`,
|
||||
'--allowed-tools=run_shell_command(ls)',
|
||||
],
|
||||
});
|
||||
const result = await rig.run(
|
||||
{
|
||||
stdin: prompt,
|
||||
yolo: false,
|
||||
},
|
||||
`--allowed-tools=run_shell_command(${tool})`,
|
||||
'--allowed-tools=run_shell_command(ls)',
|
||||
);
|
||||
|
||||
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
|
||||
for (const expected in ['ls', tool]) {
|
||||
const foundToolCall = await rig.waitForToolCall(
|
||||
'run_shell_command',
|
||||
15000,
|
||||
(args) => args.toLowerCase().includes(`"command": "${expected}`),
|
||||
);
|
||||
|
||||
if (!foundToolCall) {
|
||||
printDebugInfo(rig, result, {
|
||||
'Found tool call': foundToolCall,
|
||||
});
|
||||
if (!foundToolCall) {
|
||||
printDebugInfo(rig, result, {
|
||||
'Found tool call': foundToolCall,
|
||||
});
|
||||
}
|
||||
|
||||
expect(
|
||||
foundToolCall,
|
||||
`Expected to find a run_shell_command tool call to "${expected}",` +
|
||||
` got ${rig.readToolLogs().join('\n')}`,
|
||||
).toBeTruthy();
|
||||
}
|
||||
|
||||
expect(
|
||||
foundToolCall,
|
||||
'Expected to find a run_shell_command tool call',
|
||||
).toBeTruthy();
|
||||
const toolLogs = rig
|
||||
.readToolLogs()
|
||||
.filter((toolCall) => toolCall.toolRequest.name === 'run_shell_command');
|
||||
expect(toolLogs.length, toolLogs.join('\n')).toBeGreaterThanOrEqual(2);
|
||||
for (const toolLog of toolLogs) {
|
||||
expect(
|
||||
toolLog.toolRequest.success,
|
||||
`Expected tool call ${toolLog} to succeed`,
|
||||
).toBe(true);
|
||||
}
|
||||
});
|
||||
|
||||
it('should allow all with "ShellTool" and other specific tools', async () => {
|
||||
@@ -237,13 +293,14 @@ describe('run_shell_command', () => {
|
||||
const { tool } = getLineCountCommand();
|
||||
const prompt = `Please run the command "echo test-allow-all" and show me the output`;
|
||||
|
||||
const result = await rig.run({
|
||||
stdin: prompt,
|
||||
args: [
|
||||
`--allowed-tools=run_shell_command(${tool})`,
|
||||
'--allowed-tools=run_shell_command',
|
||||
],
|
||||
});
|
||||
const result = await rig.run(
|
||||
{
|
||||
stdin: prompt,
|
||||
yolo: false,
|
||||
},
|
||||
`--allowed-tools=run_shell_command(${tool})`,
|
||||
'--allowed-tools=run_shell_command',
|
||||
);
|
||||
|
||||
const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);
|
||||
|
||||
@@ -259,6 +316,13 @@ describe('run_shell_command', () => {
|
||||
'Expected to find a run_shell_command tool call',
|
||||
).toBeTruthy();
|
||||
|
||||
const toolCall = rig
|
||||
.readToolLogs()
|
||||
.filter(
|
||||
(toolCall) => toolCall.toolRequest.name === 'run_shell_command',
|
||||
)[0];
|
||||
expect(toolCall.toolRequest.success).toBe(true);
|
||||
|
||||
// Validate model output - will throw if no output, warn if missing expected content
|
||||
validateModelOutput(
|
||||
result,
|
||||
|
||||
@@ -20,9 +20,9 @@ describe.skip('stdin context', () => {
|
||||
|
||||
await rig.waitForTelemetryEvent('api_request');
|
||||
const lastRequest = rig.readLastApiRequest();
|
||||
expect(lastRequest).not.toBeNull();
|
||||
|
||||
const historyString = lastRequest.attributes.request_text;
|
||||
expect(lastRequest?.attributes?.request_text).toBeDefined();
|
||||
const historyString = lastRequest!.attributes!.request_text!;
|
||||
|
||||
// TODO: This test currently fails in sandbox mode (Docker/Podman) because
|
||||
// stdin content is not properly forwarded to the container when used
|
||||
|
||||
@@ -158,6 +158,7 @@ interface ParsedLog {
|
||||
function_args?: string;
|
||||
success?: boolean;
|
||||
duration_ms?: number;
|
||||
request_text?: string;
|
||||
};
|
||||
scopeMetrics?: {
|
||||
metrics: {
|
||||
@@ -315,10 +316,19 @@ export class TestRig {
|
||||
run(
|
||||
promptOrOptions:
|
||||
| string
|
||||
| { prompt?: string; stdin?: string; stdinDoesNotEnd?: boolean },
|
||||
| {
|
||||
prompt?: string;
|
||||
stdin?: string;
|
||||
stdinDoesNotEnd?: boolean;
|
||||
yolo?: boolean;
|
||||
},
|
||||
...args: string[]
|
||||
): Promise<string> {
|
||||
const { command, initialArgs } = this._getCommandAndArgs(['--yolo']);
|
||||
const yolo =
|
||||
typeof promptOrOptions === 'string' || promptOrOptions.yolo !== false;
|
||||
const { command, initialArgs } = this._getCommandAndArgs(
|
||||
yolo ? ['--yolo'] : [],
|
||||
);
|
||||
const commandArgs = [...initialArgs];
|
||||
const execOptions: {
|
||||
cwd: string;
|
||||
@@ -566,7 +576,11 @@ export class TestRig {
|
||||
);
|
||||
}
|
||||
|
||||
async waitForToolCall(toolName: string, timeout?: number) {
|
||||
async waitForToolCall(
|
||||
toolName: string,
|
||||
timeout?: number,
|
||||
matchArgs?: (args: string) => boolean,
|
||||
) {
|
||||
// Use environment-specific timeout
|
||||
if (!timeout) {
|
||||
timeout = getDefaultTimeout();
|
||||
@@ -578,7 +592,11 @@ export class TestRig {
|
||||
return poll(
|
||||
() => {
|
||||
const toolLogs = this.readToolLogs();
|
||||
return toolLogs.some((log) => log.toolRequest.name === toolName);
|
||||
return toolLogs.some(
|
||||
(log) =>
|
||||
log.toolRequest.name === toolName &&
|
||||
(matchArgs?.call(this, log.toolRequest.args) ?? true),
|
||||
);
|
||||
},
|
||||
timeout,
|
||||
100,
|
||||
|
||||
@@ -28,13 +28,14 @@ describe('write_file', () => {
|
||||
}
|
||||
|
||||
const allTools = rig.readToolLogs();
|
||||
expect(foundToolCall, 'Expected to find a write_file tool call').toBeTruthy(
|
||||
expect(
|
||||
foundToolCall,
|
||||
createToolCallErrorMessage(
|
||||
'write_file',
|
||||
allTools.map((t) => t.toolRequest.name),
|
||||
result,
|
||||
),
|
||||
);
|
||||
).toBeTruthy();
|
||||
|
||||
// Validate model output - will throw if no output, warn if missing expected content
|
||||
validateModelOutput(result, 'dad.txt', 'Write file test');
|
||||
|
||||
Reference in New Issue
Block a user