2025-06-16 08:27:29 -07:00
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
2025-12-15 11:11:08 -08:00
import { describe , it , expect , beforeEach , afterEach } from 'vitest' ;
2026-02-05 10:07:47 -08:00
import {
TestRig ,
printDebugInfo ,
assertModelHasOutput ,
checkModelOutputContent ,
} from './test-helper.js' ;
2025-10-06 12:15:21 -07:00
import { getShellConfiguration } from '../packages/core/src/utils/shell-utils.js' ;
const { shell } = getShellConfiguration ( ) ;
/**
 * Picks the line-counting command for the detected `shell`.
 *
 * @returns `command`, the text interpolated into test prompts, and `tool`,
 * the bare executable/cmdlet name used to build `--allowed-tools` patterns.
 */
function getLineCountCommand(): { command: string; tool: string } {
  switch (shell) {
    case 'powershell':
      return { command: 'Measure-Object -Line', tool: 'Measure-Object' };
    case 'cmd':
      return { command: 'find /c /v', tool: 'find' };
    case 'bash':
    default:
      return { command: 'wc -l', tool: 'wc' };
  }
}
2025-06-16 08:27:29 -07:00
2025-10-16 17:25:30 -07:00
/**
 * Returns a command string for the detected `shell` that the
 * "rejects invalid shell expressions" test presents as having invalid syntax.
 *
 * NOTE(review): the split operators (`| |`, `> >`) look deliberate, since that
 * is what makes each command malformed; do not collapse them. The test matches
 * this exact text against the recorded tool args, so the value must carry no
 * leading or trailing whitespace.
 */
function getInvalidCommand(): string {
  switch (shell) {
    case 'powershell':
      return 'Get-ChildItem | | Select-Object';
    case 'cmd':
      return 'dir | | findstr foo';
    case 'bash':
    default:
      return 'echo "hello" > > file';
  }
}
/**
 * Names the directory-listing command for the detected `shell`; tests use it
 * as the allowlisted command.
 */
function getAllowedListCommand(): string {
  if (shell === 'powershell') {
    return 'Get-ChildItem';
  }
  if (shell === 'cmd') {
    return 'dir';
  }
  // 'bash' and every other shell value fall back to `ls`.
  return 'ls';
}
/**
 * Builds a file-reading command for the detected `shell`.
 *
 * @param testFile Path of the file to read; it is wrapped in double quotes.
 * @returns `command`, the full command line, and `tool`, the bare
 * executable/cmdlet name that tests search for in recorded tool args.
 */
function getDisallowedFileReadCommand(testFile: string): {
  command: string;
  tool: string;
} {
  // No padding inside the quotes: extra spaces would change the path itself.
  const quotedPath = `"${testFile}"`;
  switch (shell) {
    case 'powershell':
      return { command: `Get-Content ${quotedPath}`, tool: 'Get-Content' };
    case 'cmd':
      return { command: `type ${quotedPath}`, tool: 'type' };
    case 'bash':
    default:
      return { command: `cat ${quotedPath}`, tool: 'cat' };
  }
}
2025-10-23 16:55:01 -04:00
/**
 * Builds an `<echo> "foo" && <list command>` chain for the detected `shell`.
 *
 * @returns `allowPattern`, the echo command name to put on the allowlist
 * (`Write-Output` for PowerShell, `echo` otherwise), and `command`, the
 * chained command whose second segment comes from `getAllowedListCommand()`.
 */
function getChainedEchoCommand(): { allowPattern: string; command: string } {
  const secondCommand = getAllowedListCommand();
  // 'cmd', 'bash' and any other shell all use plain `echo`.
  const allowPattern = shell === 'powershell' ? 'Write-Output' : 'echo';
  return {
    allowPattern,
    command: `${allowPattern} "foo" && ${secondCommand}`,
  };
}
2025-08-12 15:57:27 -07:00
describe ( 'run_shell_command' , ( ) = > {
2025-12-15 11:11:08 -08:00
// A new TestRig is created before each test and cleaned up after it.
let rig: TestRig;

beforeEach(() => {
  rig = new TestRig();
});

afterEach(async () => {
  await rig.cleanup();
});
2025-08-12 15:57:27 -07:00
// Sends the prompt as a CLI argument and expects a run_shell_command tool
// call plus the echoed text in the model output.
it('should be able to run a shell command', async () => {
  await rig.setup('should be able to run a shell command', {
    settings: { tools: { core: ['run_shell_command'] } },
  });

  const prompt = `Please run the command "echo hello-world" and show me the output`;

  const result = await rig.run({ args: prompt });

  const foundToolCall = await rig.waitForToolCall('run_shell_command');
  const containsExpectedText = result.includes('hello-world');

  // Add debugging information
  if (!foundToolCall || !containsExpectedText) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      'Contains hello-world': containsExpectedText,
    });
  }

  expect(
    foundToolCall,
    'Expected to find a run_shell_command tool call',
  ).toBeTruthy();

  assertModelHasOutput(result);
  checkModelOutputContent(result, {
    expectedContent: ['hello-world', 'exit code 0'],
    testName: 'Shell command test',
  });
});
2025-07-05 08:27:22 -07:00
2025-08-12 15:57:27 -07:00
// Same scenario as the argument-based test, but the prompt is delivered on
// stdin.
it('should be able to run a shell command via stdin', async () => {
  await rig.setup('should be able to run a shell command via stdin', {
    settings: { tools: { core: ['run_shell_command'] } },
  });

  const prompt = `Please run the command "echo test-stdin" and show me what it outputs`;

  const result = await rig.run({ stdin: prompt });

  const foundToolCall = await rig.waitForToolCall('run_shell_command');
  const containsExpectedText = result.includes('test-stdin');

  // Add debugging information
  if (!foundToolCall || !containsExpectedText) {
    printDebugInfo(rig, result, {
      'Test type': 'Stdin test',
      'Found tool call': foundToolCall,
      'Contains test-stdin': containsExpectedText,
    });
  }

  expect(
    foundToolCall,
    'Expected to find a run_shell_command tool call',
  ).toBeTruthy();

  assertModelHasOutput(result);
  checkModelOutputContent(result, {
    expectedContent: 'test-stdin',
    testName: 'Shell command stdin test',
  });
});
2025-09-25 17:32:40 -07:00
2025-10-24 13:04:40 -07:00
// Allowlists only the line-count tool via --allowed-tools and expects the
// resulting run_shell_command call to succeed.
it.skip('should run allowed sub-command in non-interactive mode', async () => {
  await rig.setup('should run allowed sub-command in non-interactive mode');

  const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
  const { tool, command } = getLineCountCommand();
  const allowedTools = `run_shell_command(${tool})`;
  const prompt = `use ${command} to tell me how many lines there are in ${testFile}`;

  // Provide the prompt via stdin to simulate non-interactive mode
  const result = await rig.run({
    args: [`--allowed-tools=${allowedTools}`],
    stdin: prompt,
    approvalMode: 'default',
  });

  const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);

  if (!foundToolCall) {
    const toolLogs = rig.readToolLogs().map(({ toolRequest }) => ({
      name: toolRequest.name,
      success: toolRequest.success,
      args: toolRequest.args,
    }));

    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      'Allowed tools flag': allowedTools,
      Prompt: prompt,
      'Tool logs': toolLogs,
      Result: result,
    });
  }

  expect(
    foundToolCall,
    'Expected to find a run_shell_command tool call',
  ).toBeTruthy();

  // `find` + `?.` makes a missing log fail the assertion instead of throwing
  // a TypeError on `undefined.toolRequest`.
  const toolCall = rig
    .readToolLogs()
    .find((log) => log.toolRequest.name === 'run_shell_command');
  expect(toolCall?.toolRequest.success).toBe(true);
});
2025-11-09 14:36:13 -08:00
// Allowlists run_shell_command without a parenthesised sub-command and
// expects the tool call to succeed.
it.skip('should succeed with no parens in non-interactive mode', async () => {
  await rig.setup('should succeed with no parens in non-interactive mode');

  const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
  const { command } = getLineCountCommand();
  const prompt = `use ${command} to tell me how many lines there are in ${testFile}`;

  const result = await rig.run({
    args: '--allowed-tools=run_shell_command',
    stdin: prompt,
    approvalMode: 'default',
  });

  const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);

  if (!foundToolCall) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
    });
  }

  expect(
    foundToolCall,
    'Expected to find a run_shell_command tool call',
  ).toBeTruthy();

  // `find` + `?.` makes a missing log fail the assertion instead of throwing
  // a TypeError on `undefined.toolRequest`.
  const toolCall = rig
    .readToolLogs()
    .find((log) => log.toolRequest.name === 'run_shell_command');
  expect(toolCall?.toolRequest.success).toBe(true);
});
2026-01-21 10:43:48 -05:00
// Runs the line-count prompt with approvalMode 'yolo' and expects the
// run_shell_command call to succeed.
it('should succeed in yolo mode', async () => {
  const isWindows = process.platform === 'win32';
  await rig.setup('should succeed in yolo mode', {
    settings: {
      tools: { core: ['run_shell_command'] },
      // The interactive shell is switched off only on Windows.
      shell: isWindows ? { enableInteractiveShell: false } : undefined,
    },
  });

  const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
  const { command } = getLineCountCommand();
  const prompt = `use ${command} to tell me how many lines there are in ${testFile}`;

  const result = await rig.run({
    args: prompt,
    approvalMode: 'yolo',
  });

  const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);

  if (!foundToolCall) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
    });
  }

  expect(
    foundToolCall,
    'Expected to find a run_shell_command tool call',
  ).toBeTruthy();

  // `find` + `?.` makes a missing log fail the assertion instead of throwing
  // a TypeError on `undefined.toolRequest`.
  const toolCall = rig
    .readToolLogs()
    .find((log) => log.toolRequest.name === 'run_shell_command');
  expect(toolCall?.toolRequest.success).toBe(true);
});
2025-10-24 13:04:40 -07:00
// Allowlists the line-count tool through the `ShellTool(...)` spelling and
// expects a successful run_shell_command call.
it.skip('should work with ShellTool alias', async () => {
  await rig.setup('should work with ShellTool alias');

  const testFile = rig.createFile('test.txt', 'Lorem\nIpsum\nDolor\n');
  const { tool, command } = getLineCountCommand();
  const allowedTools = `ShellTool(${tool})`;
  const prompt = `use ${command} to tell me how many lines there are in ${testFile}`;

  const result = await rig.run({
    args: `--allowed-tools=${allowedTools}`,
    stdin: prompt,
    approvalMode: 'default',
  });

  const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);

  if (!foundToolCall) {
    const toolLogs = rig.readToolLogs().map(({ toolRequest }) => ({
      name: toolRequest.name,
      success: toolRequest.success,
      args: toolRequest.args,
    }));

    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      'Allowed tools flag': allowedTools,
      Prompt: prompt,
      'Tool logs': toolLogs,
      Result: result,
    });
  }

  expect(
    foundToolCall,
    'Expected to find a run_shell_command tool call',
  ).toBeTruthy();

  // `find` + `?.` makes a missing log fail the assertion instead of throwing
  // a TypeError on `undefined.toolRequest`.
  const toolCall = rig
    .readToolLogs()
    .find((log) => log.toolRequest.name === 'run_shell_command');
  expect(toolCall?.toolRequest.success).toBe(true);
});
2025-10-14 11:36:49 -07:00
// TODO(#11062): Un-skip this once we can make it reliable by using hard coded
// model responses.
// Passes two separate --allowed-tools flags and expects a successful
// run_shell_command call for each allowlisted command.
it.skip('should combine multiple --allowed-tools flags', async () => {
  await rig.setup('should combine multiple --allowed-tools flags');

  const { tool, command } = getLineCountCommand();
  const prompt =
    `use both ${command} and ls to count the number of lines in files in this ` +
    `directory. Do not pipe these commands into each other, run them separately.`;

  const result = await rig.run({
    args: [
      `--allowed-tools=run_shell_command(${tool})`,
      '--allowed-tools=run_shell_command(ls)',
    ],
    stdin: prompt,
    approvalMode: 'default',
  });

  // `for...of` yields the command names; `for...in` would yield the array
  // indices "0" and "1", so neither command would ever be looked for.
  for (const expected of ['ls', tool]) {
    // Args are lowercased for the comparison, so the needle must be too
    // (e.g. 'Measure-Object').
    const needle = `"command": "${expected.toLowerCase()}`;
    const foundToolCall = await rig.waitForToolCall(
      'run_shell_command',
      15000,
      (args) => args.toLowerCase().includes(needle),
    );

    if (!foundToolCall) {
      printDebugInfo(rig, result, {
        'Found tool call': foundToolCall,
      });
    }

    // Serialise the logs explicitly; joining objects prints "[object Object]".
    expect(
      foundToolCall,
      `Expected to find a run_shell_command tool call to "${expected}",` +
        ` got ${JSON.stringify(rig.readToolLogs(), null, 2)}`,
    ).toBeTruthy();
  }

  const toolLogs = rig
    .readToolLogs()
    .filter((toolCall) => toolCall.toolRequest.name === 'run_shell_command');
  expect(
    toolLogs.length,
    JSON.stringify(toolLogs, null, 2),
  ).toBeGreaterThanOrEqual(2);

  for (const toolLog of toolLogs) {
    expect(
      toolLog.toolRequest.success,
      `Expected tool call ${JSON.stringify(toolLog)} to succeed`,
    ).toBe(true);
  }
});
2025-10-23 09:50:50 -04:00
// Allowlists only the listing command, asks the model to run a file-read
// command instead, and expects that tool call to be recorded as failed.
it('should reject commands not on the allowlist', async () => {
  await rig.setup('should reject commands not on the allowlist', {
    settings: { tools: { core: ['run_shell_command'] } },
  });

  const testFile = rig.createFile('test.txt', 'Disallowed command check\n');
  const allowedCommand = getAllowedListCommand();
  const disallowed = getDisallowedFileReadCommand(testFile);
  const disallowedTool = disallowed.tool.toLowerCase();
  const prompt =
    `I am testing the allowed tools configuration. ` +
    `Attempt to run "${disallowed.command}" to read the contents of ${testFile}. ` +
    `If the command fails because it is not permitted, respond with the single word FAIL. ` +
    `If it succeeds, respond with SUCCESS.`;

  const result = await rig.run({
    args: `--allowed-tools=run_shell_command(${allowedCommand})`,
    stdin: prompt,
    approvalMode: 'default',
  });

  if (!result.toLowerCase().includes('fail')) {
    printDebugInfo(rig, result, {
      Result: result,
      AllowedCommand: allowedCommand,
      DisallowedCommand: disallowed.command,
    });
  }
  expect(result).toContain('FAIL');

  const foundToolCall = await rig.waitForToolCall(
    'run_shell_command',
    15000,
    (args) => args.toLowerCase().includes(disallowedTool),
  );

  if (!foundToolCall) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      ToolLogs: rig.readToolLogs(),
    });
  }
  expect(foundToolCall).toBe(true);

  const toolLogs = rig
    .readToolLogs()
    .filter((toolLog) => toolLog.toolRequest.name === 'run_shell_command');
  const failureLog = toolLogs.find((toolLog) =>
    toolLog.toolRequest.args.toLowerCase().includes(disallowedTool),
  );

  if (!failureLog || failureLog.toolRequest.success) {
    printDebugInfo(rig, result, {
      ToolLogs: toolLogs,
      DisallowedTool: disallowed.tool,
    });
  }

  expect(
    failureLog,
    'Expected failing run_shell_command invocation',
  ).toBeTruthy();
  expect(failureLog?.toolRequest.success).toBe(false);
});
2025-10-24 14:25:54 -04:00
// TODO(#11966): Deflake this test and re-enable once the underlying race is resolved.
// Sends a `!{...}` shell injection whose first segment is allowlisted and
// whose second is not; any recorded run_shell_command call must be unsuccessful.
it.skip('should reject chained commands when only the first segment is allowlisted in non-interactive mode', async () => {
  await rig.setup(
    'should reject chained commands when only the first segment is allowlisted',
  );

  const chained = getChainedEchoCommand();
  const shellInjection = `!{${chained.command}}`;

  await rig.run({
    args: `--allowed-tools=ShellTool(${chained.allowPattern})`,
    stdin: `${shellInjection}\n`,
    approvalMode: 'default',
  });

  // CLI should refuse to execute the chained command without scheduling run_shell_command.
  const toolLogs = rig
    .readToolLogs()
    .filter((log) => log.toolRequest.name === 'run_shell_command');

  // Success is false because tool is in the scheduled state.
  for (const log of toolLogs) {
    expect(log.toolRequest.success).toBe(false);
    expect(log.toolRequest.args).toContain('&&');
  }
});
2025-10-10 10:48:24 -07:00
// Combines a sub-command-specific allowlist entry with a bare
// run_shell_command entry and expects an unrelated echo command to succeed.
it('should allow all with "ShellTool" and other specific tools', async () => {
  await rig.setup(
    'should allow all with "ShellTool" and other specific tools',
    {
      settings: { tools: { core: ['run_shell_command'] } },
    },
  );

  const { tool } = getLineCountCommand();
  const prompt = `Please run the command "echo test-allow-all" and show me the output`;

  const result = await rig.run({
    args: [
      `--allowed-tools=run_shell_command(${tool})`,
      '--allowed-tools=run_shell_command',
    ],
    stdin: prompt,
    approvalMode: 'default',
  });

  const foundToolCall = await rig.waitForToolCall('run_shell_command', 15000);

  if (!foundToolCall || !result.includes('test-allow-all')) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      Result: result,
    });
  }

  expect(
    foundToolCall,
    'Expected to find a run_shell_command tool call',
  ).toBeTruthy();

  // `find` + `?.` makes a missing log fail the assertion instead of throwing
  // a TypeError on `undefined.toolRequest`.
  const toolCall = rig
    .readToolLogs()
    .find((log) => log.toolRequest.name === 'run_shell_command');
  expect(toolCall?.toolRequest.success).toBe(true);

  assertModelHasOutput(result);
  checkModelOutputContent(result, {
    expectedContent: 'test-allow-all',
    testName: 'Shell command stdin allow all',
  });
});
2025-09-25 17:32:40 -07:00
// Sets a uniquely valued variable on process.env, asks the model to echo it,
// and expects the value in the output. The variable is removed afterwards.
it('should propagate environment variables to the child process', async () => {
  await rig.setup('should propagate environment variables', {
    settings: { tools: { core: ['run_shell_command'] } },
  });

  const varName = 'GEMINI_CLI_TEST_VAR';
  const varValue = `test-value-${Math.random().toString(36).substring(7)}`;
  process.env[varName] = varValue;

  try {
    const prompt = `Use echo to learn the value of the environment variable named ${varName} and tell me what it is.`;
    const result = await rig.run({ args: prompt });

    const foundToolCall = await rig.waitForToolCall('run_shell_command');
    const containsValue = result.includes(varValue);

    if (!foundToolCall || !containsValue) {
      printDebugInfo(rig, result, {
        'Found tool call': foundToolCall,
        'Contains varValue': containsValue,
      });
    }

    expect(
      foundToolCall,
      'Expected to find a run_shell_command tool call',
    ).toBeTruthy();

    assertModelHasOutput(result);
    checkModelOutputContent(result, {
      expectedContent: varValue,
      testName: 'Env var propagation test',
    });
    expect(result).toContain(varValue);
  } finally {
    // Always remove the variable, even when an assertion above throws.
    delete process.env[varName];
  }
});
2025-10-21 08:01:13 -07:00
// Creates a randomly named file, asks the model to list the directory, and
// expects that file name in the output.
it.skip('should run a platform-specific file listing command', async () => {
  await rig.setup('should run platform-specific file listing');

  const fileName = `test-file-${Math.random().toString(36).substring(7)}.txt`;
  rig.createFile(fileName, 'test content');

  const prompt = `Run a shell command to list the files in the current directory and tell me what they are.`;
  const result = await rig.run({ args: prompt });

  const foundToolCall = await rig.waitForToolCall('run_shell_command');
  const containsFileName = result.includes(fileName);

  // Debugging info
  if (!foundToolCall || !containsFileName) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      'Contains fileName': containsFileName,
    });
  }

  expect(
    foundToolCall,
    'Expected to find a run_shell_command tool call',
  ).toBeTruthy();

  assertModelHasOutput(result);
  checkModelOutputContent(result, {
    expectedContent: fileName,
    testName: 'Platform-specific listing test',
  });
  expect(result).toContain(fileName);
});
2025-10-16 17:25:30 -07:00
// Asks the model to run a syntactically invalid command and expects the
// matching run_shell_command call to be recorded as unsuccessful.
it('rejects invalid shell expressions', async () => {
  await rig.setup('rejects invalid shell expressions', {
    settings: {
      tools: {
        core: ['run_shell_command'],
        allowed: ['run_shell_command(echo)'], // Specifically allow echo
      },
    },
  });

  const invalidCommand = getInvalidCommand();
  const result = await rig.run({
    args: `I am testing the error handling of the run_shell_command tool. Please attempt to run the following command, which I know has invalid syntax: \`${invalidCommand}\`. If the command fails as expected, please return the word FAIL, otherwise return the word SUCCESS.`,
    approvalMode: 'default', // Use default mode so safety fallback triggers confirmation
  });
  expect(result).toContain('FAIL');

  // JSON-escape the command (minus the surrounding quotes) so it is compared
  // in the same escaped form as the recorded args string.
  const escapedInvalidCommand = JSON.stringify(invalidCommand).slice(1, -1);
  const needle = escapedInvalidCommand.toLowerCase();

  const foundToolCall = await rig.waitForToolCall(
    'run_shell_command',
    15000,
    (args) => args.toLowerCase().includes(needle),
  );

  if (!foundToolCall) {
    printDebugInfo(rig, result, {
      'Found tool call': foundToolCall,
      EscapedCommand: escapedInvalidCommand,
      ToolLogs: rig.readToolLogs(),
    });
  }
  expect(foundToolCall).toBe(true);

  const toolLogs = rig
    .readToolLogs()
    .filter((toolLog) => toolLog.toolRequest.name === 'run_shell_command');
  const failureLog = toolLogs.find((toolLog) =>
    toolLog.toolRequest.args.toLowerCase().includes(needle),
  );

  if (!failureLog || failureLog.toolRequest.success) {
    printDebugInfo(rig, result, {
      ToolLogs: toolLogs,
      EscapedCommand: escapedInvalidCommand,
    });
  }

  expect(
    failureLog,
    'Expected failing run_shell_command invocation for invalid syntax',
  ).toBeTruthy();
  expect(failureLog?.toolRequest.success).toBe(false);
});
2025-06-16 08:27:29 -07:00
} ) ;