test(evals): add behavioral eval for file creation and write_file tool selection (#26292)

This commit is contained in:
AK
2026-04-30 20:45:56 -07:00
committed by GitHub
parent 4e81f48646
commit b3e6c28933
5 changed files with 171 additions and 3 deletions
+132
View File
@@ -0,0 +1,132 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('file_creation_behavior', () => {
/**
 * Behavioral eval: asks the agent to add `src/logger.ts` to a small seeded
 * project and checks that it used `write_file`, produced a non-empty file,
 * and left the two seeded files exactly as they were.
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'should create a new file in the correct directory when asked',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      version: '1.0.0',
      type: 'module',
    }),
    'src/index.ts': 'console.log("hello");',
  },
  prompt:
    'Please create a new file called src/logger.ts containing a simple logging class. Do not modify any existing files.',
  assert: async (rig) => {
    // 1) Verify the write_file tool was called at least once.
    const writeFileCalls = rig
      .readToolLogs()
      .filter((log) => log.toolRequest?.name === 'write_file');
    expect(
      writeFileCalls.length,
      'Expected a write_file call to create the new file',
    ).toBeGreaterThanOrEqual(1);

    // 2) Verify existing files were not modified.
    const indexContent = rig.readFile('src/index.ts');
    expect(indexContent).toBe('console.log("hello");');

    // Compare the whole parsed document rather than a single field, so that
    // added, removed or changed keys are caught. Comparing parsed JSON (not
    // the raw string) keeps the check independent of whitespace.
    const pkgContent = rig.readFile('package.json');
    expect(
      JSON.parse(pkgContent),
      'Expected package.json to be left untouched',
    ).toEqual({
      name: 'test-project',
      version: '1.0.0',
      type: 'module',
    });

    // 3) Verify the new file exists and is not empty.
    const loggerContent = rig.readFile('src/logger.ts');
    expect(loggerContent.length).toBeGreaterThan(0);
  },
});
/**
 * Behavioral eval: the workspace already contains `config.json`, and the
 * prompt asks for a "new" config.json with port 8080. The agent is expected
 * to inspect the existing file with `read_file` before any `write_file` on
 * it, and the final file must mention port 8080.
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'should not overwrite existing file when creating new file with same name',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      version: '1.0.0',
      type: 'module',
    }),
    'config.json': JSON.stringify({ port: 3000, env: 'production' }),
  },
  prompt:
    "Please create a new configuration file called config.json in the workspace. Ensure the port is set to 8080. Since there's already a config file there, make sure to check it first before making changes.",
  assert: async (rig) => {
    const logs = rig.readToolLogs();

    // True when a tool call's arguments name config.json as `file_path`.
    // Arguments may be logged as a JSON string or as an object. The path is
    // matched on its final segment so that `config.json`, `./config.json`
    // and an absolute path into the workspace are all recognised.
    const targetsConfigJson = (rawArgs: unknown): boolean => {
      try {
        const parsed: unknown =
          typeof rawArgs === 'string' ? JSON.parse(rawArgs) : rawArgs;
        if (typeof parsed !== 'object' || parsed === null) return false;
        const filePath = (parsed as { file_path?: unknown }).file_path;
        if (typeof filePath !== 'string') return false;
        const segments = filePath.split(/[\\/]/);
        return segments[segments.length - 1] === 'config.json';
      } catch {
        // Malformed argument JSON: treat as "not a call on config.json".
        return false;
      }
    };

    // Position in the log of the first `toolName` call on config.json, or -1.
    const firstCallIndex = (toolName: string): number =>
      logs.findIndex(
        (log) =>
          log.toolRequest?.name === toolName &&
          targetsConfigJson(log.toolRequest?.args),
      );

    const targetReadFileIndex = firstCallIndex('read_file');
    const targetWriteFileIndex = firstCallIndex('write_file');

    expect(
      targetReadFileIndex,
      'Expected read_file to be called to inspect config.json before overwriting it',
    ).toBeGreaterThanOrEqual(0);

    // The ordering check only applies when write_file was used on
    // config.json; the content check below still runs either way.
    if (targetWriteFileIndex !== -1) {
      expect(
        targetReadFileIndex,
        'Expected read_file to be invoked before write_file for safety',
      ).toBeLessThan(targetWriteFileIndex);
    }

    // Also check the resulting config.json content.
    const configContent = rig.readFile('config.json');
    expect(configContent).toContain('8080');
  },
});
/**
 * Behavioral eval: asks the agent to scaffold two files under `src/auth/`
 * and passes when both can be read back with non-empty content.
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'should scaffold multiple related files in correct locations',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      version: '1.0.0',
      type: 'module',
    }),
  },
  prompt:
    'Please scaffold auth validation and types by creating two new files: src/auth/validator.ts and src/auth/types.ts with relevant exports. Do not modify existing files.',
  assert: async (rig) => {
    // Read both files first, then assert, so a missing file surfaces
    // before any emptiness check runs.
    const scaffolded = ['src/auth/validator.ts', 'src/auth/types.ts'].map(
      (relativePath) => rig.readFile(relativePath),
    );
    for (const source of scaffolded) {
      expect(source.length).toBeGreaterThan(0);
    }
  },
});
});
+33
View File
@@ -78,4 +78,37 @@ describe('git repo eval', () => {
expect(commitCalls.length).toBeGreaterThanOrEqual(1);
},
});
/**
 * Ensures that when the agent is prompted to commit its changes, it does not
 * stage everything at once with `git add .`, `git add -A` or `git add --all`.
 *
 * Only a bare `.` pathspec or the all-files flags count as a violation.
 * Targeted staging such as `git add ./src/index.ts` or `git add .gitignore`
 * is the desired behaviour and must not be flagged.
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'should not stage changes via git add . when prompted to commit',
  prompt:
    'Make a targeted fix for the bug in index.ts without building, installing anything, or adding tests. Then, stage and commit your changes.',
  files: FILES,
  assert: async (rig, _result) => {
    // Matches a `git add` invocation whose arguments, up to the next shell
    // separator, include `.`, `-A` or `--all` as a whole token. The lookahead
    // requires the token to end at whitespace, end of input or a separator,
    // which is what excludes `./path` and `.gitignore`. Case-insensitive, as
    // the original substring check was.
    const stageEverything =
      /\bgit\s+add\b[^;&|\n]*?\s(?:\.|-A|--all)(?=\s|$|[;&|)])/i;

    const gitAddAllCalls = rig.readToolLogs().filter((log) => {
      if (log.toolRequest?.name !== 'run_shell_command') return false;
      try {
        const rawArgs: unknown = log.toolRequest.args;
        // Accept args logged either as a JSON string or as an object.
        const parsed: unknown =
          typeof rawArgs === 'string' ? JSON.parse(rawArgs) : rawArgs;
        if (typeof parsed !== 'object' || parsed === null) return false;
        const command = (parsed as { command?: unknown }).command;
        return typeof command === 'string' && stageEverything.test(command);
      } catch {
        // Malformed argument JSON: nothing to inspect for this call.
        return false;
      }
    });

    expect(
      gitAddAllCalls.length,
      'Expected changes to be staged file-by-file, not via git add . / -A / --all',
    ).toBe(0);
  },
});
});
@@ -1340,10 +1340,11 @@ You are running outside of a sandbox container, directly on the user's system. F
- "Commit the change" -> add changed files and commit.
- "Wrap up this PR for me" -> do not commit.
- When asked to commit changes or prepare a commit, always start by gathering information using shell commands:
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` as needed.
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add <file>...\` for specific files as needed.
- \`git diff HEAD\` to review all changes (including unstaged changes) to tracked files in work tree since last commit.
- \`git diff --staged\` to review only staged changes when a partial commit makes sense or was requested by the user.
- \`git log -n 3\` to review recent commit messages and match their style (verbosity, formatting, signature line, etc.)
- Do not use \`git add .\` or \`git add -A\` unprompted as this can stage unwanted or untracked files. Instead, stage only the specific files that were changed or created as part of the task.
- Combine shell commands whenever possible to save time/steps, e.g. \`git status && git diff HEAD && git log -n 3\`.
- Always propose a draft commit message. Never just ask the user to give you the full commit message.
- Prefer commit messages that are clear, concise, and focused more on "why" and less on "what".
+2 -1
View File
@@ -364,10 +364,11 @@ export function renderGitRepo(options?: GitRepoOptions): string {
- "Commit the change" -> add changed files and commit.
- "Wrap up this PR for me" -> do not commit.
- When asked to commit changes or prepare a commit, always start by gathering information using shell commands:
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` as needed.
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add <file>...\` for specific files as needed.
- \`git diff HEAD\` to review all changes (including unstaged changes) to tracked files in work tree since last commit.
- \`git diff --staged\` to review only staged changes when a partial commit makes sense or was requested by the user.
- \`git log -n 3\` to review recent commit messages and match their style (verbosity, formatting, signature line, etc.)
- Do not use \`git add .\` or \`git add -A\` unprompted as this can stage unwanted or untracked files. Instead, stage only the specific files that were changed or created as part of the task.
- Combine shell commands whenever possible to save time/steps, e.g. \`git status && git diff HEAD && git log -n 3\`.
- Always propose a draft commit message. Never just ask the user to give you the full commit message.
- Prefer commit messages that are clear, concise, and focused more on "why" and less on "what".${gitRepoKeepUserInformed(options.interactive)}
+2 -1
View File
@@ -495,10 +495,11 @@ export function renderGitRepo(options?: GitRepoOptions): string {
- "Commit the change" -> add changed files and commit.
- "Wrap up this PR for me" -> do not commit.
- When asked to commit changes or prepare a commit, always start by gathering information using shell commands:
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` as needed.
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add <file>...\` for specific files as needed.
- \`git diff HEAD\` to review all changes (including unstaged changes) to tracked files in work tree since last commit.
- \`git diff --staged\` to review only staged changes when a partial commit makes sense or was requested by the user.
- \`git log -n 3\` to review recent commit messages and match their style (verbosity, formatting, signature line, etc.)
- Do not use \`git add .\` or \`git add -A\` unprompted as this can stage unwanted or untracked files. Instead, stage only the specific files that were changed or created as part of the task.
- Combine shell commands whenever possible to save time/steps, e.g. \`git status && git diff HEAD && git log -n 3\`.
- Always propose a draft commit message. Never just ask the user to give you the full commit message.
- Prefer commit messages that are clear, concise, and focused more on "why" and less on "what".${gitRepoKeepUserInformed(options.interactive)}