mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-01 23:44:15 -07:00
test(evals): add behavioral eval for file creation and write_file tool selection (#26292)
This commit is contained in:
@@ -0,0 +1,132 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('file_creation_behavior', () => {
|
||||
// Behavioral eval: the agent is asked for exactly one new file
// (src/logger.ts) in a workspace seeded with package.json and src/index.ts.
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'should create a new file in the correct directory when asked',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      version: '1.0.0',
      type: 'module',
    }),
    'src/index.ts': 'console.log("hello");',
  },
  prompt:
    'Please create a new file called src/logger.ts containing a simple logging class. Do not modify any existing files.',
  assert: async (rig) => {
    // Step 1: at least one write_file tool call must appear in the tool log.
    const writeFileCallCount = rig
      .readToolLogs()
      .filter((entry) => entry.toolRequest?.name === 'write_file').length;
    expect(
      writeFileCallCount,
      'Expected a write_file call to create the new file',
    ).toBeGreaterThanOrEqual(1);

    // Step 2: the seeded files are untouched. index.ts is compared verbatim;
    // package.json is only checked through its parsed `name` field.
    expect(rig.readFile('src/index.ts')).toBe('console.log("hello");');
    const manifest = JSON.parse(rig.readFile('package.json'));
    expect(manifest.name).toBe('test-project');

    // Step 3: the requested file exists and is non-empty.
    expect(rig.readFile('src/logger.ts').length).toBeGreaterThan(0);
  },
});
|
||||
|
||||
// Behavioral eval: the workspace already contains config.json, and the prompt
// asks for a "new" config.json with port 8080 while telling the agent to check
// the existing file first. The agent must read the file before any write to it.
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'should not overwrite existing file when creating new file with same name',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      version: '1.0.0',
      type: 'module',
    }),
    'config.json': JSON.stringify({ port: 3000, env: 'production' }),
  },
  prompt:
    "Please create a new configuration file called config.json in the workspace. Ensure the port is set to 8080. Since there's already a config file there, make sure to check it first before making changes.",
  assert: async (rig) => {
    const logs = rig.readToolLogs();

    // Matches any path whose final segment is config.json, so a bare
    // 'config.json', './config.json', and an absolute path (POSIX or Windows
    // separators) are all accepted. The agent is free to pass any of these.
    const isConfigPath = (filePath: unknown): boolean =>
      typeof filePath === 'string' &&
      /(?:^|[\\/])config\.json$/.test(filePath);

    // Index of the first logged call to `toolName` that targets config.json,
    // or -1 when there is none. Args may be logged as a JSON string or as an
    // already-parsed object; entries with unparsable args are skipped.
    const findConfigCallIndex = (toolName: string): number =>
      logs.findIndex((log) => {
        if (log.toolRequest?.name !== toolName) return false;
        try {
          const rawArgs = log.toolRequest.args;
          const args =
            typeof rawArgs === 'string' ? JSON.parse(rawArgs) : rawArgs;
          return isConfigPath(args?.file_path);
        } catch {
          return false;
        }
      });

    const targetReadFileIndex = findConfigCallIndex('read_file');
    const targetWriteFileIndex = findConfigCallIndex('write_file');

    expect(
      targetReadFileIndex,
      'Expected read_file to be called to inspect config.json before overwriting it',
    ).toBeGreaterThanOrEqual(0);

    // The ordering is only enforced when write_file was used on config.json;
    // the final content check below applies either way.
    if (targetWriteFileIndex !== -1) {
      expect(
        targetReadFileIndex,
        'Expected read_file to be invoked before write_file for safety',
      ).toBeLessThan(targetWriteFileIndex);
    }

    // Also check the resulting config.json content
    const configContent = rig.readFile('config.json');
    expect(configContent).toContain('8080');
  },
});
|
||||
|
||||
// Behavioral eval: the agent must create two related files under src/auth/
// in a workspace that initially holds only package.json.
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'should scaffold multiple related files in correct locations',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      version: '1.0.0',
      type: 'module',
    }),
  },
  prompt:
    'Please scaffold auth validation and types by creating two new files: src/auth/validator.ts and src/auth/types.ts with relevant exports. Do not modify existing files.',
  assert: async (rig) => {
    // Read both scaffolded files up front, then require each to be non-empty.
    const [validatorSource, typesSource] = [
      'src/auth/validator.ts',
      'src/auth/types.ts',
    ].map((relativePath) => rig.readFile(relativePath));

    expect(validatorSource.length).toBeGreaterThan(0);
    expect(typesSource.length).toBeGreaterThan(0);
  },
});
|
||||
});
|
||||
@@ -78,4 +78,37 @@ describe('git repo eval', () => {
|
||||
expect(commitCalls.length).toBeGreaterThanOrEqual(1);
|
||||
},
|
||||
});
|
||||
|
||||
/**
 * Ensures that when the agent is prompted to commit its changes, it does not
 * stage everything at once with `git add .`, `git add -A`, or `git add --all`.
 *
 * Targeted staging such as `git add .gitignore` or `git add ./src/index.ts`
 * is allowed and must not be counted as a violation.
 */
evalTest('USUALLY_PASSES', {
  suiteName: 'default',
  suiteType: 'behavioral',
  name: 'should not stage changes via git add . when prompted to commit',
  prompt:
    'Make a targeted fix for the bug in index.ts without building, installing anything, or adding tests. Then, stage and commit your changes.',
  files: FILES,
  assert: async (rig, _result) => {
    // Matches `git add` whose first argument is `.`, `./`, `-a`, or `--all`.
    // It is tested against the lower-cased command, so `-A` is covered by `-a`.
    // The lookahead requires the argument to end there (end of string,
    // whitespace, or a shell separator), so paths that merely start with a
    // dot are not flagged.
    const stageAllPattern = /\bgit\s+add\s+(?:\.\/?|-a|--all)(?=$|[\s;&|)])/;

    const toolLogs = rig.readToolLogs();
    const gitAddAllCalls = toolLogs.filter((log) => {
      if (log.toolRequest?.name !== 'run_shell_command') return false;
      try {
        const args = JSON.parse(log.toolRequest.args);
        if (typeof args?.command !== 'string') return false;
        return stageAllPattern.test(args.command.toLowerCase());
      } catch {
        // Unparsable args cannot be inspected; treat them as not a violation.
        return false;
      }
    });

    expect(
      gitAddAllCalls.length,
      'Expected no blanket staging via git add . / -A / --all',
    ).toBe(0);
  },
});
|
||||
});
|
||||
|
||||
@@ -1340,10 +1340,11 @@ You are running outside of a sandbox container, directly on the user's system. F
|
||||
- "Commit the change" -> add changed files and commit.
|
||||
- "Wrap up this PR for me" -> do not commit.
|
||||
- When asked to commit changes or prepare a commit, always start by gathering information using shell commands:
|
||||
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` as needed.
|
||||
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add <file>...\` for specific files as needed.
|
||||
- \`git diff HEAD\` to review all changes (including unstaged changes) to tracked files in work tree since last commit.
|
||||
- \`git diff --staged\` to review only staged changes when a partial commit makes sense or was requested by the user.
|
||||
- \`git log -n 3\` to review recent commit messages and match their style (verbosity, formatting, signature line, etc.)
|
||||
- Do not use \`git add .\` or \`git add -A\` unprompted as this can stage unwanted or untracked files. Instead, stage only the specific files that were changed or created as part of the task.
|
||||
- Combine shell commands whenever possible to save time/steps, e.g. \`git status && git diff HEAD && git log -n 3\`.
|
||||
- Always propose a draft commit message. Never just ask the user to give you the full commit message.
|
||||
- Prefer commit messages that are clear, concise, and focused more on "why" and less on "what".
|
||||
|
||||
@@ -364,10 +364,11 @@ export function renderGitRepo(options?: GitRepoOptions): string {
|
||||
- "Commit the change" -> add changed files and commit.
|
||||
- "Wrap up this PR for me" -> do not commit.
|
||||
- When asked to commit changes or prepare a commit, always start by gathering information using shell commands:
|
||||
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` as needed.
|
||||
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add <file>...\` for specific files as needed.
|
||||
- \`git diff HEAD\` to review all changes (including unstaged changes) to tracked files in work tree since last commit.
|
||||
- \`git diff --staged\` to review only staged changes when a partial commit makes sense or was requested by the user.
|
||||
- \`git log -n 3\` to review recent commit messages and match their style (verbosity, formatting, signature line, etc.)
|
||||
- Do not use \`git add .\` or \`git add -A\` unprompted as this can stage unwanted or untracked files. Instead, stage only the specific files that were changed or created as part of the task.
|
||||
- Combine shell commands whenever possible to save time/steps, e.g. \`git status && git diff HEAD && git log -n 3\`.
|
||||
- Always propose a draft commit message. Never just ask the user to give you the full commit message.
|
||||
- Prefer commit messages that are clear, concise, and focused more on "why" and less on "what".${gitRepoKeepUserInformed(options.interactive)}
|
||||
|
||||
@@ -495,10 +495,11 @@ export function renderGitRepo(options?: GitRepoOptions): string {
|
||||
- "Commit the change" -> add changed files and commit.
|
||||
- "Wrap up this PR for me" -> do not commit.
|
||||
- When asked to commit changes or prepare a commit, always start by gathering information using shell commands:
|
||||
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` as needed.
|
||||
- \`git status\` to ensure that all relevant files are tracked and staged, using \`git add <file>...\` for specific files as needed.
|
||||
- \`git diff HEAD\` to review all changes (including unstaged changes) to tracked files in work tree since last commit.
|
||||
- \`git diff --staged\` to review only staged changes when a partial commit makes sense or was requested by the user.
|
||||
- \`git log -n 3\` to review recent commit messages and match their style (verbosity, formatting, signature line, etc.)
|
||||
- Do not use \`git add .\` or \`git add -A\` unprompted as this can stage unwanted or untracked files. Instead, stage only the specific files that were changed or created as part of the task.
|
||||
- Combine shell commands whenever possible to save time/steps, e.g. \`git status && git diff HEAD && git log -n 3\`.
|
||||
- Always propose a draft commit message. Never just ask the user to give you the full commit message.
|
||||
- Prefer commit messages that are clear, concise, and focused more on "why" and less on "what".${gitRepoKeepUserInformed(options.interactive)}
|
||||
|
||||
Reference in New Issue
Block a user