From b3e6c28933ea23ed015a1899879938f834a51df1 Mon Sep 17 00:00:00 2001
From: AK
Date: Thu, 30 Apr 2026 20:45:56 -0700
Subject: [PATCH] test(evals): add behavioral eval for file creation and write_file tool selection (#26292)

Add evals/file_creation_behavior.eval.ts with three behavioral evals:
creating a single new file via write_file while leaving existing files
alone, reading an existing config.json before overwriting it, and
scaffolding several files under a nested directory.

Add a gitRepo eval asserting that the agent does not stage everything
(`git add .`, `git add -A`, `git add --all`) when asked to commit, and
update the git prompt snippets (plus their snapshot) to tell the model
to stage specific files only.

---
 evals/file_creation_behavior.eval.ts          | 146 ++++++++++++++++++
 evals/gitRepo.eval.ts                         |  33 +++++
 .../core/__snapshots__/prompts.test.ts.snap   |   3 +-
 packages/core/src/prompts/snippets.legacy.ts  |   3 +-
 packages/core/src/prompts/snippets.ts         |   3 +-
 5 files changed, 185 insertions(+), 3 deletions(-)
 create mode 100644 evals/file_creation_behavior.eval.ts

diff --git a/evals/file_creation_behavior.eval.ts b/evals/file_creation_behavior.eval.ts
new file mode 100644
index 0000000000..2092eadb5b
--- /dev/null
+++ b/evals/file_creation_behavior.eval.ts
@@ -0,0 +1,146 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('file_creation_behavior', () => {
+  /**
+   * Ensures that a request for a brand-new file is served with `write_file`
+   * and spot-checks that the pre-existing files were left alone.
+   */
+  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
+    name: 'should create a new file in the correct directory when asked',
+    files: {
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        version: '1.0.0',
+        type: 'module',
+      }),
+      'src/index.ts': 'console.log("hello");',
+    },
+    prompt:
+      'Please create a new file called src/logger.ts containing a simple logging class. Do not modify any existing files.',
+    assert: async (rig) => {
+      // 1) Verify write_file tool was called
+      const logs = rig.readToolLogs();
+      const writeFileCalls = logs.filter(
+        (log) => log.toolRequest?.name === 'write_file',
+      );
+      expect(
+        writeFileCalls.length,
+        'Expected a write_file call to create the new file',
+      ).toBeGreaterThanOrEqual(1);
+
+      // 2) Verify existing files were not modified
+      const indexContent = rig.readFile('src/index.ts');
+      expect(indexContent).toBe('console.log("hello");');
+
+      const pkgContent = rig.readFile('package.json');
+      expect(JSON.parse(pkgContent).name).toBe('test-project');
+
+      // 3) Verify new file is created
+      const loggerContent = rig.readFile('src/logger.ts');
+      expect(loggerContent.length).toBeGreaterThan(0);
+    },
+  });
+
+  /**
+   * Ensures that when the requested file name collides with an existing
+   * file, the agent calls `read_file` on it before any `write_file`.
+   */
+  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
+    name: 'should read existing file before overwriting it when asked to create a file with the same name',
+    files: {
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        version: '1.0.0',
+        type: 'module',
+      }),
+      'config.json': JSON.stringify({ port: 3000, env: 'production' }),
+    },
+    prompt:
+      "Please create a new configuration file called config.json in the workspace. Ensure the port is set to 8080. Since there's already a config file there, make sure to check it first before making changes.",
+    assert: async (rig) => {
+      // Verify that read_file was called on config.json before write_file.
+      // The tool may be handed a relative or an absolute path, so calls are
+      // matched on the final path segment instead of the exact string.
+      const logs = rig.readToolLogs();
+      const targetReadFileIndex = logs.findIndex((log) => {
+        if (log.toolRequest?.name !== 'read_file') return false;
+        try {
+          const args =
+            typeof log.toolRequest.args === 'string'
+              ? JSON.parse(log.toolRequest.args)
+              : log.toolRequest.args;
+          return /(?:^|[\\/])config\.json$/.test(String(args.file_path));
+        } catch {
+          return false;
+        }
+      });
+
+      const targetWriteFileIndex = logs.findIndex((log) => {
+        if (log.toolRequest?.name !== 'write_file') return false;
+        try {
+          const args =
+            typeof log.toolRequest.args === 'string'
+              ? JSON.parse(log.toolRequest.args)
+              : log.toolRequest.args;
+          return /(?:^|[\\/])config\.json$/.test(String(args.file_path));
+        } catch {
+          return false;
+        }
+      });
+
+      expect(
+        targetReadFileIndex,
+        'Expected read_file to be called to inspect config.json before overwriting it',
+      ).toBeGreaterThanOrEqual(0);
+
+      if (targetWriteFileIndex !== -1) {
+        expect(
+          targetReadFileIndex,
+          'Expected read_file to be invoked before write_file for safety',
+        ).toBeLessThan(targetWriteFileIndex);
+      }
+
+      // Also check the resulting config.json content
+      const configContent = rig.readFile('config.json');
+      expect(configContent).toContain('8080');
+    },
+  });
+
+  /**
+   * Ensures that several related files can be scaffolded in one request and
+   * that each one ends up, non-empty, at the nested path that was asked for.
+   */
+  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
+    name: 'should scaffold multiple related files in correct locations',
+    files: {
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        version: '1.0.0',
+        type: 'module',
+      }),
+    },
+    prompt:
+      'Please scaffold auth validation and types by creating two new files: src/auth/validator.ts and src/auth/types.ts with relevant exports. Do not modify existing files.',
+    assert: async (rig) => {
+      // Verify files are created in right place
+      const validatorContent = rig.readFile('src/auth/validator.ts');
+      const typesContent = rig.readFile('src/auth/types.ts');
+
+      expect(validatorContent.length).toBeGreaterThan(0);
+      expect(typesContent.length).toBeGreaterThan(0);
+    },
+  });
+});
diff --git a/evals/gitRepo.eval.ts b/evals/gitRepo.eval.ts
index b5dbd8a760..1f69ba7560 100644
--- a/evals/gitRepo.eval.ts
+++ b/evals/gitRepo.eval.ts
@@ -78,4 +78,37 @@ describe('git repo eval', () => {
       expect(commitCalls.length).toBeGreaterThanOrEqual(1);
     },
   });
+
+  /**
+   * Ensures that when the agent is prompted to commit its changes, it does not
+   * use `git add .` or `git add -A`.
+   */
+  evalTest('USUALLY_PASSES', {
+    suiteName: 'default',
+    suiteType: 'behavioral',
+    name: 'should not stage changes via git add . when prompted to commit',
+    prompt:
+      'Make a targeted fix for the bug in index.ts without building, installing anything, or adding tests. Then, stage and commit your changes.',
+    files: FILES,
+    assert: async (rig, _result) => {
+      const toolLogs = rig.readToolLogs();
+      const gitAddAllCalls = toolLogs.filter((log) => {
+        if (log.toolRequest.name !== 'run_shell_command') return false;
+        try {
+          const args = JSON.parse(log.toolRequest.args);
+          if (!args.command) return false;
+          const cmd = args.command.toLowerCase();
+          // Match `.`, `./`, `-a` (`-A` lowercased) or `--all` only as a
+          // whole token so that targeted adds such as `git add ./index.ts`
+          // or `git add .gitignore` are not counted as staging everything.
+          const addAll = /\bgit\s+add\s+(?:\.\/?|-a|--all)(?=$|[\s;&|)"'])/;
+          return addAll.test(cmd);
+        } catch {
+          return false;
+        }
+      });
+
+      expect(gitAddAllCalls.length).toBe(0);
+    },
+  });
 });
diff --git a/packages/core/src/core/__snapshots__/prompts.test.ts.snap b/packages/core/src/core/__snapshots__/prompts.test.ts.snap
index 6edb51cf34..2116b0cfd3 100644
--- a/packages/core/src/core/__snapshots__/prompts.test.ts.snap
+++ b/packages/core/src/core/__snapshots__/prompts.test.ts.snap
@@ -1340,10 +1340,11 @@ You are running outside of a sandbox container, directly on the user's system. F
   - "Commit the change" -> add changed files and commit.
   - "Wrap up this PR for me" -> do not commit.
 - When asked to commit changes or prepare a commit, always start by gathering information using shell commands:
-  - \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` as needed.
+  - \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` for specific files as needed.
   - \`git diff HEAD\` to review all changes (including unstaged changes) to tracked files in work tree since last commit.
   - \`git diff --staged\` to review only staged changes when a partial commit makes sense or was requested by the user.
   - \`git log -n 3\` to review recent commit messages and match their style (verbosity, formatting, signature line, etc.)
+- Do not use \`git add .\` or \`git add -A\` unprompted as this can stage unwanted or untracked files. Instead, stage only the specific files that were changed or created as part of the task.
 - Combine shell commands whenever possible to save time/steps, e.g. \`git status && git diff HEAD && git log -n 3\`.
 - Always propose a draft commit message. Never just ask the user to give you the full commit message.
 - Prefer commit messages that are clear, concise, and focused more on "why" and less on "what".
diff --git a/packages/core/src/prompts/snippets.legacy.ts b/packages/core/src/prompts/snippets.legacy.ts
index f2c8bb2b33..e8f65d7106 100644
--- a/packages/core/src/prompts/snippets.legacy.ts
+++ b/packages/core/src/prompts/snippets.legacy.ts
@@ -364,10 +364,11 @@ export function renderGitRepo(options?: GitRepoOptions): string {
   - "Commit the change" -> add changed files and commit.
   - "Wrap up this PR for me" -> do not commit.
 - When asked to commit changes or prepare a commit, always start by gathering information using shell commands:
-  - \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` as needed.
+  - \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` for specific files as needed.
   - \`git diff HEAD\` to review all changes (including unstaged changes) to tracked files in work tree since last commit.
   - \`git diff --staged\` to review only staged changes when a partial commit makes sense or was requested by the user.
   - \`git log -n 3\` to review recent commit messages and match their style (verbosity, formatting, signature line, etc.)
+- Do not use \`git add .\` or \`git add -A\` unprompted as this can stage unwanted or untracked files. Instead, stage only the specific files that were changed or created as part of the task.
 - Combine shell commands whenever possible to save time/steps, e.g. \`git status && git diff HEAD && git log -n 3\`.
 - Always propose a draft commit message. Never just ask the user to give you the full commit message.
 - Prefer commit messages that are clear, concise, and focused more on "why" and less on "what".${gitRepoKeepUserInformed(options.interactive)}
diff --git a/packages/core/src/prompts/snippets.ts b/packages/core/src/prompts/snippets.ts
index 385e8ffb22..5bd472fde5 100644
--- a/packages/core/src/prompts/snippets.ts
+++ b/packages/core/src/prompts/snippets.ts
@@ -495,10 +495,11 @@ export function renderGitRepo(options?: GitRepoOptions): string {
   - "Commit the change" -> add changed files and commit.
   - "Wrap up this PR for me" -> do not commit.
 - When asked to commit changes or prepare a commit, always start by gathering information using shell commands:
-  - \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` as needed.
+  - \`git status\` to ensure that all relevant files are tracked and staged, using \`git add ...\` for specific files as needed.
   - \`git diff HEAD\` to review all changes (including unstaged changes) to tracked files in work tree since last commit.
   - \`git diff --staged\` to review only staged changes when a partial commit makes sense or was requested by the user.
   - \`git log -n 3\` to review recent commit messages and match their style (verbosity, formatting, signature line, etc.)
+- Do not use \`git add .\` or \`git add -A\` unprompted as this can stage unwanted or untracked files. Instead, stage only the specific files that were changed or created as part of the task.
 - Combine shell commands whenever possible to save time/steps, e.g. \`git status && git diff HEAD && git log -n 3\`.
 - Always propose a draft commit message. Never just ask the user to give you the full commit message.
 - Prefer commit messages that are clear, concise, and focused more on "why" and less on "what".${gitRepoKeepUserInformed(options.interactive)}