Add behavioral eval for incorrect replace.

This commit is contained in:
Christian Gunderman
2026-03-24 15:54:33 -07:00
parent e6cd5d208c
commit e4cdad0d55
+66
View File
@@ -0,0 +1,66 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { expect, describe } from 'vitest';
import { evalTest } from './test-helper.js';
describe('botched replace eval', () => {
evalTest('ALWAYS_PASSES', {
name: 'should perform multiple edits to the same file sequentially across turns',
files: {
'build.bzl': `"""my-internal-build-system"""
load(
"//core/build/rules:my_rules.bzl",
"target_alpha",
"target_beta",
"target_gamma",
)
target = my_rules.Target.create(
build_package = "//src/project/web/server",
visibility = ["//visibility:private"],
)
`,
'README.md': '# Project\nTODO: Fill me in.',
},
prompt:
'The file contents are standard. Please immediately apply these updates using the replace tool: 1. In build.bzl, remove "target_alpha" from the load statement. 2. In build.bzl, change the visibility to ["//visibility:public"]. 3. In README.md, replace "TODO: Fill me in." with "This is the My Project Web Server". Do not read the files first, just execute the replacements right away to save time.',
assert: async (rig) => {
const logs = rig.readToolLogs();
// Group tool calls by turn (prompt_id)
const callsPerTurn: Record<string, any[]> = {};
for (const log of logs) {
if (log.toolRequest) {
const pid = log.toolRequest.prompt_id || 'unknown';
if (!callsPerTurn[pid]) callsPerTurn[pid] = [];
callsPerTurn[pid].push(log.toolRequest);
}
}
for (const [pid, calls] of Object.entries(callsPerTurn)) {
const replaceCallsForManifest = calls.filter((c) => {
const args = JSON.parse(c.args);
return c.name === 'replace' && args.file_path === 'build.bzl';
});
expect(
replaceCallsForManifest.length,
`Turn ${pid} made multiple replace calls to build.bzl, which violates the concurrency mandate.`,
).toBeLessThanOrEqual(1);
}
// Verify changes were actually made
const finalManifest = rig.readFile('build.bzl');
expect(finalManifest).not.toContain('"target_alpha"');
expect(finalManifest).toContain('"//visibility:public"');
const finalReadme = rig.readFile('README.md');
expect(finalReadme).toContain('This is the My Project Web Server');
},
});
});