diff --git a/evals/botched-replace.eval.ts b/evals/botched-replace.eval.ts new file mode 100644 index 0000000000..18c03508b5 --- /dev/null +++ b/evals/botched-replace.eval.ts @@ -0,0 +1,66 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { expect, describe } from 'vitest'; +import { evalTest } from './test-helper.js'; + +describe('botched replace eval', () => { + evalTest('ALWAYS_PASSES', { + name: 'should perform multiple edits to the same file sequentially across turns', + files: { + 'build.bzl': `"""my-internal-build-system""" + +load( + "//core/build/rules:my_rules.bzl", + "target_alpha", + "target_beta", + "target_gamma", +) + +target = my_rules.Target.create( + build_package = "//src/project/web/server", + visibility = ["//visibility:private"], +) +`, + 'README.md': '# Project\nTODO: Fill me in.', + }, + prompt: + 'The file contents are standard. Please immediately apply these updates using the replace tool: 1. In build.bzl, remove "target_alpha" from the load statement. 2. In build.bzl, change the visibility to ["//visibility:public"]. 3. In README.md, replace "TODO: Fill me in." with "This is the My Project Web Server". Do not read the files first, just execute the replacements right away to save time.', + assert: async (rig) => { + const logs = rig.readToolLogs(); + + // Group tool calls by turn (prompt_id) + const callsPerTurn: Record = {}; + for (const log of logs) { + if (log.toolRequest) { + const pid = log.toolRequest.prompt_id || 'unknown'; + if (!callsPerTurn[pid]) callsPerTurn[pid] = []; + callsPerTurn[pid].push(log.toolRequest); + } + } + + for (const [pid, calls] of Object.entries(callsPerTurn)) { + const replaceCallsForManifest = calls.filter((c) => { + const args = JSON.parse(c.args); + return c.name === 'replace' && args.file_path === 'build.bzl'; + }); + + expect( + replaceCallsForManifest.length, + `Turn ${pid} made multiple replace calls to build.bzl, which violates the concurrency mandate.`, + ).toBeLessThanOrEqual(1); + } + + // Verify changes were actually made + const finalManifest = rig.readFile('build.bzl'); + expect(finalManifest).not.toContain('"target_alpha"'); + expect(finalManifest).toContain('"//visibility:public"'); + + const finalReadme = rig.readFile('README.md'); + expect(finalReadme).toContain('This is the My Project Web Server'); + }, + }); +});