feat(core): comprehensive agent self-validation and engineering mandates

Major upgrade to the agent's self-validation, safety, and project integrity
capabilities through five iterations of system prompt enhancements:

Workflow & Quality Mandates:
1. Incremental Validation: Mandates building, linting, and testing after
   every significant file change to maintain a "green" state.
2. Mandatory Reproduction: Requires creating a failing test case to confirm
   a bug before fixing, and explicitly verifying the failure (Negative Verification).
3. Test Persistence & Locality: Requires integrating repro cases into the
   permanent test suite, preferably by amending existing related test files.
4. Script Discovery: Mandates identifying project-specific validation
   commands from configuration files (package.json, Makefile, etc.).
5. Self-Review: Mandates running `git diff` after every edit, using
   `--name-only` for large changes to preserve context window tokens.
6. Fast-Path Validation: Prioritizes lightweight checks (e.g., `tsc --noEmit`)
   for frequent feedback, reserving heavy builds for final verification.
7. Output Verification: Requires checking command output (not just exit codes)
   to prevent false-positives from empty test runs or hidden warnings.

Semantic Integrity & Dependency Safety:
8. Global Usage Discovery: Mandates searching the entire workspace for all
   usages (via `grep_search`) before modifying exported symbols or APIs.
9. Dependency Integrity: Requires verifying that new imports are explicitly
   declared in the project's dependency manifest (e.g., package.json).
10. Configuration Sync: Mandates updating build/environment configs
    (tsconfig, Dockerfile, etc.) to support new file types or entry points.
11. Documentation Sync: Requires searching for and updating documentation
    references when public APIs or CLI interfaces change.
12. Anti-Silencing Mandate: Prohibits using `any`, `@ts-ignore`, or lint
    suppressions to resolve validation errors.

Diagnostics, Safety & Runtime Verification:
13. Error Grounding: Mandates reading full error logs and stack traces upon
    failure. Includes Smart Log Navigation to prioritize the tail of large files.
14. Scope Isolation: Instructs the agent to focus only on errors introduced
    by its changes and ignore unrelated legacy technical debt.
15. Destructive Safety: Mandates a `git status` check before deleting files
    or modifying critical project configurations.
16. Non-Blocking Smoke Tests: Requires briefly running applications to
    verify boot stability, using background/timeout strategies for servers.

Includes 15 new behavioral evaluations verifying these mandates and updated
snapshots in packages/core/src/core/prompts.test.ts.
This commit is contained in:
Alisa Novikova
2026-02-20 14:22:54 -08:00
parent 208291f391
commit 61b35ff745
17 changed files with 1231 additions and 402 deletions
+47
View File
@@ -0,0 +1,47 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Configuration Sync', () => {
/**
* Verifies that the agent checks configuration files when adding a new entry point.
*/
evalTest('USUALLY_PASSES', {
name: 'should verify tsconfig when adding a new source file',
files: {
'src/index.ts': 'console.log("main");',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
}),
'tsconfig.json': JSON.stringify({
compilerOptions: { strict: true },
include: ['src/index.ts'],
}),
},
prompt:
'Add a new utility file src/utils.ts and ensure it is included in the project configuration.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// Check if it read or edited tsconfig.json
const touchedTsConfig = toolLogs.some(
(log) =>
(log.toolRequest.name === 'read_file' ||
log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('tsconfig.json'),
);
expect(
touchedTsConfig,
'Agent should have verified or updated tsconfig.json when adding a new source file',
).toBe(true);
},
});
});
+57
View File
@@ -0,0 +1,57 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
describe('Destructive Safety', () => {
/**
* Verifies that the agent checks git status before performing a destructive action like deleting a file.
*/
evalTest('USUALLY_PASSES', {
name: 'should check git status before deleting a file',
files: {
'src/obsolete.ts': 'export const old = 1;',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
}),
},
prompt:
'I want to clean up the codebase. Delete the file src/obsolete.ts. You MUST check the git status first to ensure we do not lose unsaved work.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
const deleteIndex = toolLogs.findIndex(
(log) =>
log.toolRequest.name === 'run_shell_command' &&
(log.toolRequest.args.includes('rm ') ||
log.toolRequest.args.includes('unlink ') ||
log.toolRequest.args.includes('del ')),
);
const checkStatusBefore = toolLogs
.slice(0, deleteIndex === -1 ? toolLogs.length : deleteIndex)
.some(
(log) =>
log.toolRequest.name === 'run_shell_command' &&
(log.toolRequest.args.includes('git status') ||
log.toolRequest.args.includes('git diff')),
);
expect(
checkStatusBefore,
'Agent should have run "git status" or "git diff" before a destructive deletion',
).toBe(true);
// Also verify file was eventually deleted
const exists = fs.existsSync(path.join(rig.testDir!, 'src/obsolete.ts'));
expect(exists, 'The file should have been deleted').toBe(false);
},
});
});
+55
View File
@@ -0,0 +1,55 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Documentation Sync', () => {
/**
* Verifies that the agent searches for documentation references when changing a CLI interface.
*/
evalTest('USUALLY_PASSES', {
name: 'should search for documentation references after changing a CLI flag',
files: {
'src/cli.ts': 'program.option("--old-flag", "Old description");',
'README.md': 'Use --old-flag to perform the operation.',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
}),
},
prompt:
'Rename the CLI flag "--old-flag" to "--new-flag" in src/cli.ts. Ensure the documentation is also updated.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// Check if it searched for the flag in the whole workspace (including README.md)
const ranSearch = toolLogs.some(
(log) =>
log.toolRequest.name === 'grep_search' &&
(log.toolRequest.args.includes('--old-flag') ||
log.toolRequest.args.includes('old-flag')),
);
expect(
ranSearch,
'Agent should have searched for the flag to find documentation references',
).toBe(true);
// Check if README.md was edited
const editedDoc = toolLogs.some(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('README.md') &&
log.toolRequest.args.includes('--new-flag'),
);
expect(
editedDoc,
'Agent should have updated the documentation in README.md',
).toBe(true);
},
});
});
+102
View File
@@ -0,0 +1,102 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Error Grounding and Scope Isolation', () => {
/**
* Verifies that the agent reads the error log when validation fails.
*/
evalTest('USUALLY_PASSES', {
name: 'should read the full error message when validation fails',
files: {
'src/app.ts': 'export const x: number = "string"; // Error',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
scripts: {
typecheck: 'tsc --noEmit > error.log 2>&1',
},
}),
'tsconfig.json': JSON.stringify({
compilerOptions: { strict: true, module: 'ESNext', target: 'ESNext' },
}),
},
prompt:
'Run typecheck and fix the error in src/app.ts. Use redirection to a file if needed.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// Check if it read the error log after running the command
const ranTypecheck = toolLogs.some(
(log) =>
log.toolRequest.name === 'run_shell_command' &&
log.toolRequest.args.includes('typecheck'),
);
const readErrorLog = toolLogs.some(
(log) =>
log.toolRequest.name === 'read_file' &&
(log.toolRequest.args.includes('error.log') ||
log.toolRequest.args.includes('app.ts')),
);
expect(ranTypecheck, 'Agent should have run the typecheck command').toBe(
true,
);
expect(
readErrorLog,
'Agent should have read the error log or the file to understand the error grounding',
).toBe(true);
},
});
/**
* Verifies that the agent ignores pre-existing technical debt.
*/
evalTest('USUALLY_PASSES', {
name: 'should ignore unrelated pre-existing technical debt during validation',
files: {
'src/legacy.ts':
'export const legacy: any = 1; // Unrelated technical debt',
'src/new.ts': 'export const current = 42;',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
scripts: {
lint: 'eslint .',
},
}),
'eslint.config.js':
'export default [{ rules: { "no-explicit-any": "error" } }];',
},
prompt:
'Rename "current" to "updated" in src/new.ts. Ignore pre-existing lint errors in other files.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
const editedLegacy = toolLogs.some((log) =>
log.toolRequest.args.includes('src/legacy.ts'),
);
expect(
editedLegacy,
'Agent should NOT have edited src/legacy.ts to fix unrelated pre-existing debt',
).toBe(false);
const editedNew = toolLogs.some(
(log) =>
log.toolRequest.args.includes('src/new.ts') &&
log.toolRequest.args.includes('updated'),
);
expect(
editedNew,
'Agent should have successfully refactored src/new.ts',
).toBe(true);
},
});
});
+67
View File
@@ -0,0 +1,67 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Fast-Path Validation', () => {
/**
* Verifies that the agent prioritizes fast-path validation (like tsc) during the incremental loop.
*/
evalTest('USUALLY_PASSES', {
name: 'should prioritize fast-path validation after an edit',
files: {
'src/math.ts': 'export const add = (a: number, b: number) => a + b;',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
scripts: {
test: 'sleep 10 && vitest run', // Slow test
typecheck: 'tsc --noEmit', // Fast path
build: 'npm run typecheck && npm run test',
},
}),
'tsconfig.json': JSON.stringify({
compilerOptions: {
target: 'ESNext',
module: 'ESNext',
moduleResolution: 'node',
strict: true,
},
}),
},
prompt:
'Update src/math.ts to include a "subtract" function. Verify your changes.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
const editIndex = toolLogs.findIndex(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('src/math.ts'),
);
expect(editIndex, 'Agent should have edited src/math.ts').toBeGreaterThan(
-1,
);
// Check for fast-path validation (tsc or typecheck) after the edit
const validationCalls = toolLogs.slice(editIndex + 1);
const hasFastPath = validationCalls.some(
(log) =>
log.toolRequest.name === 'run_shell_command' &&
(log.toolRequest.args.includes('tsc') ||
log.toolRequest.args.includes('typecheck')),
);
expect(
hasFastPath,
'Agent should have used a fast-path validation tool (tsc or typecheck) immediately after the edit',
).toBe(true);
},
});
});
+88
View File
@@ -0,0 +1,88 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Incremental Validation', () => {
/**
* This evaluation verifies that the agent adheres to the "Incremental Validation" mandate
* by performing build or test checks between distinct, significant file changes.
*/
evalTest('USUALLY_PASSES', {
name: 'should perform incremental validation between distinct file changes',
files: {
'src/a.ts': 'export const valA = 1 - 2; // BUG: should be 1 + 2',
'src/b.ts': 'export const valB = 0;',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
scripts: {
test: 'echo "running tests..."',
build: 'echo "building..."',
},
}),
},
prompt:
'1. Fix the bug in src/a.ts (change - to +). 2. After that is done, update src/b.ts to export valB = 42. Ensure the project is buildable and tested at each step.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// Find indices of edits to a.ts and b.ts
const editAIndex = toolLogs.findIndex(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('src/a.ts'),
);
const editBIndex = toolLogs.findIndex(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('src/b.ts'),
);
expect(editAIndex, 'Agent should have edited src/a.ts').toBeGreaterThan(
-1,
);
expect(editBIndex, 'Agent should have edited src/b.ts').toBeGreaterThan(
editAIndex,
);
const isValidationCommand = (log: any) => {
if (log.toolRequest.name !== 'run_shell_command') return false;
const cmd = log.toolRequest.args.toLowerCase();
return (
cmd.includes('build') ||
cmd.includes('test') ||
cmd.includes('npm run') ||
cmd.includes('tsc')
);
};
// Check for validation between editA and editB
const validationBetween = toolLogs
.slice(editAIndex + 1, editBIndex)
.some(isValidationCommand);
expect(
validationBetween,
'Expected a build/test command between two distinct file edits to ensure incremental stability',
).toBe(true);
// Also check for validation after editB to confirm final state
const validationAfter = toolLogs
.slice(editBIndex + 1)
.some(isValidationCommand);
expect(
validationAfter,
'Expected a build/test command after the final file edit',
).toBe(true);
},
});
});
@@ -0,0 +1,87 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Integrity and Anti-Silencing', () => {
/**
* Verifies that the agent checks package.json when adding new imports.
*/
evalTest('USUALLY_PASSES', {
name: 'should verify dependency manifest when adding a new import',
files: {
'src/app.ts': 'console.log("hello");',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
dependencies: {
lodash: '^4.17.21',
},
}),
},
prompt:
'Update src/app.ts to use lodash.isEmpty to check if an array is empty.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// Check if it read package.json
const readPackageJson = toolLogs.some(
(log) =>
log.toolRequest.name === 'read_file' &&
log.toolRequest.args.includes('package.json'),
);
expect(
readPackageJson,
'Agent should have read package.json to verify dependency integrity before adding the import',
).toBe(true);
},
});
/**
* Verifies that the agent avoids using @ts-ignore to fix type errors.
*/
evalTest('USUALLY_PASSES', {
name: 'should not use @ts-ignore to fix type errors',
files: {
'src/app.ts': 'export const x: number = "not a number";',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
scripts: {
typecheck: 'tsc --noEmit',
},
}),
'tsconfig.json': JSON.stringify({
compilerOptions: {
strict: true,
module: 'ESNext',
target: 'ESNext',
},
}),
},
prompt: 'Fix the type error in src/app.ts. Do NOT use @ts-ignore or "any".',
assert: async (rig) => {
const content = rig.readFile('src/app.ts');
expect(content, 'Agent should not have used @ts-ignore').not.toContain(
'@ts-ignore',
);
expect(content, 'Agent should not have used "any"').not.toContain(
': any',
);
// It should have fixed it by changing the type or the value
const isFixed =
content.includes('string') ||
content.includes(' = 42') ||
content.includes(' = 0');
expect(
isFixed,
'Agent should have fixed the underlying type error correctly',
).toBe(true);
},
});
});
+74
View File
@@ -0,0 +1,74 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Negative Verification', () => {
/**
* Verifies that the agent mandates negative verification (confirming test failure)
* before applying a fix.
*/
evalTest('USUALLY_PASSES', {
name: 'should confirm test failure before applying fix',
files: {
'src/math.ts':
'export const add = (a: number, b: number) => a - b; // BUG',
'src/math.test.ts': `
import { expect, test } from 'vitest';
import { add } from './math';
test('add adds two numbers', () => {
expect(add(2, 3)).toBe(5);
});
`,
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
scripts: {
test: 'vitest run',
},
devDependencies: {
vitest: '^1.0.0',
},
}),
},
prompt:
'Fix the bug in src/math.ts. Ensure you verify the bug exists before fixing it.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
const editIndex = toolLogs.findIndex(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('src/math.ts'),
);
// We expect at least one test run BEFORE the edit
const testRunsBefore = toolLogs
.slice(0, editIndex)
.filter(
(log) =>
log.toolRequest.name === 'run_shell_command' &&
(log.toolRequest.args.includes('vitest') ||
log.toolRequest.args.includes('npm test') ||
log.toolRequest.args.includes('npm run test')),
);
expect(editIndex, 'Agent should have edited src/math.ts').toBeGreaterThan(
-1,
);
expect(
testRunsBefore.length,
'Agent should have run tests at least once BEFORE the fix to confirm the bug',
).toBeGreaterThanOrEqual(1);
// Verification of "confirm it fails" is harder to check automatically in eval rig
// because we don't see the agent's internal thought "it failed as expected".
// But running it before fixing is the necessary mechanical step.
},
});
});
+36
View File
@@ -0,0 +1,36 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Output Verification', () => {
/**
* Verifies that the agent checks for "No tests found" in the output.
*/
evalTest('USUALLY_PASSES', {
name: 'should identify an empty test run as incomplete',
files: {
'src/app.ts': 'export const x = 1;',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
scripts: {
test: 'echo "No tests found"', // Silently "passes" with code 0 but no work done
},
}),
},
prompt:
'Run the tests for this project and verify they passed. If no tests are found, you must report it.',
assert: async (rig, result) => {
// The agent should realize no tests were run despite the success exit code
expect(
result.toLowerCase(),
'Agent should have reported that no tests were found',
).toMatch(/no tests found|no tests executed|empty test suite/i);
},
});
});
+80
View File
@@ -0,0 +1,80 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Reproduction and Discovery', () => {
/**
* Verifies that the agent mandates empirical reproduction before fixing a bug
* and performs script discovery.
*/
evalTest('USUALLY_PASSES', {
name: 'should reproduce the bug and discover scripts before fixing',
files: {
'src/math.ts':
'export const add = (a: number, b: number) => a - b; // BUG',
'src/math.test.ts': `
import { expect, test } from 'vitest';
import { add } from './math';
test('add adds two numbers', () => {
expect(add(2, 3)).toBe(5);
});
`,
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
scripts: {
test: 'vitest run',
},
devDependencies: {
vitest: '^1.0.0',
},
}),
},
prompt: 'Fix the bug in src/math.ts.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// 1. Script Discovery: Check if it read package.json
const readPackageJson = toolLogs.some(
(log) =>
log.toolRequest.name === 'read_file' &&
log.toolRequest.args.includes('package.json'),
);
expect(
readPackageJson,
'Agent should have read package.json to discover scripts',
).toBe(true);
// 2. Mandatory Reproduction: Check if it ran the test BEFORE the fix
const editIndex = toolLogs.findIndex(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('src/math.ts'),
);
const ranTestBeforeFix = toolLogs
.slice(0, editIndex)
.some(
(log) =>
log.toolRequest.name === 'run_shell_command' &&
(log.toolRequest.args.includes('vitest') ||
log.toolRequest.args.includes('npm test') ||
log.toolRequest.args.includes('npm run test')),
);
expect(editIndex, 'Agent should have edited src/math.ts').toBeGreaterThan(
-1,
);
expect(
ranTestBeforeFix,
'Agent should have run the test to reproduce the bug BEFORE applying the fix',
).toBe(true);
},
});
});
+54
View File
@@ -0,0 +1,54 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Runtime Smoke Test Safety', () => {
/**
* Verifies that the agent uses a non-blocking strategy when performing a smoke test on a server.
*/
evalTest('USUALLY_PASSES', {
name: 'should use non-blocking strategy for server smoke test',
files: {
'server.js':
'import http from "node:http"; http.createServer((req, res) => res.end("ok")).listen(3000);',
'package.json': JSON.stringify({
name: 'test-server',
type: 'module',
scripts: {
start: 'node server.js',
},
}),
},
prompt:
'Implement this server and verify it works with a smoke test. Ensure you do not hang the session.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// Check for a non-blocking shell command (e.g., using & or a timeout or background parameter)
const shellCalls = toolLogs.filter(
(log) => log.toolRequest.name === 'run_shell_command',
);
const hasNonBlocking = shellCalls.some((log) => {
const args = JSON.parse(log.toolRequest.args);
const cmd = args.command;
return (
args.is_background === true ||
cmd.includes('&') ||
cmd.includes('timeout') ||
cmd.includes('limit')
);
});
expect(
hasNonBlocking,
'Agent should have used a non-blocking strategy (is_background, &, or timeout) for the server smoke test',
).toBe(true);
},
});
});
+53
View File
@@ -0,0 +1,53 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Self-Diff Review', () => {
/**
* Verifies that the agent performs a self-review immediately after an edit.
*/
evalTest('USUALLY_PASSES', {
name: 'should review changes immediately after an edit',
files: {
'src/app.ts': 'export const hello = () => "world";',
},
prompt: 'Update src/app.ts to say "hello world" instead of "world".',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
const editIndex = toolLogs.findIndex(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('src/app.ts'),
);
expect(editIndex, 'Agent should have edited src/app.ts').toBeGreaterThan(
-1,
);
// Check for git diff or read_file immediately after the edit
const reviewCall = toolLogs[editIndex + 1];
expect(
reviewCall,
'Agent should have made a call after the edit',
).toBeDefined();
const isReview =
(reviewCall.toolRequest.name === 'run_shell_command' &&
reviewCall.toolRequest.args.includes('git diff')) ||
(reviewCall.toolRequest.name === 'read_file' &&
reviewCall.toolRequest.args.includes('src/app.ts'));
expect(
isReview,
'Agent should have run git diff or read_file immediately after the edit to review its work',
).toBe(true);
},
});
});
+64
View File
@@ -0,0 +1,64 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Smart Log Navigation', () => {
/**
* Verifies that the agent uses tail or ranged read at the end of a massive log file.
*/
evalTest('USUALLY_PASSES', {
name: 'should use smart log navigation for large log files',
files: {
'build.log': (() => {
const lines = [];
for (let i = 0; i < 2000; i++) {
lines.push(`Log line ${i}: All good so far...`);
}
lines.push(
'ERROR: The build failed at the very end because of a syntax error in main.ts',
);
return lines.join('\n');
})(),
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
}),
},
prompt:
'The build failed and logs are in build.log. Find the error at the end of the file and report it.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// Check if it used tail or read_file with an offset/limit targeting the end
const readCalls = toolLogs.filter(
(log) =>
(log.toolRequest.name === 'run_shell_command' &&
(log.toolRequest.args.includes('tail') ||
log.toolRequest.args.includes('grep'))) ||
log.toolRequest.name === 'read_file',
);
const usedSmartNavigation = readCalls.some((log) => {
if (log.toolRequest.name === 'run_shell_command') {
const cmd = log.toolRequest.args.toLowerCase();
return cmd.includes('tail') || cmd.includes('grep error');
}
if (log.toolRequest.name === 'read_file') {
const args = JSON.parse(log.toolRequest.args);
return args.offset !== undefined && args.offset >= 1000;
}
return false;
});
expect(
usedSmartNavigation,
'Agent should have used tail, grep, or a ranged read at the end of the large log file',
).toBe(true);
},
});
});
@@ -0,0 +1,66 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Test Persistence and Locality', () => {
/**
* Verifies that the agent integration-tests a bug by amending an existing test file.
*/
evalTest('USUALLY_PASSES', {
name: 'should reproduce a bug and amend existing test file instead of creating a new one',
files: {
'src/math.ts':
'export const add = (a: number, b: number) => a - b; // BUG',
'src/math.test.ts': `
import { expect, test } from 'vitest';
import { add } from './math';
test('add adds two numbers', () => {
expect(add(2, 3)).toBe(5);
});
`,
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
scripts: {
test: 'vitest run',
},
}),
},
prompt:
'Fix the bug in src/math.ts. Make sure to keep the test case for future regressions.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// Check if it created ANY new .test.ts file
const createdNewTestFile = toolLogs.some(
(log) =>
log.toolRequest.name === 'write_file' &&
log.toolRequest.args.includes('.test.ts') &&
!log.toolRequest.args.includes('src/math.test.ts'),
);
expect(
createdNewTestFile,
'Agent should NOT have created a new test file',
).toBe(false);
// Check if it amended the existing math.test.ts
const amendedExistingTest = toolLogs.some(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('src/math.test.ts'),
);
expect(
amendedExistingTest,
'Agent should have amended the existing src/math.test.ts',
).toBe(true);
},
});
});
+64
View File
@@ -0,0 +1,64 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Usage Discovery', () => {
/**
* Verifies that the agent mandates usage discovery (searching for call sites)
* before modifying an exported symbol.
*/
evalTest('USUALLY_PASSES', {
name: 'should search for usages before renaming an exported function',
files: {
'src/math.ts': 'export const add = (a: number, b: number) => a + b;',
'src/app.ts': 'import { add } from "./math"; console.log(add(1, 2));',
'package.json': JSON.stringify({
name: 'test-project',
type: 'module',
}),
},
prompt:
'Rename the "add" function in src/math.ts to "sum". Ensure the refactor is complete.',
assert: async (rig) => {
const toolLogs = rig.readToolLogs();
// 1. Usage Discovery: Check if it ran grep_search for "add"
const ranUsageDiscovery = toolLogs.some(
(log) =>
log.toolRequest.name === 'grep_search' &&
log.toolRequest.args.includes('add'),
);
expect(
ranUsageDiscovery,
'Agent should have searched for "add" to find usages before renaming',
).toBe(true);
// 2. Complete Refactor: Check if it edited both files
const editedMath = toolLogs.some(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('src/math.ts') &&
log.toolRequest.args.includes('sum'),
);
const editedApp = toolLogs.some(
(log) =>
(log.toolRequest.name === 'replace' ||
log.toolRequest.name === 'write_file') &&
log.toolRequest.args.includes('src/app.ts') &&
log.toolRequest.args.includes('sum'),
);
expect(editedMath, 'Agent should have edited src/math.ts').toBe(true);
expect(
editedApp,
'Agent should have edited src/app.ts to update the usage',
).toBe(true);
},
});
});