Files
gemini-cli/evals/self_validation_workflow.eval.ts
Alisa Novikova c3215aed93 feat(core): implement self-validation workflow with prompt-verbatim restoration
This commit upgrades the agent with a robust self-validation workflow while
ensuring 100% semantic and verbatim coverage of the original system prompt.
By moving to an additive model, we preserve the original reasoning anchors
(lead-ins, heuristics, and formatting) while injecting critical autonomous
engineering mandates.

Self-Validation Workflow Injections:
- Research Phase: Parallel Discovery (combining manifests/logic) and High-Signal Grep.
- Bug Fixing: Negative Verification (confirming repro failure) and Coverage Expansion.
- Implementation: Transactional Edits (logical batching of module changes).
- Validation Loop: Tiered Validation (Fixers -> Fast-Path -> Related Tests) and Smart Log Navigation.

Technical Verification:
- Verbatim restoration verified against 66 core tests and 14 snapshots.
- New behavioral eval suite passed (evals/self_validation_workflow.eval.ts).
- Full 'npm run preflight' validation successful.
2026-03-03 00:50:59 -08:00

137 lines
4.2 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
describe('Self-Validation Workflow', () => {
/**
 * Verifies that the agent performs "Parallel Discovery": while exploring it
 * should both read the project manifest and search/list the workspace.
 *
 * The assertions look at every recorded tool call rather than only the first
 * turn, so they check that both kinds of call happened, not when they happened.
 */
evalTest('USUALLY_PASSES', {
  name: 'should perform parallel discovery in the first turn',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      scripts: { test: 'vitest' },
    }),
    'src/index.ts': 'export const main = () => console.log("hello");',
  },
  prompt: 'Explore the project and find where the main function is defined.',
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();

    // Manifest discovery: a read_file call whose args mention package.json.
    const hasReadPackageJson = toolLogs.some(
      (log) =>
        log.toolRequest.name === 'read_file' &&
        log.toolRequest.args.includes('package.json'),
    );

    // Code discovery: any of the search/listing tools counts.
    const hasSearch = toolLogs.some(
      (log) =>
        log.toolRequest.name === 'grep_search' ||
        log.toolRequest.name === 'list_directory' ||
        log.toolRequest.name === 'glob',
    );

    // The turn-1 requirement is intentionally relaxed (the agent might take a
    // moment to bootstrap), so no per-turn filtering is applied here.
    expect(
      hasReadPackageJson,
      'Should read package.json to discover scripts',
    ).toBe(true);
    expect(
      hasSearch,
      'Should perform search/listing to explore the project',
    ).toBe(true);
  },
});
/**
 * Verifies "Negative Verification": the agent must run the reproduction
 * before it edits the buggy source file.
 *
 * The check is ordering-based: at least one shell command whose args mention
 * "test" must appear in the tool log before the first `replace` call that
 * targets src/utils.ts.
 */
evalTest('USUALLY_PASSES', {
  name: 'should confirm negative verification (repro fails) before fix',
  files: {
    'src/utils.ts':
      'export const square = (n: number) => n + n; // BUG: should be n * n',
    'package.json': JSON.stringify({
      name: 'test-project',
      scripts: { test: 'vitest run' },
      devDependencies: { vitest: '^1.0.0' },
    }),
  },
  prompt:
    'Fix the square function in src/utils.ts. Create a test to reproduce it first.',
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();

    // Position of the first edit applied to the buggy file.
    const editIndex = toolLogs.findIndex(
      (log) =>
        log.toolRequest.name === 'replace' &&
        log.toolRequest.args.includes('src/utils.ts'),
    );

    // findIndex yields -1 when no such edit exists. Without this guard,
    // slice(0, -1) below would silently mean "everything but the last entry"
    // and the ordering check would no longer relate to the fix at all.
    expect(
      editIndex,
      'Should edit src/utils.ts with the replace tool',
    ).toBeGreaterThanOrEqual(0);

    // Shell commands mentioning "test" that were issued before the fix.
    const testRunsBeforeFix = toolLogs
      .slice(0, editIndex)
      .filter(
        (log) =>
          log.toolRequest.name === 'run_shell_command' &&
          log.toolRequest.args.includes('test'),
      );

    expect(
      testRunsBeforeFix.length,
      'Should run tests at least once before fix',
    ).toBeGreaterThanOrEqual(1);

    // NOTE: this only proves a test command ran before the edit. The mandate is
    // to "run this reproduction script and confirm it fails"; whether the agent
    // acknowledged the failure (e.g. in its thoughts) is not asserted here.
  },
});
/**
 * Verifies "Tail-First Navigation": when pointed at a large log file, the
 * agent should reach for tail/grep in the shell and must not issue a
 * read_file on the log without a limit.
 */
evalTest('USUALLY_PASSES', {
  name: 'should use tail-first navigation for large logs',
  files: {
    'src/bug.ts': 'console.log("error");',
    // 1000 filler lines followed by the single line the agent has to find.
    'large_log.log':
      'Line 1\n'.repeat(1000) +
      'CRITICAL ERROR: specific failure at the end\n',
  },
  prompt:
    'There is a failure at the end of large_log.log. Find it and explain the cause.',
  assert: async (rig) => {
    const recordedCalls = rig.readToolLogs();

    // True when some shell command mentions tail or grep in its args.
    const sawTailOrGrep = recordedCalls.some(({ toolRequest }) => {
      if (toolRequest.name !== 'run_shell_command') {
        return false;
      }
      return (
        toolRequest.args.includes('tail') || toolRequest.args.includes('grep')
      );
    });

    // True when some read_file call targets the log and carries no "limit".
    const sawUnboundedRead = recordedCalls.some(({ toolRequest }) => {
      if (toolRequest.name !== 'read_file') {
        return false;
      }
      if (!toolRequest.args.includes('large_log.log')) {
        return false;
      }
      return !toolRequest.args.includes('limit');
    });

    expect(sawTailOrGrep, 'Should use tail or grep for large logs').toBe(true);
    expect(sawUnboundedRead, 'Should not read the entire large log file').toBe(
      false,
    );
  },
});
});