mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-01 23:44:15 -07:00
feat(core): implement self-validation workflow with prompt-verbatim restoration
This commit upgrades the agent with a robust self-validation workflow while ensuring 100% semantic and verbatim coverage of the original system prompt. By moving to an additive model, we preserve the original reasoning anchors (lead-ins, heuristics, and formatting) while injecting critical autonomous engineering mandates. Self-Validation Workflow Injections: - Research Phase: Parallel Discovery (combining manifests/logic) and High-Signal Grep. - Bug Fixing: Negative Verification (confirming repro failure) and Coverage Expansion. - Implementation: Transactional Edits (logical batching of module changes). - Validation Loop: Tiered Validation (Fixers -> Fast-Path -> Related Tests) and Smart Log Navigation. Technical Verification: - Verbatim restoration verified against 66 core tests and 14 snapshots. - New behavioral eval suite passed (evals/self_validation_workflow.eval.ts). - Full 'npm run preflight' validation successful.
This commit is contained in:
@@ -0,0 +1,136 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('Self-Validation Workflow', () => {
|
||||
/**
 * Verifies that the agent performs "Parallel Discovery": during the run it
 * must both read `package.json` and issue at least one search/listing tool
 * call (`grep_search`, `list_directory` or `glob`).
 *
 * The check intentionally looks at every recorded tool call rather than only
 * the first turn; see the note inside the assertion.
 */
evalTest('USUALLY_PASSES', {
  name: 'should perform parallel discovery in the first turn',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      scripts: { test: 'vitest' },
    }),
    'src/index.ts': 'export const main = () => console.log("hello");',
  },
  prompt: 'Explore the project and find where the main function is defined.',
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();

    // The turn-1 restriction is relaxed on purpose: the agent might take a
    // moment to bootstrap, so a matching tool call in any turn is accepted.
    const hasReadPackageJson = toolLogs.some(
      (log) =>
        log.toolRequest.name === 'read_file' &&
        log.toolRequest.args.includes('package.json'),
    );
    const hasSearch = toolLogs.some(
      (log) =>
        log.toolRequest.name === 'grep_search' ||
        log.toolRequest.name === 'list_directory' ||
        log.toolRequest.name === 'glob',
    );

    expect(
      hasReadPackageJson,
      'Should read package.json to discover scripts',
    ).toBe(true);
    expect(
      hasSearch,
      'Should perform search/listing to explore the project',
    ).toBe(true);
  },
});
|
||||
|
||||
/**
 * Verifies "Negative Verification": Agent must run the repro and confirm failure.
 *
 * Concretely, the assertion locates the first `replace` call that targets
 * `src/utils.ts` and requires at least one `run_shell_command` call mentioning
 * "test" to have been recorded before it.
 */
evalTest('USUALLY_PASSES', {
  name: 'should confirm negative verification (repro fails) before fix',
  files: {
    'src/utils.ts':
      'export const square = (n: number) => n + n; // BUG: should be n * n',
    'package.json': JSON.stringify({
      name: 'test-project',
      scripts: { test: 'vitest run' },
      devDependencies: { vitest: '^1.0.0' },
    }),
  },
  prompt:
    'Fix the square function in src/utils.ts. Create a test to reproduce it first.',
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();
    const editIndex = toolLogs.findIndex(
      (log) =>
        log.toolRequest.name === 'replace' &&
        log.toolRequest.args.includes('src/utils.ts'),
    );

    // findIndex yields -1 when no such edit was recorded. Without this guard,
    // slice(0, -1) below would mean "everything except the last call" and the
    // "before fix" check would no longer be anchored to the actual fix.
    expect(
      editIndex,
      'Should fix src/utils.ts via a replace edit',
    ).toBeGreaterThanOrEqual(0);

    const testRunsBeforeFix = toolLogs
      .slice(0, editIndex)
      .filter(
        (log) =>
          log.toolRequest.name === 'run_shell_command' &&
          log.toolRequest.args.includes('test'),
      );

    expect(
      testRunsBeforeFix.length,
      'Should run tests at least once before fix',
    ).toBeGreaterThanOrEqual(1);

    // NOTE: only the ordering (test run before the edit) is asserted here.
    // The mandate is to "run this reproduction script and confirm it fails";
    // whether the agent acknowledged the failure is not checked.
  },
});
|
||||
|
||||
/**
 * Verifies "Tail-First Navigation" for large logs.
 *
 * Passes when some `run_shell_command` call mentions `tail` or `grep`, and no
 * `read_file` call on `large_log.log` was made without a `limit` in its args.
 */
evalTest('USUALLY_PASSES', {
  name: 'should use tail-first navigation for large logs',
  files: {
    'src/bug.ts': 'console.log("error");',
    'large_log.log':
      'Line 1\n'.repeat(1000) +
      'CRITICAL ERROR: specific failure at the end\n',
  },
  prompt:
    'There is a failure at the end of large_log.log. Find it and explain the cause.',
  assert: async (rig) => {
    let usedTailOrGrep = false;
    let readWholeFile = false;

    // Single pass over the recorded tool calls, classifying each one.
    for (const entry of rig.readToolLogs()) {
      const request = entry.toolRequest;

      if (request.name === 'run_shell_command') {
        if (request.args.includes('tail') || request.args.includes('grep')) {
          usedTailOrGrep = true;
        }
      } else if (request.name === 'read_file') {
        // A read of the log that carries no `limit` counts as a full read.
        if (
          request.args.includes('large_log.log') &&
          !request.args.includes('limit')
        ) {
          readWholeFile = true;
        }
      }
    }

    expect(usedTailOrGrep, 'Should use tail or grep for large logs').toBe(
      true,
    );
    expect(readWholeFile, 'Should not read the entire large log file').toBe(
      false,
    );
  },
});
|
||||
});
|
||||
Reference in New Issue
Block a user