mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-10 22:21:22 -07:00
This commit upgrades the agent with a robust self-validation workflow while ensuring 100% semantic and verbatim coverage of the original system prompt. By moving to an additive model, we preserve the original reasoning anchors (lead-ins, heuristics, and formatting) while injecting critical autonomous engineering mandates.

Self-Validation Workflow Injections:
- Research Phase: Parallel Discovery (combining manifests/logic) and High-Signal Grep.
- Bug Fixing: Negative Verification (confirming repro failure) and Coverage Expansion.
- Implementation: Transactional Edits (logical batching of module changes).
- Validation Loop: Tiered Validation (Fixers -> Fast-Path -> Related Tests) and Smart Log Navigation.

Technical Verification:
- Verbatim restoration verified against 66 core tests and 14 snapshots.
- New behavioral eval suite passed (evals/self_validation_workflow.eval.ts).
- Full 'npm run preflight' validation successful.
137 lines
4.2 KiB
TypeScript
137 lines
4.2 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2026 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
import { describe, expect } from 'vitest';
|
|
import { evalTest } from './test-helper.js';
|
|
|
|
describe('Self-Validation Workflow', () => {
|
|
/**
 * Verifies "Parallel Discovery": while exploring the project, the agent is
 * expected to read the manifest (package.json) and to issue at least one
 * search/listing tool call.
 *
 * The check is intentionally relaxed: it inspects every recorded tool call
 * rather than only the first turn, because the agent might take a moment to
 * bootstrap.
 */
evalTest('USUALLY_PASSES', {
  name: 'should perform parallel discovery in the first turn',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      scripts: { test: 'vitest' },
    }),
    'src/index.ts': 'export const main = () => console.log("hello");',
  },
  prompt: 'Explore the project and find where the main function is defined.',
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();

    // True when any read_file call mentions package.json in its arguments.
    const hasReadPackageJson = toolLogs.some(
      (log) =>
        log.toolRequest.name === 'read_file' &&
        log.toolRequest.args.includes('package.json'),
    );

    // Any one of the search/listing tools counts as exploring the project.
    const hasSearch = toolLogs.some(
      (log) =>
        log.toolRequest.name === 'grep_search' ||
        log.toolRequest.name === 'list_directory' ||
        log.toolRequest.name === 'glob',
    );

    expect(
      hasReadPackageJson,
      'Should read package.json to discover scripts',
    ).toBe(true);
    expect(
      hasSearch,
      'Should perform search/listing to explore the project',
    ).toBe(true);
  },
});
|
|
|
|
/**
 * Verifies "Negative Verification": before fixing the bug, the agent must run
 * the reproduction test.
 *
 * The fixture ships a `square` implementation that adds instead of
 * multiplying. The assertion locates the first `replace` call that mentions
 * src/utils.ts (treated as the fix) and requires at least one
 * `run_shell_command` call mentioning "test" to appear before it.
 */
evalTest('USUALLY_PASSES', {
  name: 'should confirm negative verification (repro fails) before fix',
  files: {
    'src/utils.ts':
      'export const square = (n: number) => n + n; // BUG: should be n * n',
    'package.json': JSON.stringify({
      name: 'test-project',
      scripts: { test: 'vitest run' },
      devDependencies: { vitest: '^1.0.0' },
    }),
  },
  prompt:
    'Fix the square function in src/utils.ts. Create a test to reproduce it first.',
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();

    // Index of the first `replace` call that touches src/utils.ts.
    const editIndex = toolLogs.findIndex(
      (log) =>
        log.toolRequest.name === 'replace' &&
        log.toolRequest.args.includes('src/utils.ts'),
    );

    // findIndex yields -1 when no such edit was recorded. Without this guard,
    // slice(0, -1) below would mean "every call except the last one" and the
    // ordering check could pass without any fix having been located.
    expect(
      editIndex,
      'Should edit src/utils.ts with the replace tool',
    ).toBeGreaterThanOrEqual(0);

    // Shell commands mentioning "test" that were issued before the fix.
    const testRunsBeforeFix = toolLogs
      .slice(0, editIndex)
      .filter(
        (log) =>
          log.toolRequest.name === 'run_shell_command' &&
          log.toolRequest.args.includes('test'),
      );

    expect(
      testRunsBeforeFix.length,
      'Should run tests at least once before fix',
    ).toBeGreaterThanOrEqual(1);

    // The mandate is to "run this reproduction script and confirm it fails".
    // NOTE: only the presence of a test run before the fix is asserted here;
    // whether the agent acknowledged the failure is not checked.
  },
});
|
|
|
|
/**
 * Verifies "Tail-First Navigation" for large logs.
 *
 * The fixture log holds 1000 filler lines followed by a single error line.
 * The agent passes when it issued a shell command mentioning `tail` or
 * `grep`, and never issued a `read_file` on the log whose arguments lack
 * "limit".
 */
evalTest('USUALLY_PASSES', {
  name: 'should use tail-first navigation for large logs',
  files: {
    'src/bug.ts': 'console.log("error");',
    'large_log.log':
      'Line 1\n'.repeat(1000) +
      'CRITICAL ERROR: specific failure at the end\n',
  },
  prompt:
    'There is a failure at the end of large_log.log. Find it and explain the cause.',
  assert: async (rig) => {
    const recordedCalls = rig.readToolLogs();

    // A shell invocation whose arguments mention tail or grep.
    const usedTailOrGrep = recordedCalls.some(({ toolRequest }) => {
      if (toolRequest.name !== 'run_shell_command') {
        return false;
      }
      const { args } = toolRequest;
      return args.includes('tail') || args.includes('grep');
    });

    // A read_file on the log whose arguments do not mention "limit".
    const readWholeFile = recordedCalls.some(({ toolRequest }) => {
      if (toolRequest.name !== 'read_file') {
        return false;
      }
      if (!toolRequest.args.includes('large_log.log')) {
        return false;
      }
      return !toolRequest.args.includes('limit');
    });

    expect(usedTailOrGrep, 'Should use tail or grep for large logs').toBe(
      true,
    );
    expect(readWholeFile, 'Should not read the entire large log file').toBe(
      false,
    );
  },
});
|
|
});
|