mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-19 02:20:42 -07:00
137 lines
4.2 KiB
TypeScript
137 lines
4.2 KiB
TypeScript
|
|
/**
|
||
|
|
* @license
|
||
|
|
* Copyright 2026 Google LLC
|
||
|
|
* SPDX-License-Identifier: Apache-2.0
|
||
|
|
*/
|
||
|
|
|
||
|
|
import { describe, expect } from 'vitest';
|
||
|
|
import { evalTest } from './test-helper.js';
|
||
|
|
|
||
|
|
describe('Self-Validation Workflow', () => {
|
||
|
|
/**
 * Verifies the "Parallel Discovery" behavior: while exploring the project the
 * agent should read `package.json` and also issue at least one search or
 * listing tool call.
 *
 * Both checks run over the complete tool log returned by `rig.readToolLogs()`;
 * they are deliberately not restricted to the first turn.
 */
evalTest('USUALLY_PASSES', {
  name: 'should perform parallel discovery in the first turn',
  files: {
    'package.json': JSON.stringify({
      name: 'test-project',
      scripts: { test: 'vitest' },
    }),
    'src/index.ts': 'export const main = () => console.log("hello");',
  },
  prompt: 'Explore the project and find where the main function is defined.',
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();

    // The turn-1 restriction (prompt_id ending in '#0') is intentionally not
    // applied: the run might take a moment to bootstrap, so a matching call
    // anywhere in the log is accepted.
    const hasReadPackageJson = toolLogs.some(
      (log) =>
        log.toolRequest.name === 'read_file' &&
        log.toolRequest.args.includes('package.json'),
    );

    // Any one of the search/listing tools counts as exploration.
    const hasSearch = toolLogs.some(
      (log) =>
        log.toolRequest.name === 'grep_search' ||
        log.toolRequest.name === 'list_directory' ||
        log.toolRequest.name === 'glob',
    );

    expect(
      hasReadPackageJson,
      'Should read package.json to discover scripts',
    ).toBe(true);
    expect(
      hasSearch,
      'Should perform search/listing to explore the project',
    ).toBe(true);
  },
});
|
||
|
|
|
||
|
|
/**
 * Verifies "Negative Verification": the agent must run the reproduction
 * before it edits the buggy source.
 *
 * Only ordering is asserted: at least one `run_shell_command` whose arguments
 * mention `test` must appear in the tool log ahead of the first `replace`
 * call that targets `src/utils.ts`.
 */
evalTest('USUALLY_PASSES', {
  name: 'should confirm negative verification (repro fails) before fix',
  files: {
    'src/utils.ts':
      'export const square = (n: number) => n + n; // BUG: should be n * n',
    'package.json': JSON.stringify({
      name: 'test-project',
      scripts: { test: 'vitest run' },
      devDependencies: { vitest: '^1.0.0' },
    }),
  },
  prompt:
    'Fix the square function in src/utils.ts. Create a test to reproduce it first.',
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();

    // Position of the first `replace` edit on the buggy file, or -1 if the
    // log contains no such call.
    const editIndex = toolLogs.findIndex(
      (log) =>
        log.toolRequest.name === 'replace' &&
        log.toolRequest.args.includes('src/utils.ts'),
    );

    // `slice(0, -1)` would silently drop the last log entry, so the
    // "no edit found" case is handled explicitly: with no recorded fix,
    // every logged call counts as happening before it.
    const logsBeforeFix =
      editIndex === -1 ? toolLogs : toolLogs.slice(0, editIndex);

    const testRunsBeforeFix = logsBeforeFix.filter(
      (log) =>
        log.toolRequest.name === 'run_shell_command' &&
        log.toolRequest.args.includes('test'),
    );

    expect(
      testRunsBeforeFix.length,
      'Should run tests at least once before fix',
    ).toBeGreaterThanOrEqual(1);

    // Whether the agent acknowledged the failure in its thoughts is not
    // checked here; the mandate is to "run this reproduction script and
    // confirm it fails", and only the explicit test run is asserted.
  },
});
|
||
|
|
|
||
|
|
/**
 * Verifies "Tail-First Navigation" for large logs: given a long log file
 * whose only interesting line is the last one, the agent should reach for a
 * shell command mentioning `tail` or `grep`, and should not issue a
 * `read_file` on the log without `limit` appearing in its arguments.
 */
evalTest('USUALLY_PASSES', {
  name: 'should use tail-first navigation for large logs',
  files: {
    'src/bug.ts': 'console.log("error");',
    'large_log.log':
      'Line 1\n'.repeat(1000) +
      'CRITICAL ERROR: specific failure at the end\n',
  },
  prompt:
    'There is a failure at the end of large_log.log. Find it and explain the cause.',
  assert: async (rig) => {
    const requests = rig.readToolLogs().map((entry) => entry.toolRequest);

    // Shell invocations whose arguments mention either `tail` or `grep`.
    const shellTailOrGrep = requests
      .filter((req) => req.name === 'run_shell_command')
      .some((req) => req.args.includes('tail') || req.args.includes('grep'));

    // `read_file` calls on the log whose arguments do not mention `limit`.
    const unboundedLogRead = requests
      .filter((req) => req.name === 'read_file')
      .some(
        (req) =>
          req.args.includes('large_log.log') && !req.args.includes('limit'),
      );

    expect(shellTailOrGrep, 'Should use tail or grep for large logs').toBe(
      true,
    );
    expect(
      unboundedLogRead,
      'Should not read the entire large log file',
    ).toBe(false);
  },
});
|
||
|
|
});
|