mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-24 04:52:43 -07:00
89 lines
2.8 KiB
TypeScript
89 lines
2.8 KiB
TypeScript
|
|
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
|
||
|
|
|
||
|
|
import { describe, expect } from 'vitest';
|
||
|
|
import { evalTest } from './test-helper.js';
|
||
|
|
|
||
|
|
describe('Incremental Validation', () => {
  /**
   * Minimal shape of a tool log entry as consumed by this evaluation.
   *
   * NOTE(review): `args` is treated as a string throughout (substring and
   * lower-case matching); presumably it holds the serialized tool arguments —
   * confirm against the test rig's `readToolLogs()` return type.
   */
  type ToolLogEntry = { toolRequest: { name: string; args: string } };

  /** Tool names that count as a file modification. */
  const FILE_EDIT_TOOLS = ['replace', 'write_file'];

  /**
   * Substrings that mark a shell invocation as a build/test style check.
   * Matching is a deliberately loose, case-insensitive substring heuristic.
   */
  const VALIDATION_KEYWORDS = ['build', 'test', 'npm run', 'tsc'];

  /**
   * Builds a predicate matching log entries that edit the given file.
   *
   * @param filePath - Path fragment that must appear in the tool arguments.
   * @returns A predicate that is true for `replace`/`write_file` calls whose
   *   arguments mention `filePath`.
   */
  const isEditOf =
    (filePath: string) =>
    (log: ToolLogEntry): boolean =>
      FILE_EDIT_TOOLS.includes(log.toolRequest.name) &&
      log.toolRequest.args.includes(filePath);

  /**
   * Tells whether a log entry is a shell command that looks like a
   * build/test validation step.
   *
   * @param log - Tool log entry to inspect.
   * @returns True when the entry is a `run_shell_command` call whose
   *   lower-cased arguments contain one of the validation keywords.
   */
  const isValidationCommand = (log: ToolLogEntry): boolean => {
    if (log.toolRequest.name !== 'run_shell_command') return false;
    const cmd = log.toolRequest.args.toLowerCase();
    return VALIDATION_KEYWORDS.some((keyword) => cmd.includes(keyword));
  };

  /**
   * This evaluation verifies that the agent adheres to the "Incremental Validation" mandate
   * by performing build or test checks between distinct, significant file changes.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should perform incremental validation between distinct file changes',
    files: {
      'src/a.ts': 'export const valA = 1 - 2; // BUG: should be 1 + 2',
      'src/b.ts': 'export const valB = 0;',
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: {
          test: 'echo "running tests..."',
          build: 'echo "building..."',
        },
      }),
    },
    prompt:
      '1. Fix the bug in src/a.ts (change - to +). 2. After that is done, update src/b.ts to export valB = 42. Ensure the project is buildable and tested at each step.',
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();

      // Locate the first edit of each file; -1 means the file was never edited.
      const editAIndex = toolLogs.findIndex(isEditOf('src/a.ts'));
      const editBIndex = toolLogs.findIndex(isEditOf('src/b.ts'));

      expect(editAIndex, 'Agent should have edited src/a.ts').toBeGreaterThan(
        -1,
      );
      // Requiring B's index to exceed A's checks both that B was edited
      // (index is not -1) and that its first edit came after A's first edit.
      expect(editBIndex, 'Agent should have edited src/b.ts').toBeGreaterThan(
        editAIndex,
      );

      // A validation command must appear strictly between the two edits.
      const validationBetween = toolLogs
        .slice(editAIndex + 1, editBIndex)
        .some(isValidationCommand);

      expect(
        validationBetween,
        'Expected a build/test command between two distinct file edits to ensure incremental stability',
      ).toBe(true);

      // A validation command must also follow the edit of src/b.ts to
      // confirm the final state.
      const validationAfter = toolLogs
        .slice(editBIndex + 1)
        .some(isValidationCommand);

      expect(
        validationAfter,
        'Expected a build/test command after the final file edit',
      ).toBe(true);
    },
  });
});
|