Files
gemini-cli/evals/validation_fidelity.eval.ts

86 lines
2.5 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
/**
 * Extracts the `command` string from the JSON-encoded arguments of a
 * `run_shell_command` tool request.
 *
 * Malformed JSON, a non-object payload, or a missing/non-string `command`
 * yields an empty string so that a single odd log entry is simply treated as
 * "not a verification command" instead of aborting the whole assertion with
 * an unrelated SyntaxError/TypeError.
 *
 * @param rawArgs - The serialized tool arguments as recorded in the tool log.
 * @returns The shell command, or `''` when none can be recovered.
 */
function extractShellCommand(rawArgs: string): string {
  try {
    const parsed: unknown = JSON.parse(rawArgs);
    if (typeof parsed !== 'object' || parsed === null) {
      return '';
    }
    const command = (parsed as { command?: unknown }).command;
    return typeof command === 'string' ? command : '';
  } catch {
    return '';
  }
}

/**
 * Decides whether a shell command counts as a build or type-check run.
 *
 * Matching is case-insensitive. `tsc` must appear as a whole word: a plain
 * substring test would also accept commands that merely mention
 * `tsconfig.json` (e.g. `cat tsconfig.json`), which do not verify anything.
 *
 * @param command - The shell command line to classify.
 * @returns `true` when the command looks like a build/type-check invocation.
 */
function isBuildOrTypeCheckCommand(command: string): boolean {
  const cmd = command.toLowerCase();
  return (
    cmd.includes('npm run build') ||
    /\btsc\b/.test(cmd) ||
    cmd.includes('typecheck') ||
    cmd.includes('npm run verify')
  );
}

describe('validation_fidelity', () => {
  evalTest('ALWAYS_PASSES', {
    name: 'should perform exhaustive validation autonomously when guided by system instructions',
    // Minimal TypeScript project: one interface, one consumer, one unit test,
    // plus `test` and `build` scripts the agent can use to verify its work.
    files: {
      'src/types.ts': `
export interface LogEntry {
level: 'info' | 'warn' | 'error';
message: string;
}
`,
      'src/logger.ts': `
import { LogEntry } from './types.js';
export function formatLog(entry: LogEntry): string {
return \`[\${entry.level.toUpperCase()}] \${entry.message}\`;
}
`,
      'src/logger.test.ts': `
import { expect, test } from 'vitest';
import { formatLog } from './logger.js';
import { LogEntry } from './types.js';
test('formats log correctly', () => {
const entry: LogEntry = { level: 'info', message: 'test message' };
expect(formatLog(entry)).toBe('[INFO] test message');
});
`,
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: {
          test: 'vitest run',
          build: 'tsc --noEmit',
        },
      }),
      'tsconfig.json': JSON.stringify({
        compilerOptions: {
          target: 'ESNext',
          module: 'ESNext',
          moduleResolution: 'node',
          strict: true,
          esModuleInterop: true,
          skipLibCheck: true,
          forceConsistentCasingInFileNames: true,
        },
      }),
    },
    prompt:
      "Refactor the 'LogEntry' interface in 'src/types.ts' to rename the 'message' field to 'payload'.",
    timeout: 600000,
    assert: async (rig) => {
      // The goal of this eval is to see whether the agent, after renaming the
      // field and updating its usages, also runs 'npm run build' / 'tsc' (or an
      // equivalent type-check) on its own to confirm project-wide integrity.
      const toolLogs = rig.readToolLogs();
      const shellCalls = toolLogs.filter(
        (log) => log.toolRequest.name === 'run_shell_command',
      );
      const hasBuildOrTsc = shellCalls.some((log) =>
        isBuildOrTypeCheckCommand(extractShellCommand(log.toolRequest.args)),
      );
      expect(
        hasBuildOrTsc,
        'Expected the agent to autonomously run a build or type-check command to verify the refactoring',
      ).toBe(true);
    },
  });
});