Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)

This commit is contained in:
Christian Gunderman
2026-04-08 23:57:26 +00:00
committed by GitHub
parent bc3ed61adb
commit f1bb2af6de
32 changed files with 475 additions and 133 deletions
+18 -6
View File
@@ -5,17 +5,21 @@
*/
import { describe, expect } from 'vitest';
import { appEvalTest, AppEvalCase } from './app-test-helper.js';
import { EvalPolicy } from './test-helper.js';
import { ApprovalMode, isRecord } from '@google/gemini-cli-core';
import { appEvalTest, type AppEvalCase } from './app-test-helper.js';
import { type EvalPolicy } from './test-helper.js';
function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const existingGeneral = evalCase.configOverrides?.['general'];
const generalBase = isRecord(existingGeneral) ? existingGeneral : {};
return appEvalTest(policy, {
...evalCase,
configOverrides: {
...evalCase.configOverrides,
approvalMode: ApprovalMode.DEFAULT,
general: {
...evalCase.configOverrides?.general,
approvalMode: 'default',
...generalBase,
enableAutoUpdate: false,
enableAutoUpdateNotification: false,
},
@@ -28,6 +32,8 @@ function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
describe('ask_user', () => {
askUserEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Agent uses AskUser tool to present multiple choice options',
prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`,
setup: async (rig) => {
@@ -43,6 +49,8 @@ describe('ask_user', () => {
});
askUserEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Agent uses AskUser tool to clarify ambiguous requirements',
files: {
'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }),
@@ -61,6 +69,8 @@ describe('ask_user', () => {
});
askUserEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Agent uses AskUser tool before performing significant ambiguous rework',
files: {
'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";',
@@ -82,8 +92,8 @@ describe('ask_user', () => {
]);
expect(confirmation, 'Expected a tool call confirmation').toBeDefined();
if (confirmation?.name === 'enter_plan_mode') {
rig.acceptConfirmation('enter_plan_mode');
if (confirmation?.toolName === 'enter_plan_mode') {
await rig.resolveTool('enter_plan_mode');
confirmation = await rig.waitForPendingConfirmation('ask_user');
}
@@ -101,6 +111,8 @@ describe('ask_user', () => {
// updates to clarify that shell command confirmation is handled by the UI.
// See fix: https://github.com/google-gemini/gemini-cli/pull/20504
askUserEvalTest('USUALLY_PASSES', {
suiteName: 'default',
suiteType: 'behavioral',
name: 'Agent does NOT use AskUser to confirm shell commands',
files: {
'package.json': JSON.stringify({