mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-22 02:54:31 -07:00
Generalize evals infra to support more types of evals, organization and queuing of named suites (#24941)
This commit is contained in:
committed by
GitHub
parent
bc3ed61adb
commit
f1bb2af6de
+18
-6
@@ -5,17 +5,21 @@
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { appEvalTest, AppEvalCase } from './app-test-helper.js';
|
||||
import { EvalPolicy } from './test-helper.js';
|
||||
import { ApprovalMode, isRecord } from '@google/gemini-cli-core';
|
||||
import { appEvalTest, type AppEvalCase } from './app-test-helper.js';
|
||||
import { type EvalPolicy } from './test-helper.js';
|
||||
|
||||
function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
const existingGeneral = evalCase.configOverrides?.['general'];
|
||||
const generalBase = isRecord(existingGeneral) ? existingGeneral : {};
|
||||
|
||||
return appEvalTest(policy, {
|
||||
...evalCase,
|
||||
configOverrides: {
|
||||
...evalCase.configOverrides,
|
||||
approvalMode: ApprovalMode.DEFAULT,
|
||||
general: {
|
||||
...evalCase.configOverrides?.general,
|
||||
approvalMode: 'default',
|
||||
...generalBase,
|
||||
enableAutoUpdate: false,
|
||||
enableAutoUpdateNotification: false,
|
||||
},
|
||||
@@ -28,6 +32,8 @@ function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
|
||||
describe('ask_user', () => {
|
||||
askUserEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Agent uses AskUser tool to present multiple choice options',
|
||||
prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`,
|
||||
setup: async (rig) => {
|
||||
@@ -43,6 +49,8 @@ describe('ask_user', () => {
|
||||
});
|
||||
|
||||
askUserEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Agent uses AskUser tool to clarify ambiguous requirements',
|
||||
files: {
|
||||
'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }),
|
||||
@@ -61,6 +69,8 @@ describe('ask_user', () => {
|
||||
});
|
||||
|
||||
askUserEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Agent uses AskUser tool before performing significant ambiguous rework',
|
||||
files: {
|
||||
'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";',
|
||||
@@ -82,8 +92,8 @@ describe('ask_user', () => {
|
||||
]);
|
||||
expect(confirmation, 'Expected a tool call confirmation').toBeDefined();
|
||||
|
||||
if (confirmation?.name === 'enter_plan_mode') {
|
||||
rig.acceptConfirmation('enter_plan_mode');
|
||||
if (confirmation?.toolName === 'enter_plan_mode') {
|
||||
await rig.resolveTool('enter_plan_mode');
|
||||
confirmation = await rig.waitForPendingConfirmation('ask_user');
|
||||
}
|
||||
|
||||
@@ -101,6 +111,8 @@ describe('ask_user', () => {
|
||||
// updates to clarify that shell command confirmation is handled by the UI.
|
||||
// See fix: https://github.com/google-gemini/gemini-cli/pull/20504
|
||||
askUserEvalTest('USUALLY_PASSES', {
|
||||
suiteName: 'default',
|
||||
suiteType: 'behavioral',
|
||||
name: 'Agent does NOT use AskUser to confirm shell commands',
|
||||
files: {
|
||||
'package.json': JSON.stringify({
|
||||
|
||||
Reference in New Issue
Block a user