diff --git a/evals/app-test-helper.ts b/evals/app-test-helper.ts index e9d5d695b7..1794573fe1 100644 --- a/evals/app-test-helper.ts +++ b/evals/app-test-helper.ts @@ -12,7 +12,7 @@ import { symlinkNodeModules, withEvalRetries, prepareWorkspace, - BaseEvalCase, + type BaseEvalCase, EVAL_MODEL, } from './test-helper.js'; import fs from 'node:fs'; @@ -32,6 +32,7 @@ interface EvalConfigOverrides { allowedTools?: never; /** Restricting tools via mainAgentTools in evals is forbidden. */ mainAgentTools?: never; + [key: string]: unknown; } diff --git a/evals/ask_user.eval.ts b/evals/ask_user.eval.ts index b31ef54ae5..580081108a 100644 --- a/evals/ask_user.eval.ts +++ b/evals/ask_user.eval.ts @@ -5,17 +5,21 @@ */ import { describe, expect } from 'vitest'; +import { ApprovalMode, isRecord } from '@google/gemini-cli-core'; import { appEvalTest, type AppEvalCase } from './app-test-helper.js'; import { type EvalPolicy } from './test-helper.js'; function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) { + const existingGeneral = evalCase.configOverrides?.['general']; + const generalBase = isRecord(existingGeneral) ? existingGeneral : {}; + return appEvalTest(policy, { ...evalCase, configOverrides: { ...evalCase.configOverrides, + approvalMode: ApprovalMode.DEFAULT, general: { - ...evalCase.configOverrides?.general, - approvalMode: 'default', + ...generalBase, enableAutoUpdate: false, enableAutoUpdateNotification: false, }, @@ -27,7 +31,7 @@ function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) { } describe('ask_user', () => { - askUserEvalTest('USUALLY_PASSES', { + askUserEvalTest('ALWAYS_PASSES', { suiteName: 'default', suiteType: 'behavioral', name: 'Agent uses AskUser tool to present multiple choice options', @@ -44,7 +48,7 @@ describe('ask_user', () => { }, }); - askUserEvalTest('USUALLY_PASSES', { + askUserEvalTest('ALWAYS_PASSES', { suiteName: 'default', suiteType: 'behavioral', name: 'Agent uses AskUser tool to clarify ambiguous requirements', @@ -64,7 +68,7 @@ describe('ask_user', () => { }, }); - askUserEvalTest('USUALLY_PASSES', { + askUserEvalTest('ALWAYS_PASSES', { suiteName: 'default', suiteType: 'behavioral', name: 'Agent uses AskUser tool before performing significant ambiguous rework', @@ -88,8 +92,8 @@ describe('ask_user', () => { ]); expect(confirmation, 'Expected a tool call confirmation').toBeDefined(); - if (confirmation?.name === 'enter_plan_mode') { - rig.acceptConfirmation('enter_plan_mode'); + if (confirmation?.toolName === 'enter_plan_mode') { + await rig.resolveTool('enter_plan_mode'); confirmation = await rig.waitForPendingConfirmation('ask_user'); } @@ -106,7 +110,7 @@ describe('ask_user', () => { // confirm shell commands. Fixed via prompt refinements and tool definition // updates to clarify that shell command confirmation is handled by the UI. // See fix: https://github.com/google-gemini/gemini-cli/pull/20504 - askUserEvalTest('USUALLY_PASSES', { + askUserEvalTest('ALWAYS_PASSES', { suiteName: 'default', suiteType: 'behavioral', name: 'Agent does NOT use AskUser to confirm shell commands', diff --git a/evals/frugalSearch.eval.ts b/evals/frugalSearch.eval.ts index 82438585e6..d5962b1534 100644 --- a/evals/frugalSearch.eval.ts +++ b/evals/frugalSearch.eval.ts @@ -13,18 +13,6 @@ import { evalTest } from './test-helper.js'; * This ensures the agent doesn't flood the context window with unnecessary search results. */ describe('Frugal Search', () => { - const getGrepParams = (call: any): any => { - let args = call.toolRequest.args; - if (typeof args === 'string') { - try { - args = JSON.parse(args); - } catch (e) { - // Ignore parse errors - } - } - return args; - }; - /** * Ensure that the agent makes use of either grep or ranged reads in fulfilling this task. * The task is specifically phrased to not evoke "view" or "search" specifically because diff --git a/evals/hierarchical_memory.eval.ts b/evals/hierarchical_memory.eval.ts index b7b58c79a1..7b673af6d6 100644 --- a/evals/hierarchical_memory.eval.ts +++ b/evals/hierarchical_memory.eval.ts @@ -5,8 +5,7 @@ */ import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; -import { assertModelHasOutput } from '../integration-tests/test-helper.js'; +import { evalTest, assertModelHasOutput } from './test-helper.js'; describe('Hierarchical Memory', () => { const conflictResolutionTest = diff --git a/evals/save_memory.eval.ts b/evals/save_memory.eval.ts index bbb13d1c44..5a228ed065 100644 --- a/evals/save_memory.eval.ts +++ b/evals/save_memory.eval.ts @@ -5,11 +5,11 @@ */ import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; import { + evalTest, assertModelHasOutput, checkModelOutputContent, -} from '../integration-tests/test-helper.js'; +} from './test-helper.js'; describe('save_memory', () => { const TEST_PREFIX = 'Save memory test: '; diff --git a/evals/test-helper.test.ts b/evals/test-helper.test.ts index c0147cda75..6be26e918a 100644 --- a/evals/test-helper.test.ts +++ b/evals/test-helper.test.ts @@ -49,6 +49,8 @@ describe('evalTest reliability logic', () => { // Execute the test function directly await internalEvalTest({ + suiteName: 'test', + suiteType: 'behavioral', name: 'test-api-failure', prompt: 'do something', assert: async () => {}, @@ -83,6 +85,8 @@ describe('evalTest reliability logic', () => { // Expect the test function to throw immediately await expect( internalEvalTest({ + suiteName: 'test', + suiteType: 'behavioral', name: 'test-logic-failure', prompt: 'do something', assert: async () => { @@ -108,6 +112,8 @@ describe('evalTest reliability logic', () => { .mockResolvedValueOnce('Success'); await internalEvalTest({ + suiteName: 'test', + suiteType: 'behavioral', name: 'test-recovery', prompt: 'do something', assert: async () => {}, @@ -135,6 +141,8 @@ describe('evalTest reliability logic', () => { ); await internalEvalTest({ + suiteName: 'test', + suiteType: 'behavioral', name: 'test-api-503', prompt: 'do something', assert: async () => {}, @@ -162,6 +170,8 @@ describe('evalTest reliability logic', () => { try { await expect( internalEvalTest({ + suiteName: 'test', + suiteType: 'behavioral', name: 'test-absolute-path', prompt: 'do something', files: { @@ -190,6 +200,8 @@ describe('evalTest reliability logic', () => { try { await expect( internalEvalTest({ + suiteName: 'test', + suiteType: 'behavioral', name: 'test-traversal', prompt: 'do something', files: { diff --git a/evals/tsconfig.json b/evals/tsconfig.json deleted file mode 100644 index edc9007206..0000000000 --- a/evals/tsconfig.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "extends": "../tsconfig.json", - "compilerOptions": { - "jsx": "react-jsx", - "lib": ["DOM", "DOM.Iterable", "ES2023"], - "types": ["node", "vitest/globals"] - }, - "include": ["**/*.ts", "**/*.tsx"], - "exclude": ["node_modules", "logs"] -}