diff --git a/evals/ask_user.eval.ts b/evals/ask_user.eval.ts index c67f995168..6495cb3f22 100644 --- a/evals/ask_user.eval.ts +++ b/evals/ask_user.eval.ts @@ -5,31 +5,62 @@ */ import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; +import { appEvalTest, AppEvalCase } from './app-test-helper.js'; +import { EvalPolicy } from './test-helper.js'; + +function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) { + return appEvalTest(policy, { + ...evalCase, + configOverrides: { + ...evalCase.configOverrides, + general: { + ...evalCase.configOverrides?.general, + approvalMode: 'default', + enableAutoUpdate: false, + enableAutoUpdateNotification: false, + }, + }, + files: { + ...evalCase.files, + }, + }); +} describe('ask_user', () => { - evalTest('USUALLY_PASSES', { + askUserEvalTest('USUALLY_PASSES', { name: 'Agent uses AskUser tool to present multiple choice options', prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`, + setup: async (rig) => { + rig.setBreakpoint(['ask_user']); + }, assert: async (rig) => { - const wasToolCalled = await rig.waitForToolCall('ask_user'); - expect(wasToolCalled, 'Expected ask_user tool to be called').toBe(true); + const confirmation = await rig.waitForPendingConfirmation('ask_user'); + expect( + confirmation, + 'Expected a pending confirmation for ask_user tool', + ).toBeDefined(); }, }); - evalTest('USUALLY_PASSES', { + askUserEvalTest('USUALLY_PASSES', { name: 'Agent uses AskUser tool to clarify ambiguous requirements', files: { 'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }), }, prompt: `I want to build a new feature in this app. Ask me questions to clarify the requirements before proceeding.`, + setup: async (rig) => { + rig.setBreakpoint(['ask_user']); + }, assert: async (rig) => { - const wasToolCalled = await rig.waitForToolCall('ask_user'); - expect(wasToolCalled, 'Expected ask_user tool to be called').toBe(true); + const confirmation = await rig.waitForPendingConfirmation('ask_user'); + expect( + confirmation, + 'Expected a pending confirmation for ask_user tool', + ).toBeDefined(); }, }); - evalTest('USUALLY_PASSES', { + askUserEvalTest('USUALLY_PASSES', { name: 'Agent uses AskUser tool before performing significant ambiguous rework', files: { 'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";', @@ -39,28 +70,37 @@ describe('ask_user', () => { }), 'README.md': '# Gemini CLI', }, - prompt: `Refactor the entire core package to be better.`, + prompt: `I want to completely rewrite the core package to support the upcoming V2 architecture, but I haven't decided what that looks like yet. We need to figure out the requirements first. Can you ask me some questions to help nail down the design?`, + setup: async (rig) => { + rig.setBreakpoint(['enter_plan_mode', 'ask_user']); + }, assert: async (rig) => { - const wasPlanModeCalled = await rig.waitForToolCall('enter_plan_mode'); - expect(wasPlanModeCalled, 'Expected enter_plan_mode to be called').toBe( - true, - ); + // It might call enter_plan_mode first. + let confirmation = await rig.waitForPendingConfirmation([ + 'enter_plan_mode', + 'ask_user', + ]); + expect(confirmation, 'Expected a tool call confirmation').toBeDefined(); + + if (confirmation?.name === 'enter_plan_mode') { + rig.acceptConfirmation('enter_plan_mode'); + confirmation = await rig.waitForPendingConfirmation('ask_user'); + } - const wasAskUserCalled = await rig.waitForToolCall('ask_user'); expect( - wasAskUserCalled, - 'Expected ask_user tool to be called to clarify the significant rework', - ).toBe(true); + confirmation?.toolName, + 'Expected ask_user to be called to clarify the significant rework', + ).toBe('ask_user'); }, }); // --- Regression Tests for Recent Fixes --- - // Regression test for issue #20177: Ensure the agent does not use `ask_user` to + // Regression test for issue #20177: Ensure the agent does not use \`ask_user\` to // confirm shell commands. Fixed via prompt refinements and tool definition // updates to clarify that shell command confirmation is handled by the UI. // See fix: https://github.com/google-gemini/gemini-cli/pull/20504 - evalTest('USUALLY_PASSES', { + askUserEvalTest('USUALLY_PASSES', { name: 'Agent does NOT use AskUser to confirm shell commands', files: { 'package.json': JSON.stringify({ @@ -68,25 +108,24 @@ describe('ask_user', () => { }), }, prompt: `Run 'npm run build' in the current directory.`, + setup: async (rig) => { + rig.setBreakpoint(['run_shell_command', 'ask_user']); + }, assert: async (rig) => { - await rig.waitForTelemetryReady(); - - const toolLogs = rig.readToolLogs(); - const wasShellCalled = toolLogs.some( - (log) => log.toolRequest.name === 'run_shell_command', - ); - const wasAskUserCalled = toolLogs.some( - (log) => log.toolRequest.name === 'ask_user', - ); + const confirmation = await rig.waitForPendingConfirmation([ + 'run_shell_command', + 'ask_user', + ]); expect( - wasShellCalled, - 'Expected run_shell_command tool to be called', - ).toBe(true); + confirmation, + 'Expected a pending confirmation for a tool', + ).toBeDefined(); + expect( - wasAskUserCalled, + confirmation?.toolName, 'ask_user should not be called to confirm shell commands', - ).toBe(false); + ).toBe('run_shell_command'); }, }); }); diff --git a/packages/cli/src/test-utils/AppRig.tsx b/packages/cli/src/test-utils/AppRig.tsx index a9aea95376..6ee39c879c 100644 --- a/packages/cli/src/test-utils/AppRig.tsx +++ b/packages/cli/src/test-utils/AppRig.tsx @@ -487,7 +487,7 @@ export class AppRig { } async waitForPendingConfirmation( - toolNameOrDisplayName?: string | RegExp, + toolNameOrDisplayName?: string | RegExp | string[], timeout = 30000, ): Promise { const matches = (p: PendingConfirmation) => { @@ -498,6 +498,12 @@ export class AppRig { p.toolDisplayName === toolNameOrDisplayName ); } + if (Array.isArray(toolNameOrDisplayName)) { + return ( + toolNameOrDisplayName.includes(p.toolName) || + toolNameOrDisplayName.includes(p.toolDisplayName || '') + ); + } return ( toolNameOrDisplayName.test(p.toolName) || toolNameOrDisplayName.test(p.toolDisplayName || '') diff --git a/packages/test-utils/src/test-rig.ts b/packages/test-utils/src/test-rig.ts index 6d888aeef8..ee091bee92 100644 --- a/packages/test-utils/src/test-rig.ts +++ b/packages/test-utils/src/test-rig.ts @@ -353,6 +353,7 @@ export class TestRig { testName: string, options: { settings?: Record; + state?: Record; fakeResponsesPath?: string; } = {}, ) { @@ -382,6 +383,9 @@ export class TestRig { // Create a settings file to point the CLI to the local collector this._createSettingsFile(options.settings); + + // Create persistent state file + this._createStateFile(options.state); } private _cleanDir(dir: string) { @@ -473,6 +477,24 @@ export class TestRig { ); } + private _createStateFile(overrideState?: Record) { + if (!this.homeDir) throw new Error('TestRig homeDir is not initialized'); + const userGeminiDir = join(this.homeDir, GEMINI_DIR); + mkdirSync(userGeminiDir, { recursive: true }); + + const state = deepMerge( + { + terminalSetupPromptShown: true, // Default to true in tests to avoid blocking prompts + }, + overrideState ?? {}, + ); + + writeFileSync( + join(userGeminiDir, 'state.json'), + JSON.stringify(state, null, 2), + ); + } + createFile(fileName: string, content: string) { const filePath = join(this.testDir!, fileName); writeFileSync(filePath, content);