From d3766875f8d36c0f37f027e21743a968c22336d7 Mon Sep 17 00:00:00 2001 From: Sandy Tao Date: Fri, 20 Mar 2026 20:45:33 -0700 Subject: [PATCH] fix(evals): remove tool restrictions and add compile-time guards (#23312) --- evals/app-test-helper.ts | 19 +++++++- evals/generalist_delegation.eval.ts | 4 -- evals/model_steering.eval.ts | 2 - evals/save_memory.eval.ts | 68 ++++------------------------- evals/test-helper.ts | 18 +++++++- 5 files changed, 43 insertions(+), 68 deletions(-) diff --git a/evals/app-test-helper.ts b/evals/app-test-helper.ts index 89f1582bdc..2bcff41924 100644 --- a/evals/app-test-helper.ts +++ b/evals/app-test-helper.ts @@ -15,9 +15,26 @@ import fs from 'node:fs'; import path from 'node:path'; import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core'; +/** + * Config overrides for evals, with tool-restriction fields explicitly + * forbidden. Evals must test against the full, default tool set to ensure + * realistic behavior. + */ +interface EvalConfigOverrides { + /** Restricting tools via excludeTools in evals is forbidden. */ + excludeTools?: never; + /** Restricting tools via coreTools in evals is forbidden. */ + coreTools?: never; + /** Restricting tools via allowedTools in evals is forbidden. */ + allowedTools?: never; + /** Restricting tools via mainAgentTools in evals is forbidden. */ + mainAgentTools?: never; + [key: string]: unknown; +} + export interface AppEvalCase { name: string; - configOverrides?: any; + configOverrides?: EvalConfigOverrides; prompt: string; timeout?: number; files?: Record; diff --git a/evals/generalist_delegation.eval.ts b/evals/generalist_delegation.eval.ts index 7e6358ae1f..81252880eb 100644 --- a/evals/generalist_delegation.eval.ts +++ b/evals/generalist_delegation.eval.ts @@ -21,7 +21,6 @@ describe('generalist_delegation', () => { experimental: { enableAgents: true, }, - excludeTools: ['run_shell_command'], }, files: { 'file1.ts': 'console.log("no semi")', @@ -65,7 +64,6 @@ describe('generalist_delegation', () => { experimental: { enableAgents: true, }, - excludeTools: ['run_shell_command'], }, files: { 'src/a.ts': 'export const a = 1;', @@ -106,7 +104,6 @@ describe('generalist_delegation', () => { experimental: { enableAgents: true, }, - excludeTools: ['run_shell_command'], }, files: { 'README.md': 'This is a proyect.', @@ -141,7 +138,6 @@ describe('generalist_delegation', () => { experimental: { enableAgents: true, }, - excludeTools: ['run_shell_command'], }, files: { 'src/VERSION': '1.2.3', diff --git a/evals/model_steering.eval.ts b/evals/model_steering.eval.ts index 4a5ae46e3f..2cb87edcc2 100644 --- a/evals/model_steering.eval.ts +++ b/evals/model_steering.eval.ts @@ -15,7 +15,6 @@ describe('Model Steering Behavioral Evals', () => { appEvalTest('USUALLY_PASSES', { name: 'Corrective Hint: Model switches task based on hint during tool turn', configOverrides: { - excludeTools: ['run_shell_command', 'ls', 'google_web_search'], modelSteering: true, }, files: { @@ -55,7 +54,6 @@ describe('Model Steering Behavioral Evals', () => { appEvalTest('USUALLY_PASSES', { name: 'Suggestive Hint: Model incorporates user guidance mid-stream', configOverrides: { - excludeTools: ['run_shell_command', 'ls', 'google_web_search'], modelSteering: true, }, files: {}, diff --git a/evals/save_memory.eval.ts b/evals/save_memory.eval.ts index 901cbf3c17..8be7b39e35 100644 --- a/evals/save_memory.eval.ts +++ b/evals/save_memory.eval.ts @@ -16,9 +16,7 @@ describe('save_memory', () => { const rememberingFavoriteColor = "Agent remembers user's favorite color"; evalTest('ALWAYS_PASSES', { name: rememberingFavoriteColor, - params: { - settings: { tools: { core: ['save_memory'] } }, - }, + prompt: `remember that my favorite color is blue. what is my favorite color? tell me that and surround it with $ symbol`, @@ -38,9 +36,7 @@ describe('save_memory', () => { const rememberingCommandRestrictions = 'Agent remembers command restrictions'; evalTest('USUALLY_PASSES', { name: rememberingCommandRestrictions, - params: { - settings: { tools: { core: ['save_memory'] } }, - }, + prompt: `I don't want you to ever run npm commands.`, assert: async (rig, result) => { const wasToolCalled = await rig.waitForToolCall('save_memory'); @@ -59,9 +55,7 @@ describe('save_memory', () => { const rememberingWorkflow = 'Agent remembers workflow preferences'; evalTest('USUALLY_PASSES', { name: rememberingWorkflow, - params: { - settings: { tools: { core: ['save_memory'] } }, - }, + prompt: `I want you to always lint after building.`, assert: async (rig, result) => { const wasToolCalled = await rig.waitForToolCall('save_memory'); @@ -81,9 +75,7 @@ describe('save_memory', () => { 'Agent ignores temporary conversation details'; evalTest('ALWAYS_PASSES', { name: ignoringTemporaryInformation, - params: { - settings: { tools: { core: ['save_memory'] } }, - }, + prompt: `I'm going to get a coffee.`, assert: async (rig, result) => { await rig.waitForTelemetryReady(); @@ -106,9 +98,7 @@ describe('save_memory', () => { const rememberingPetName = "Agent remembers user's pet's name"; evalTest('ALWAYS_PASSES', { name: rememberingPetName, - params: { - settings: { tools: { core: ['save_memory'] } }, - }, + prompt: `Please remember that my dog's name is Buddy.`, assert: async (rig, result) => { const wasToolCalled = await rig.waitForToolCall('save_memory'); @@ -127,9 +117,7 @@ describe('save_memory', () => { const rememberingCommandAlias = 'Agent remembers custom command aliases'; evalTest('ALWAYS_PASSES', { name: rememberingCommandAlias, - params: { - settings: { tools: { core: ['save_memory'] } }, - }, + prompt: `When I say 'start server', you should run 'npm run dev'.`, assert: async (rig, result) => { const wasToolCalled = await rig.waitForToolCall('save_memory'); @@ -149,18 +137,6 @@ describe('save_memory', () => { "Agent ignores workspace's database schema location"; evalTest('USUALLY_PASSES', { name: ignoringDbSchemaLocation, - params: { - settings: { - tools: { - core: [ - 'save_memory', - 'list_directory', - 'read_file', - 'run_shell_command', - ], - }, - }, - }, prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`, assert: async (rig, result) => { await rig.waitForTelemetryReady(); @@ -180,9 +156,7 @@ describe('save_memory', () => { "Agent remembers user's coding style preference"; evalTest('ALWAYS_PASSES', { name: rememberingCodingStyle, - params: { - settings: { tools: { core: ['save_memory'] } }, - }, + prompt: `I prefer to use tabs instead of spaces for indentation.`, assert: async (rig, result) => { const wasToolCalled = await rig.waitForToolCall('save_memory'); @@ -202,18 +176,6 @@ describe('save_memory', () => { 'Agent ignores workspace build artifact location'; evalTest('USUALLY_PASSES', { name: ignoringBuildArtifactLocation, - params: { - settings: { - tools: { - core: [ - 'save_memory', - 'list_directory', - 'read_file', - 'run_shell_command', - ], - }, - }, - }, prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`, assert: async (rig, result) => { await rig.waitForTelemetryReady(); @@ -232,18 +194,6 @@ describe('save_memory', () => { const ignoringMainEntryPoint = "Agent ignores workspace's main entry point"; evalTest('USUALLY_PASSES', { name: ignoringMainEntryPoint, - params: { - settings: { - tools: { - core: [ - 'save_memory', - 'list_directory', - 'read_file', - 'run_shell_command', - ], - }, - }, - }, prompt: `The main entry point for this workspace is \`src/index.js\`.`, assert: async (rig, result) => { await rig.waitForTelemetryReady(); @@ -262,9 +212,7 @@ describe('save_memory', () => { const rememberingBirthday = "Agent remembers user's birthday"; evalTest('ALWAYS_PASSES', { name: rememberingBirthday, - params: { - settings: { tools: { core: ['save_memory'] } }, - }, + prompt: `My birthday is on June 15th.`, assert: async (rig, result) => { const wasToolCalled = await rig.waitForToolCall('save_memory'); diff --git a/evals/test-helper.ts b/evals/test-helper.ts index 786ec0e418..66143ddfb6 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -197,9 +197,25 @@ export function symlinkNodeModules(testDir: string) { } } +/** + * Settings that are forbidden in evals. Evals should never restrict which + * tools are available — they must test against the full, default tool set + * to ensure realistic behavior. + */ +interface ForbiddenToolSettings { + tools?: { + /** Restricting core tools in evals is forbidden. */ + core?: never; + [key: string]: unknown; + }; +} + export interface EvalCase { name: string; - params?: Record; + params?: { + settings?: ForbiddenToolSettings & Record; + [key: string]: unknown; + }; prompt: string; timeout?: number; files?: Record;