From d3766875f8d36c0f37f027e21743a968c22336d7 Mon Sep 17 00:00:00 2001
From: Sandy Tao <sandytao520@icloud.com>
Date: Fri, 20 Mar 2026 20:45:33 -0700
Subject: [PATCH] fix(evals): remove tool restrictions and add compile-time
 guards (#23312)

---
 evals/app-test-helper.ts            | 19 +++++++-
 evals/generalist_delegation.eval.ts |  4 --
 evals/model_steering.eval.ts        |  2 -
 evals/save_memory.eval.ts           | 68 ++++-------------------------
 evals/test-helper.ts                | 18 +++++++-
 5 files changed, 43 insertions(+), 68 deletions(-)

diff --git a/evals/app-test-helper.ts b/evals/app-test-helper.ts
index 89f1582bdc..2bcff41924 100644
--- a/evals/app-test-helper.ts
+++ b/evals/app-test-helper.ts
@@ -15,9 +15,26 @@ import fs from 'node:fs';
 import path from 'node:path';
 import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
 
+/**
+ * Config overrides accepted by app evals. Tool-restriction keys are typed as
+ * optional `never`, so setting one to an actual value is a compile-time
+ * error; evals must run against the full, default tool set.
+ */
+interface EvalConfigOverrides {
+  /** Restricting tools via excludeTools in evals is forbidden. */
+  excludeTools?: never;
+  /** Restricting tools via coreTools in evals is forbidden. */
+  coreTools?: never;
+  /** Restricting tools via allowedTools in evals is forbidden. */
+  allowedTools?: never;
+  /** Restricting tools via mainAgentTools in evals is forbidden. */
+  mainAgentTools?: never;
+  [key: string]: unknown; // any other override key is accepted unchecked
+}
+
 export interface AppEvalCase {
   name: string;
-  configOverrides?: any;
+  configOverrides?: EvalConfigOverrides;
   prompt: string;
   timeout?: number;
   files?: Record<string, string>;
diff --git a/evals/generalist_delegation.eval.ts b/evals/generalist_delegation.eval.ts
index 7e6358ae1f..81252880eb 100644
--- a/evals/generalist_delegation.eval.ts
+++ b/evals/generalist_delegation.eval.ts
@@ -21,7 +21,6 @@ describe('generalist_delegation', () => {
       experimental: {
         enableAgents: true,
       },
-      excludeTools: ['run_shell_command'],
     },
     files: {
       'file1.ts': 'console.log("no semi")',
@@ -65,7 +64,6 @@ describe('generalist_delegation', () => {
       experimental: {
         enableAgents: true,
       },
-      excludeTools: ['run_shell_command'],
     },
     files: {
       'src/a.ts': 'export const a = 1;',
@@ -106,7 +104,6 @@ describe('generalist_delegation', () => {
       experimental: {
         enableAgents: true,
       },
-      excludeTools: ['run_shell_command'],
     },
     files: {
       'README.md': 'This is a proyect.',
@@ -141,7 +138,6 @@ describe('generalist_delegation', () => {
       experimental: {
         enableAgents: true,
       },
-      excludeTools: ['run_shell_command'],
     },
     files: {
       'src/VERSION': '1.2.3',
diff --git a/evals/model_steering.eval.ts b/evals/model_steering.eval.ts
index 4a5ae46e3f..2cb87edcc2 100644
--- a/evals/model_steering.eval.ts
+++ b/evals/model_steering.eval.ts
@@ -15,7 +15,6 @@ describe('Model Steering Behavioral Evals', () => {
   appEvalTest('USUALLY_PASSES', {
     name: 'Corrective Hint: Model switches task based on hint during tool turn',
     configOverrides: {
-      excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
       modelSteering: true,
     },
     files: {
@@ -55,7 +54,6 @@ describe('Model Steering Behavioral Evals', () => {
   appEvalTest('USUALLY_PASSES', {
     name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
     configOverrides: {
-      excludeTools: ['run_shell_command', 'ls', 'google_web_search'],
       modelSteering: true,
     },
     files: {},
diff --git a/evals/save_memory.eval.ts b/evals/save_memory.eval.ts
index 901cbf3c17..8be7b39e35 100644
--- a/evals/save_memory.eval.ts
+++ b/evals/save_memory.eval.ts
@@ -16,9 +16,7 @@ describe('save_memory', () => {
   const rememberingFavoriteColor = "Agent remembers user's favorite color";
   evalTest('ALWAYS_PASSES', {
     name: rememberingFavoriteColor,
-    params: {
-      settings: { tools: { core: ['save_memory'] } },
-    },
+
     prompt: `remember that my favorite color is  blue.
   
     what is my favorite color? tell me that and surround it with $ symbol`,
@@ -38,9 +36,7 @@ describe('save_memory', () => {
   const rememberingCommandRestrictions = 'Agent remembers command restrictions';
   evalTest('USUALLY_PASSES', {
     name: rememberingCommandRestrictions,
-    params: {
-      settings: { tools: { core: ['save_memory'] } },
-    },
+
     prompt: `I don't want you to ever run npm commands.`,
     assert: async (rig, result) => {
       const wasToolCalled = await rig.waitForToolCall('save_memory');
@@ -59,9 +55,7 @@ describe('save_memory', () => {
   const rememberingWorkflow = 'Agent remembers workflow preferences';
   evalTest('USUALLY_PASSES', {
     name: rememberingWorkflow,
-    params: {
-      settings: { tools: { core: ['save_memory'] } },
-    },
+
     prompt: `I want you to always lint after building.`,
     assert: async (rig, result) => {
       const wasToolCalled = await rig.waitForToolCall('save_memory');
@@ -81,9 +75,7 @@ describe('save_memory', () => {
     'Agent ignores temporary conversation details';
   evalTest('ALWAYS_PASSES', {
     name: ignoringTemporaryInformation,
-    params: {
-      settings: { tools: { core: ['save_memory'] } },
-    },
+
     prompt: `I'm going to get a coffee.`,
     assert: async (rig, result) => {
       await rig.waitForTelemetryReady();
@@ -106,9 +98,7 @@ describe('save_memory', () => {
   const rememberingPetName = "Agent remembers user's pet's name";
   evalTest('ALWAYS_PASSES', {
     name: rememberingPetName,
-    params: {
-      settings: { tools: { core: ['save_memory'] } },
-    },
+
     prompt: `Please remember that my dog's name is Buddy.`,
     assert: async (rig, result) => {
       const wasToolCalled = await rig.waitForToolCall('save_memory');
@@ -127,9 +117,7 @@ describe('save_memory', () => {
   const rememberingCommandAlias = 'Agent remembers custom command aliases';
   evalTest('ALWAYS_PASSES', {
     name: rememberingCommandAlias,
-    params: {
-      settings: { tools: { core: ['save_memory'] } },
-    },
+
     prompt: `When I say 'start server', you should run 'npm run dev'.`,
     assert: async (rig, result) => {
       const wasToolCalled = await rig.waitForToolCall('save_memory');
@@ -149,18 +137,6 @@ describe('save_memory', () => {
     "Agent ignores workspace's database schema location";
   evalTest('USUALLY_PASSES', {
     name: ignoringDbSchemaLocation,
-    params: {
-      settings: {
-        tools: {
-          core: [
-            'save_memory',
-            'list_directory',
-            'read_file',
-            'run_shell_command',
-          ],
-        },
-      },
-    },
     prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
     assert: async (rig, result) => {
       await rig.waitForTelemetryReady();
@@ -180,9 +156,7 @@ describe('save_memory', () => {
     "Agent remembers user's coding style preference";
   evalTest('ALWAYS_PASSES', {
     name: rememberingCodingStyle,
-    params: {
-      settings: { tools: { core: ['save_memory'] } },
-    },
+
     prompt: `I prefer to use tabs instead of spaces for indentation.`,
     assert: async (rig, result) => {
       const wasToolCalled = await rig.waitForToolCall('save_memory');
@@ -202,18 +176,6 @@ describe('save_memory', () => {
     'Agent ignores workspace build artifact location';
   evalTest('USUALLY_PASSES', {
     name: ignoringBuildArtifactLocation,
-    params: {
-      settings: {
-        tools: {
-          core: [
-            'save_memory',
-            'list_directory',
-            'read_file',
-            'run_shell_command',
-          ],
-        },
-      },
-    },
     prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
     assert: async (rig, result) => {
       await rig.waitForTelemetryReady();
@@ -232,18 +194,6 @@ describe('save_memory', () => {
   const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
   evalTest('USUALLY_PASSES', {
     name: ignoringMainEntryPoint,
-    params: {
-      settings: {
-        tools: {
-          core: [
-            'save_memory',
-            'list_directory',
-            'read_file',
-            'run_shell_command',
-          ],
-        },
-      },
-    },
     prompt: `The main entry point for this workspace is \`src/index.js\`.`,
     assert: async (rig, result) => {
       await rig.waitForTelemetryReady();
@@ -262,9 +212,7 @@ describe('save_memory', () => {
   const rememberingBirthday = "Agent remembers user's birthday";
   evalTest('ALWAYS_PASSES', {
     name: rememberingBirthday,
-    params: {
-      settings: { tools: { core: ['save_memory'] } },
-    },
+
     prompt: `My birthday is on June 15th.`,
     assert: async (rig, result) => {
       const wasToolCalled = await rig.waitForToolCall('save_memory');
diff --git a/evals/test-helper.ts b/evals/test-helper.ts
index 786ec0e418..66143ddfb6 100644
--- a/evals/test-helper.ts
+++ b/evals/test-helper.ts
@@ -197,9 +197,25 @@ export function symlinkNodeModules(testDir: string) {
   }
 }
 
+/**
+ * Settings shape that blocks tool restriction in evals: `tools.core` is typed
+ * as optional `never`, so setting it to an actual value is a compile-time
+ * error. Evals must test against the full, default tool set.
+ */
+interface ForbiddenToolSettings {
+  tools?: {
+    /** Restricting core tools in evals is forbidden. */
+    core?: never;
+    [key: string]: unknown; // NOTE(review): only `core` is guarded
+  };
+}
+
 export interface EvalCase {
   name: string;
-  params?: Record<string, any>;
+  params?: {
+    settings?: ForbiddenToolSettings & Record<string, unknown>;
+    [key: string]: unknown;
+  };
   prompt: string;
   timeout?: number;
   files?: Record<string, string>;