From b4053dc8dc7ae7643d06c6a34b87d97db124e403 Mon Sep 17 00:00:00 2001
From: Christian Gunderman <gundermanc@google.com>
Date: Tue, 7 Apr 2026 08:44:25 -0700
Subject: [PATCH] Use Gemini 3 Flash preview for evals and add blind-guess scenario

---
 evals/app-test-helper.ts                      |  4 +-
 evals/compression.eval.ts                     | 92 ++++++++++++++++++-
 .../compression/scenario-blind-guess.json.js  | 73 +++++++++++++++
 evals/test-helper.ts                          | 16 +++-
 evals/tool_output_masking.eval.ts             |  6 +-
 5 files changed, 181 insertions(+), 10 deletions(-)
 create mode 100644 evals/data/compression/scenario-blind-guess.json.js

diff --git a/evals/app-test-helper.ts b/evals/app-test-helper.ts
index 017063a3f7..ce54adaaf4 100644
--- a/evals/app-test-helper.ts
+++ b/evals/app-test-helper.ts
@@ -13,10 +13,10 @@ import {
   withEvalRetries,
   prepareWorkspace,
   BaseEvalCase,
+  EVAL_MODEL,
 } from './test-helper.js';
 import fs from 'node:fs';
 import path from 'node:path';
-import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
 
 /**
  * Config overrides for evals, with tool-restriction fields explicitly
@@ -51,7 +51,7 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
     await withEvalRetries(evalCase.name, async () => {
       const rig = new AppRig({
         configOverrides: {
-          model: DEFAULT_GEMINI_MODEL,
+          model: EVAL_MODEL,
           ...evalCase.configOverrides,
         },
       });
diff --git a/evals/compression.eval.ts b/evals/compression.eval.ts
index 589991cfb7..0929089a7e 100644
--- a/evals/compression.eval.ts
+++ b/evals/compression.eval.ts
@@ -10,13 +10,12 @@ import {
   AgentHistoryProvider,
   ChatCompressionService,
   GeminiChat,
-  DEFAULT_GEMINI_MODEL,
 } from '@google/gemini-cli-core';
 import type { Content } from '@google/genai';
 import fs from 'node:fs';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
-import { type EvalPolicy } from './test-helper.js';
+import { type EvalPolicy, EVAL_MODEL } from './test-helper.js';
 
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
@@ -117,6 +116,7 @@ const configs: Record<string, CompressionConfig> = {
 };
 
 function componentEval(
+  name: string,
   policy: EvalPolicy,
   scenario: Scenario,
   configName: keyof typeof configs,
@@ -131,8 +131,8 @@ function componentEval(
   }
 
   componentEvalTest(policy, {
-    name: `Compression | ${scenario.scenarioId} | ${compConfig.name} | ${question.id}`,
-    configOverrides: { model: DEFAULT_GEMINI_MODEL },
+    name,
+    configOverrides: { model: EVAL_MODEL },
     assert: async (config) => {
       const originalTokens =
         (
@@ -205,18 +205,21 @@ describe('Compression Benchmark', () => {
   describe('Blind Guessing', () => {
     const scenario = getScenario('scenario-blind-guess');
     componentEval(
+      'ChatCompressionService - exact-error-string',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'exact-error-string',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - exact-error-string',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'exact-error-string',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - exact-error-string',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -227,18 +230,21 @@ describe('Compression Benchmark', () => {
   describe('Conditional Instructions', () => {
     const scenario = getScenario('scenario-conditional-instruction');
     componentEval(
+      'ChatCompressionService - error-handler',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'error-handler',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - error-handler',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'error-handler',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - error-handler',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -249,36 +255,42 @@ describe('Compression Benchmark', () => {
   describe('Constraints', () => {
     const scenario = getScenario('scenario-constraints');
     componentEval(
+      'ChatCompressionService - variable-naming',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'variable-naming',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - variable-naming',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'variable-naming',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - variable-naming',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
       'variable-naming',
     );
     componentEval(
+      'ChatCompressionService - deployment-target',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'deployment-target',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - deployment-target',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'deployment-target',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - deployment-target',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -289,18 +301,21 @@ describe('Compression Benchmark', () => {
   describe('Context Amnesia', () => {
     const scenario = getScenario('scenario-context-amnesia');
     componentEval(
+      'ChatCompressionService - recall-file-content',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'recall-file-content',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - recall-file-content',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'recall-file-content',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - recall-file-content',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -311,18 +326,21 @@ describe('Compression Benchmark', () => {
   describe('Context Thrashing', () => {
     const scenario = getScenario('scenario-context-thrashing');
     componentEval(
+      'ChatCompressionService - failed-edit-reason',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'failed-edit-reason',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - failed-edit-reason',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'failed-edit-reason',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - failed-edit-reason',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -333,18 +351,21 @@ describe('Compression Benchmark', () => {
   describe('Dependency Chain', () => {
     const scenario = getScenario('scenario-dependency-chain');
     componentEval(
+      'ChatCompressionService - migration-dependency',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'migration-dependency',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - migration-dependency',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'migration-dependency',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - migration-dependency',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -355,18 +376,21 @@ describe('Compression Benchmark', () => {
   describe('Massive Working Set', () => {
     const scenario = getScenario('scenario-massive-working-set');
     componentEval(
+      'ChatCompressionService - file-25-presence',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'file-25-presence',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - file-25-presence',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'file-25-presence',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - file-25-presence',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -377,18 +401,21 @@ describe('Compression Benchmark', () => {
   describe('Middle Reasoning', () => {
     const scenario = getScenario('scenario-middle-reasoning');
     componentEval(
+      'ChatCompressionService - conflict-pid',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'conflict-pid',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - conflict-pid',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'conflict-pid',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - conflict-pid',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -399,36 +426,42 @@ describe('Compression Benchmark', () => {
   describe('Milestones', () => {
     const scenario = getScenario('scenario-milestones');
     componentEval(
+      'ChatCompressionService - tax-function-name',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'tax-function-name',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - tax-function-name',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'tax-function-name',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - tax-function-name',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
       'tax-function-name',
     );
     componentEval(
+      'ChatCompressionService - milestone-name',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'milestone-name',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - milestone-name',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'milestone-name',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - milestone-name',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -439,18 +472,21 @@ describe('Compression Benchmark', () => {
   describe('Multi-Bug Tracking', () => {
     const scenario = getScenario('scenario-multi-bug-tracking');
     componentEval(
+      'ChatCompressionService - bug-list',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'bug-list',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - bug-list',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'bug-list',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - bug-list',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -461,18 +497,21 @@ describe('Compression Benchmark', () => {
   describe('Myopic Keyhole', () => {
     const scenario = getScenario('scenario-myopic-keyhole');
     componentEval(
+      'ChatCompressionService - macro-context-inheritance',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'macro-context-inheritance',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - macro-context-inheritance',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'macro-context-inheritance',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - macro-context-inheritance',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -483,18 +522,21 @@ describe('Compression Benchmark', () => {
   describe('Negative Constraints', () => {
     const scenario = getScenario('scenario-negative-constraints');
     componentEval(
+      'ChatCompressionService - import-constraint',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'import-constraint',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - import-constraint',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'import-constraint',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - import-constraint',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -505,18 +547,21 @@ describe('Compression Benchmark', () => {
   describe('Nested Logic', () => {
     const scenario = getScenario('scenario-nested-logic');
     componentEval(
+      'ChatCompressionService - fallback-mode',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'fallback-mode',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - fallback-mode',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'fallback-mode',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - fallback-mode',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -527,18 +572,21 @@ describe('Compression Benchmark', () => {
   describe('Replace Loop', () => {
     const scenario = getScenario('scenario-replace-loop');
     componentEval(
+      'ChatCompressionService - recall-exact-line',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'recall-exact-line',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - recall-exact-line',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'recall-exact-line',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - recall-exact-line',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -549,18 +597,21 @@ describe('Compression Benchmark', () => {
   describe('Spatial Scattering', () => {
     const scenario = getScenario('scenario-spatial-scattering');
     componentEval(
+      'ChatCompressionService - recall-target-directory',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'recall-target-directory',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - recall-target-directory',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'recall-target-directory',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - recall-target-directory',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -571,18 +622,21 @@ describe('Compression Benchmark', () => {
   describe('State Tracking', () => {
     const scenario = getScenario('scenario-state-tracking');
     componentEval(
+      'ChatCompressionService - next-step',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'next-step',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - next-step',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'next-step',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - next-step',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -593,18 +647,21 @@ describe('Compression Benchmark', () => {
   describe('Strategy Abandonment', () => {
     const scenario = getScenario('scenario-strategy-abandonment');
     componentEval(
+      'ChatCompressionService - recall-next-action',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'recall-next-action',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - recall-next-action',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'recall-next-action',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - recall-next-action',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -615,18 +672,21 @@ describe('Compression Benchmark', () => {
   describe('Subtle Preference', () => {
     const scenario = getScenario('scenario-subtle-preference');
     componentEval(
+      'ChatCompressionService - commit-prefix',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'commit-prefix',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - commit-prefix',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'commit-prefix',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - commit-prefix',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -637,18 +697,21 @@ describe('Compression Benchmark', () => {
   describe('Surgical Imprecision', () => {
     const scenario = getScenario('scenario-surgical-imprecision');
     componentEval(
+      'ChatCompressionService - surgical-edit-check',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'surgical-edit-check',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - surgical-edit-check',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'surgical-edit-check',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - surgical-edit-check',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -659,18 +722,21 @@ describe('Compression Benchmark', () => {
   describe('Symbol Location Amnesia', () => {
     const scenario = getScenario('scenario-symbol-location-amnesia');
     componentEval(
+      'ChatCompressionService - recall-symbol-file',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'recall-symbol-file',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - recall-symbol-file',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'recall-symbol-file',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - recall-symbol-file',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -681,36 +747,42 @@ describe('Compression Benchmark', () => {
   describe('Tool Noise', () => {
     const scenario = getScenario('scenario-tool-noise');
     componentEval(
+      'ChatCompressionService - fatal-error',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'fatal-error',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - fatal-error',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'fatal-error',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - fatal-error',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
       'fatal-error',
     );
     componentEval(
+      'ChatCompressionService - success-token',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'success-token',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - success-token',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'success-token',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - success-token',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -721,18 +793,21 @@ describe('Compression Benchmark', () => {
   describe('Variable Leak', () => {
     const scenario = getScenario('scenario-variable-leak');
     componentEval(
+      'ChatCompressionService - debug-token',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'debug-token',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - debug-token',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'debug-token',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - debug-token',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -743,18 +818,21 @@ describe('Compression Benchmark', () => {
   describe('Verification Abandonment', () => {
     const scenario = getScenario('scenario-verification-abandonment');
     componentEval(
+      'ChatCompressionService - recall-next-discipline',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'recall-next-discipline',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - recall-next-discipline',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'recall-next-discipline',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - recall-next-discipline',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -765,18 +843,21 @@ describe('Compression Benchmark', () => {
   describe('Working Set Amnesia', () => {
     const scenario = getScenario('scenario-working-set-amnesia');
     componentEval(
+      'ChatCompressionService - recall-modified-file',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'recall-modified-file',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - recall-modified-file',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'recall-modified-file',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - recall-modified-file',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
@@ -787,18 +868,21 @@ describe('Compression Benchmark', () => {
   describe('XML Robustness', () => {
     const scenario = getScenario('scenario-xml-robustness');
     componentEval(
+      'ChatCompressionService - xml-injection',
       'ALWAYS_PASSES',
       scenario,
       'ChatCompressionService',
       'xml-injection',
     );
     componentEval(
+      'AgentHistoryProvider (Tight) - xml-injection',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Tight)',
       'xml-injection',
     );
     componentEval(
+      'AgentHistoryProvider (Generous) - xml-injection',
       'ALWAYS_PASSES',
       scenario,
       'AgentHistoryProvider (Generous)',
diff --git a/evals/data/compression/scenario-blind-guess.json.js b/evals/data/compression/scenario-blind-guess.json.js
new file mode 100644
index 0000000000..daed142cc1
--- /dev/null
+++ b/evals/data/compression/scenario-blind-guess.json.js
@@ -0,0 +1,73 @@
+{
+  "scenarioId": "scenario-blind-guess",
+  "description": "Tests if the agent attempts to \"guess\" code content from a summarized description rather than literal content. Targets the \"Shoot from the Hip\" error.",
+  "history": [
+    {
+      "role": "user",
+      "parts": [
+        {
+          "text": "Read src/auth/validator.ts."
+        }
+      ]
+    },
+    {
+      "role": "model",
+      "parts": [
+        {
+          "functionCall": {
+            "name": "read_file",
+            "args": {
+              "file_path": "src/auth/validator.ts"
+            }
+          }
+        }
+      ]
+    },
+    {
+      "role": "user",
+      "parts": [
+        {
+          "functionResponse": {
+            "name": "read_file",
+            "response": {
+              "content": "export const validate = (token: string) => {\n  if (!token) throw new Error(\"Missing token\");\n  return true;\n};"
+            }
+          }
+        }
+      ]
+    },
+    {
+      "role": "model",
+      "parts": [
+        {
+          "text": "The validator throws an error if the token is missing."
+        }
+      ]
+    },
+    ...Array.from({ length: 30 }, (_, i) => [
+      {
+        "role": "user",
+        "parts": [
+          {
+            "text": `Unrelated task ${i}`
+          }
+        ]
+      },
+      {
+        "role": "model",
+        "parts": [
+          {
+            "text": `Task ${i} done.`
+          }
+        ]
+      }
+    ]).flat()
+  ],
+  "questions": [
+    {
+      "id": "exact-error-string",
+      "prompt": "I want to change the error message in src/auth/validator.ts to \"Token required\". What is the EXACT literal \"old_string\" I should use for the replace tool? Provide the full statement including the throw new Error.",
+      "expectedSubstring": "throw new Error(\"Missing token\")"
+    }
+  ]
+}
diff --git a/evals/test-helper.ts b/evals/test-helper.ts
index cf7ef273e9..85484d9874 100644
--- a/evals/test-helper.ts
+++ b/evals/test-helper.ts
@@ -16,10 +16,17 @@ import {
   Storage,
   getProjectHash,
   SESSION_FILE_PREFIX,
+  PREVIEW_GEMINI_FLASH_MODEL,
 } from '@google/gemini-cli-core';
 
 export * from '@google/gemini-cli-test-utils';
 
+/**
+ * Default model for evals: the GEMINI_MODEL environment variable when it is
+ * set to a non-empty value, otherwise PREVIEW_GEMINI_FLASH_MODEL.
+ */
+export const EVAL_MODEL = process.env.GEMINI_MODEL || PREVIEW_GEMINI_FLASH_MODEL;
+
 // Indicates the consistency expectation for this test.
 // - ALWAYS_PASSES - Means that the test is expected to pass 100% of the time. These
 //   These tests are typically trivial and test basic functionality with unambiguous
@@ -95,7 +102,14 @@ export async function internalEvalTest(evalCase: EvalCase) {
     let isSuccess = false;
 
     try {
-      rig.setup(evalCase.name, evalCase.params);
+      const setupOptions = {
+        ...evalCase.params,
+        settings: {
+          model: { name: EVAL_MODEL },
+          ...evalCase.params?.settings,
+        },
+      };
+      rig.setup(evalCase.name, setupOptions);
 
       if (evalCase.setup) {
         await evalCase.setup(rig);
diff --git a/evals/tool_output_masking.eval.ts b/evals/tool_output_masking.eval.ts
index dff639e421..28f044c1b5 100644
--- a/evals/tool_output_masking.eval.ts
+++ b/evals/tool_output_masking.eval.ts
@@ -5,7 +5,7 @@
  */
 
 import { describe, expect } from 'vitest';
-import { evalTest } from './test-helper.js';
+import { evalTest, EVAL_MODEL } from './test-helper.js';
 import path from 'node:path';
 import fs from 'node:fs';
 import crypto from 'node:crypto';
@@ -87,7 +87,7 @@ Output too large. Full output available at: ${outputFilePath}
             id: 'msg_2',
             timestamp: new Date().toISOString(),
             type: 'gemini',
-            model: 'gemini-3-flash-preview',
+            model: EVAL_MODEL,
             toolCalls: [
               {
                 id: 'call_1',
@@ -222,7 +222,7 @@ Output too large. Full output available at: ${outputFilePath}
             id: 'msg_2',
             timestamp: new Date().toISOString(),
             type: 'gemini',
-            model: 'gemini-3-flash-preview',
+            model: EVAL_MODEL,
             toolCalls: [
               {
                 id: 'call_1',