From b4053dc8dc7ae7643d06c6a34b87d97db124e403 Mon Sep 17 00:00:00 2001 From: Christian Gunderman Date: Tue, 7 Apr 2026 08:44:25 -0700 Subject: [PATCH] Use 3 flash and add another scenario. --- evals/app-test-helper.ts | 4 +- evals/compression.eval.ts | 92 ++++++++++++++++++- .../compression/scenario-blind-guess.json.js | 73 +++++++++++++++ evals/test-helper.ts | 16 +++- evals/tool_output_masking.eval.ts | 6 +- 5 files changed, 181 insertions(+), 10 deletions(-) create mode 100644 evals/data/compression/scenario-blind-guess.json.js diff --git a/evals/app-test-helper.ts b/evals/app-test-helper.ts index 017063a3f7..ce54adaaf4 100644 --- a/evals/app-test-helper.ts +++ b/evals/app-test-helper.ts @@ -13,10 +13,10 @@ import { withEvalRetries, prepareWorkspace, BaseEvalCase, + EVAL_MODEL, } from './test-helper.js'; import fs from 'node:fs'; import path from 'node:path'; -import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core'; /** * Config overrides for evals, with tool-restriction fields explicitly @@ -51,7 +51,7 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) { await withEvalRetries(evalCase.name, async () => { const rig = new AppRig({ configOverrides: { - model: DEFAULT_GEMINI_MODEL, + model: EVAL_MODEL, ...evalCase.configOverrides, }, }); diff --git a/evals/compression.eval.ts b/evals/compression.eval.ts index 589991cfb7..0929089a7e 100644 --- a/evals/compression.eval.ts +++ b/evals/compression.eval.ts @@ -10,13 +10,12 @@ import { AgentHistoryProvider, ChatCompressionService, GeminiChat, - DEFAULT_GEMINI_MODEL, } from '@google/gemini-cli-core'; import type { Content } from '@google/genai'; import fs from 'node:fs'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; -import { type EvalPolicy } from './test-helper.js'; +import { type EvalPolicy, EVAL_MODEL } from './test-helper.js'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); @@ -117,6 +116,7 @@ const configs: Record = { }; function componentEval( + name: string, policy: EvalPolicy, scenario: Scenario, configName: keyof typeof configs, @@ -131,8 +131,8 @@ function componentEval( } componentEvalTest(policy, { - name: `Compression | ${scenario.scenarioId} | ${compConfig.name} | ${question.id}`, - configOverrides: { model: DEFAULT_GEMINI_MODEL }, + name, + configOverrides: { model: EVAL_MODEL }, assert: async (config) => { const originalTokens = ( @@ -205,18 +205,21 @@ describe('Compression Benchmark', () => { describe('Blind Guessing', () => { const scenario = getScenario('scenario-blind-guess'); componentEval( + 'ChatCompressionService - exact-error-string', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'exact-error-string', ); componentEval( + 'AgentHistoryProvider (Tight) - exact-error-string', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'exact-error-string', ); componentEval( + 'AgentHistoryProvider (Generous) - exact-error-string', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -227,18 +230,21 @@ describe('Compression Benchmark', () => { describe('Conditional Instructions', () => { const scenario = getScenario('scenario-conditional-instruction'); componentEval( + 'ChatCompressionService - error-handler', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'error-handler', ); componentEval( + 'AgentHistoryProvider (Tight) - error-handler', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'error-handler', ); componentEval( + 'AgentHistoryProvider (Generous) - error-handler', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -249,36 +255,42 @@ describe('Compression Benchmark', () => { describe('Constraints', () => { const scenario = getScenario('scenario-constraints'); componentEval( + 'ChatCompressionService - variable-naming', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'variable-naming', ); componentEval( + 'AgentHistoryProvider (Tight) - variable-naming', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'variable-naming', ); componentEval( + 'AgentHistoryProvider (Generous) - variable-naming', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', 'variable-naming', ); componentEval( + 'ChatCompressionService - deployment-target', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'deployment-target', ); componentEval( + 'AgentHistoryProvider (Tight) - deployment-target', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'deployment-target', ); componentEval( + 'AgentHistoryProvider (Generous) - deployment-target', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -289,18 +301,21 @@ describe('Compression Benchmark', () => { describe('Context Amnesia', () => { const scenario = getScenario('scenario-context-amnesia'); componentEval( + 'ChatCompressionService - recall-file-content', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'recall-file-content', ); componentEval( + 'AgentHistoryProvider (Tight) - recall-file-content', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'recall-file-content', ); componentEval( + 'AgentHistoryProvider (Generous) - recall-file-content', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -311,18 +326,21 @@ describe('Compression Benchmark', () => { describe('Context Thrashing', () => { const scenario = getScenario('scenario-context-thrashing'); componentEval( + 'ChatCompressionService - failed-edit-reason', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'failed-edit-reason', ); componentEval( + 'AgentHistoryProvider (Tight) - failed-edit-reason', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'failed-edit-reason', ); componentEval( + 'AgentHistoryProvider (Generous) - failed-edit-reason', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -333,18 +351,21 @@ describe('Compression Benchmark', () => { describe('Dependency Chain', () => { const scenario = getScenario('scenario-dependency-chain'); componentEval( + 'ChatCompressionService - migration-dependency', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'migration-dependency', ); componentEval( + 'AgentHistoryProvider (Tight) - migration-dependency', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'migration-dependency', ); componentEval( + 'AgentHistoryProvider (Generous) - migration-dependency', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -355,18 +376,21 @@ describe('Compression Benchmark', () => { describe('Massive Working Set', () => { const scenario = getScenario('scenario-massive-working-set'); componentEval( + 'ChatCompressionService - file-25-presence', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'file-25-presence', ); componentEval( + 'AgentHistoryProvider (Tight) - file-25-presence', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'file-25-presence', ); componentEval( + 'AgentHistoryProvider (Generous) - file-25-presence', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -377,18 +401,21 @@ describe('Compression Benchmark', () => { describe('Middle Reasoning', () => { const scenario = getScenario('scenario-middle-reasoning'); componentEval( + 'ChatCompressionService - conflict-pid', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'conflict-pid', ); componentEval( + 'AgentHistoryProvider (Tight) - conflict-pid', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'conflict-pid', ); componentEval( + 'AgentHistoryProvider (Generous) - conflict-pid', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -399,36 +426,42 @@ describe('Compression Benchmark', () => { describe('Milestones', () => { const scenario = getScenario('scenario-milestones'); componentEval( + 'ChatCompressionService - tax-function-name', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'tax-function-name', ); componentEval( + 'AgentHistoryProvider (Tight) - tax-function-name', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'tax-function-name', ); componentEval( + 'AgentHistoryProvider (Generous) - tax-function-name', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', 'tax-function-name', ); componentEval( + 'ChatCompressionService - milestone-name', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'milestone-name', ); componentEval( + 'AgentHistoryProvider (Tight) - milestone-name', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'milestone-name', ); componentEval( + 'AgentHistoryProvider (Generous) - milestone-name', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -439,18 +472,21 @@ describe('Compression Benchmark', () => { describe('Multi-Bug Tracking', () => { const scenario = getScenario('scenario-multi-bug-tracking'); componentEval( + 'ChatCompressionService - bug-list', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'bug-list', ); componentEval( + 'AgentHistoryProvider (Tight) - bug-list', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'bug-list', ); componentEval( + 'AgentHistoryProvider (Generous) - bug-list', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -461,18 +497,21 @@ describe('Compression Benchmark', () => { describe('Myopic Keyhole', () => { const scenario = getScenario('scenario-myopic-keyhole'); componentEval( + 'ChatCompressionService - macro-context-inheritance', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'macro-context-inheritance', ); componentEval( + 'AgentHistoryProvider (Tight) - macro-context-inheritance', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'macro-context-inheritance', ); componentEval( + 'AgentHistoryProvider (Generous) - macro-context-inheritance', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -483,18 +522,21 @@ describe('Compression Benchmark', () => { describe('Negative Constraints', () => { const scenario = getScenario('scenario-negative-constraints'); componentEval( + 'ChatCompressionService - import-constraint', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'import-constraint', ); componentEval( + 'AgentHistoryProvider (Tight) - import-constraint', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'import-constraint', ); componentEval( + 'AgentHistoryProvider (Generous) - import-constraint', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -505,18 +547,21 @@ describe('Compression Benchmark', () => { describe('Nested Logic', () => { const scenario = getScenario('scenario-nested-logic'); componentEval( + 'ChatCompressionService - fallback-mode', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'fallback-mode', ); componentEval( + 'AgentHistoryProvider (Tight) - fallback-mode', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'fallback-mode', ); componentEval( + 'AgentHistoryProvider (Generous) - fallback-mode', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -527,18 +572,21 @@ describe('Compression Benchmark', () => { describe('Replace Loop', () => { const scenario = getScenario('scenario-replace-loop'); componentEval( + 'ChatCompressionService - recall-exact-line', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'recall-exact-line', ); componentEval( + 'AgentHistoryProvider (Tight) - recall-exact-line', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'recall-exact-line', ); componentEval( + 'AgentHistoryProvider (Generous) - recall-exact-line', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -549,18 +597,21 @@ describe('Compression Benchmark', () => { describe('Spatial Scattering', () => { const scenario = getScenario('scenario-spatial-scattering'); componentEval( + 'ChatCompressionService - recall-target-directory', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'recall-target-directory', ); componentEval( + 'AgentHistoryProvider (Tight) - recall-target-directory', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'recall-target-directory', ); componentEval( + 'AgentHistoryProvider (Generous) - recall-target-directory', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -571,18 +622,21 @@ describe('Compression Benchmark', () => { describe('State Tracking', () => { const scenario = getScenario('scenario-state-tracking'); componentEval( + 'ChatCompressionService - next-step', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'next-step', ); componentEval( + 'AgentHistoryProvider (Tight) - next-step', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'next-step', ); componentEval( + 'AgentHistoryProvider (Generous) - next-step', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -593,18 +647,21 @@ describe('Compression Benchmark', () => { describe('Strategy Abandonment', () => { const scenario = getScenario('scenario-strategy-abandonment'); componentEval( + 'ChatCompressionService - recall-next-action', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'recall-next-action', ); componentEval( + 'AgentHistoryProvider (Tight) - recall-next-action', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'recall-next-action', ); componentEval( + 'AgentHistoryProvider (Generous) - recall-next-action', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -615,18 +672,21 @@ describe('Compression Benchmark', () => { describe('Subtle Preference', () => { const scenario = getScenario('scenario-subtle-preference'); componentEval( + 'ChatCompressionService - commit-prefix', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'commit-prefix', ); componentEval( + 'AgentHistoryProvider (Tight) - commit-prefix', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'commit-prefix', ); componentEval( + 'AgentHistoryProvider (Generous) - commit-prefix', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -637,18 +697,21 @@ describe('Compression Benchmark', () => { describe('Surgical Imprecision', () => { const scenario = getScenario('scenario-surgical-imprecision'); componentEval( + 'ChatCompressionService - surgical-edit-check', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'surgical-edit-check', ); componentEval( + 'AgentHistoryProvider (Tight) - surgical-edit-check', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'surgical-edit-check', ); componentEval( + 'AgentHistoryProvider (Generous) - surgical-edit-check', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -659,18 +722,21 @@ describe('Compression Benchmark', () => { describe('Symbol Location Amnesia', () => { const scenario = getScenario('scenario-symbol-location-amnesia'); componentEval( + 'ChatCompressionService - recall-symbol-file', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'recall-symbol-file', ); componentEval( + 'AgentHistoryProvider (Tight) - recall-symbol-file', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'recall-symbol-file', ); componentEval( + 'AgentHistoryProvider (Generous) - recall-symbol-file', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -681,36 +747,42 @@ describe('Compression Benchmark', () => { describe('Tool Noise', () => { const scenario = getScenario('scenario-tool-noise'); componentEval( + 'ChatCompressionService - fatal-error', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'fatal-error', ); componentEval( + 'AgentHistoryProvider (Tight) - fatal-error', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'fatal-error', ); componentEval( + 'AgentHistoryProvider (Generous) - fatal-error', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', 'fatal-error', ); componentEval( + 'ChatCompressionService - success-token', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'success-token', ); componentEval( + 'AgentHistoryProvider (Tight) - success-token', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'success-token', ); componentEval( + 'AgentHistoryProvider (Generous) - success-token', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -721,18 +793,21 @@ describe('Compression Benchmark', () => { describe('Variable Leak', () => { const scenario = getScenario('scenario-variable-leak'); componentEval( + 'ChatCompressionService - debug-token', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'debug-token', ); componentEval( + 'AgentHistoryProvider (Tight) - debug-token', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'debug-token', ); componentEval( + 'AgentHistoryProvider (Generous) - debug-token', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -743,18 +818,21 @@ describe('Compression Benchmark', () => { describe('Verification Abandonment', () => { const scenario = getScenario('scenario-verification-abandonment'); componentEval( + 'ChatCompressionService - recall-next-discipline', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'recall-next-discipline', ); componentEval( + 'AgentHistoryProvider (Tight) - recall-next-discipline', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'recall-next-discipline', ); componentEval( + 'AgentHistoryProvider (Generous) - recall-next-discipline', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -765,18 +843,21 @@ describe('Compression Benchmark', () => { describe('Working Set Amnesia', () => { const scenario = getScenario('scenario-working-set-amnesia'); componentEval( + 'ChatCompressionService - recall-modified-file', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'recall-modified-file', ); componentEval( + 'AgentHistoryProvider (Tight) - recall-modified-file', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'recall-modified-file', ); componentEval( + 'AgentHistoryProvider (Generous) - recall-modified-file', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', @@ -787,18 +868,21 @@ describe('Compression Benchmark', () => { describe('XML Robustness', () => { const scenario = getScenario('scenario-xml-robustness'); componentEval( + 'ChatCompressionService - xml-injection', 'ALWAYS_PASSES', scenario, 'ChatCompressionService', 'xml-injection', ); componentEval( + 'AgentHistoryProvider (Tight) - xml-injection', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Tight)', 'xml-injection', ); componentEval( + 'AgentHistoryProvider (Generous) - xml-injection', 'ALWAYS_PASSES', scenario, 'AgentHistoryProvider (Generous)', diff --git a/evals/data/compression/scenario-blind-guess.json.js b/evals/data/compression/scenario-blind-guess.json.js new file mode 100644 index 0000000000..daed142cc1 --- /dev/null +++ b/evals/data/compression/scenario-blind-guess.json.js @@ -0,0 +1,73 @@ +{ + "scenarioId": "scenario-blind-guess", + "description": "Tests if the agent attempts to \"guess\" code content from a summarized description rather than literal content. Targets the \"Shoot from the Hip\" error.", + "history": [ + { + "role": "user", + "parts": [ + { + "text": "Read src/auth/validator.ts." + } + ] + }, + { + "role": "model", + "parts": [ + { + "functionCall": { + "name": "read_file", + "args": { + "file_path": "src/auth/validator.ts" + } + } + } + ] + }, + { + "role": "user", + "parts": [ + { + "functionResponse": { + "name": "read_file", + "response": { + "content": "export const validate = (token: string) => {\n if (!token) throw new Error(\"Missing token\");\n return true;\n};" + } + } + } + ] + }, + { + "role": "model", + "parts": [ + { + "text": "The validator throws an error if the token is missing." + } + ] + }, + ...Array.from({ length: 30 }, (_, i) => [ + { + "role": "user", + "parts": [ + { + "text": `Unrelated task ${i}` + } + ] + }, + { + "role": "model", + "parts": [ + { + "text": `Task ${i} done.` + } + ] + } + ]).flat() + ], + "questions": [ + { + "id": "exact-error-string", + "prompt": "I want to change the error message in src/auth/validator.ts to \"Token required\". What is the EXACT literal \"old_string\" I should use for the replace tool? Provide the full statement including the throw new Error.", + "expectedSubstring": "throw new Error(\"Missing token\")" + } + ] +} diff --git a/evals/test-helper.ts b/evals/test-helper.ts index cf7ef273e9..85484d9874 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -16,10 +16,17 @@ import { Storage, getProjectHash, SESSION_FILE_PREFIX, + PREVIEW_GEMINI_FLASH_MODEL, } from '@google/gemini-cli-core'; export * from '@google/gemini-cli-test-utils'; +/** + * The default model used for all evaluations. + * Can be overridden by setting the GEMINI_MODEL environment variable. + */ +export const EVAL_MODEL = process.env.GEMINI_MODEL || PREVIEW_GEMINI_FLASH_MODEL; + // Indicates the consistency expectation for this test. // - ALWAYS_PASSES - Means that the test is expected to pass 100% of the time. These // These tests are typically trivial and test basic functionality with unambiguous @@ -95,7 +102,14 @@ export async function internalEvalTest(evalCase: EvalCase) { let isSuccess = false; try { - rig.setup(evalCase.name, evalCase.params); + const setupOptions = { + ...evalCase.params, + settings: { + model: { name: EVAL_MODEL }, + ...evalCase.params?.settings, + }, + }; + rig.setup(evalCase.name, setupOptions); if (evalCase.setup) { await evalCase.setup(rig); diff --git a/evals/tool_output_masking.eval.ts b/evals/tool_output_masking.eval.ts index dff639e421..28f044c1b5 100644 --- a/evals/tool_output_masking.eval.ts +++ b/evals/tool_output_masking.eval.ts @@ -5,7 +5,7 @@ */ import { describe, expect } from 'vitest'; -import { evalTest } from './test-helper.js'; +import { evalTest, EVAL_MODEL } from './test-helper.js'; import path from 'node:path'; import fs from 'node:fs'; import crypto from 'node:crypto'; @@ -87,7 +87,7 @@ Output too large. Full output available at: ${outputFilePath} id: 'msg_2', timestamp: new Date().toISOString(), type: 'gemini', - model: 'gemini-3-flash-preview', + model: EVAL_MODEL, toolCalls: [ { id: 'call_1', @@ -222,7 +222,7 @@ Output too large. Full output available at: ${outputFilePath} id: 'msg_2', timestamp: new Date().toISOString(), type: 'gemini', - model: 'gemini-3-flash-preview', + model: EVAL_MODEL, toolCalls: [ { id: 'call_1',