From 117a2d384465394f7883e60977fa56772e150584 Mon Sep 17 00:00:00 2001
From: Christian Gunderman
Date: Mon, 30 Mar 2026 22:02:53 +0000
Subject: [PATCH] fix(evals): add update_topic behavioral eval (#24223)

---
 evals/update_topic.eval.ts | 116 +++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 evals/update_topic.eval.ts

diff --git a/evals/update_topic.eval.ts b/evals/update_topic.eval.ts
new file mode 100644
index 0000000000..1836e7f61b
--- /dev/null
+++ b/evals/update_topic.eval.ts
@@ -0,0 +1,116 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('update_topic_behavior', () => {
+  // Name of the tool under test, held in one constant so every assertion compares against the same value
+  const UPDATE_TOPIC_TOOL_NAME = 'update_topic';
+
+  /**
+   * Verifies the desired behavior of the update_topic tool. update_topic is used by the
+   * agent to share periodic, concise updates about what the agent is working on, independent
+   * of the regular model output and/or thoughts. This tool is expected to be called at least
+   * at the start and end of the session, and typically at least once in the middle, but in no
+   * more than 1/2 of turns (asserted below); usage above 1/4 of turns only logs a warning.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'update_topic should be used at start, end and middle for complex tasks',
+    prompt: `Create a simple users REST API using Express.
+1. Initialize a new npm project and install express.
+2. Create src/app.ts as the main entry point.
+3. Create src/routes/userRoutes.ts for user routes.
+4. Create src/controllers/userController.ts for user logic.
+5. Implement GET /users, POST /users, and GET /users/:id using an in-memory array.
+6. Add a 'start' script to package.json.
+7. Finally, run a quick grep to verify the routes are in src/app.ts.`,
+    files: {
+      'package.json': JSON.stringify(
+        {
+          name: 'users-api',
+          version: '1.0.0',
+          private: true,
+        },
+        null,
+        2,
+      ),
+      '.gemini/settings.json': JSON.stringify({
+        experimental: {
+          topicUpdateNarration: true,
+        },
+      }),
+    },
+    assert: async (rig, result) => {
+      const toolLogs = rig.readToolLogs();
+      const topicCalls = toolLogs.filter(
+        (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
+      );
+
+      // 1. Assert that update_topic is called at least 3 times (start, middle, end)
+      expect(
+        topicCalls.length,
+        `Expected at least 3 update_topic calls, but found ${topicCalls.length}`,
+      ).toBeGreaterThanOrEqual(3);
+
+      // 2. Assert update_topic is called at the very beginning (first tool call)
+      expect(
+        toolLogs[0].toolRequest.name,
+        'First tool call should be update_topic',
+      ).toBe(UPDATE_TOPIC_TOOL_NAME);
+
+      // 3. Assert update_topic is called near the end (index of its last call >= 70% of the tool-log length)
+      const lastTopicCallIndex = toolLogs
+        .map((l) => l.toolRequest.name)
+        .lastIndexOf(UPDATE_TOPIC_TOOL_NAME);
+      expect(
+        lastTopicCallIndex,
+        'Expected update_topic to be used near the end of the task',
+      ).toBeGreaterThanOrEqual(toolLogs.length * 0.7);
+
+      // 4. Assert there is at least one update_topic call in the middle (after dropping the first and last calls)
+      const middleTopicCalls = topicCalls.slice(1, -1);
+
+      expect(
+        middleTopicCalls.length,
+        'Expected at least one update_topic call in the middle of the task',
+      ).toBeGreaterThanOrEqual(1);
+
+      // 5. Turn Ratio Assertion: update_topic should be <= 1/2 of total turns (turns = distinct prompt_id values).
+      // We only enforce this for tasks that take more than 5 turns, as shorter tasks
+      // naturally have a higher ratio when following the "start, middle, end" rule.
+      const uniquePromptIds = new Set(
+        toolLogs
+          .map((l) => l.toolRequest.prompt_id)
+          .filter((id) => id !== undefined),
+      );
+      const totalTurns = uniquePromptIds.size;
+
+      if (totalTurns > 5) {
+        const topicTurns = new Set(
+          topicCalls
+            .map((l) => l.toolRequest.prompt_id)
+            .filter((id) => id !== undefined),
+        );
+        const topicTurnCount = topicTurns.size;
+
+        const ratio = topicTurnCount / totalTurns;
+
+        expect(
+          ratio,
+          `update_topic was used in ${topicTurnCount} out of ${totalTurns} turns (${(ratio * 100).toFixed(1)}%). Expected <= 50%.`,
+        ).toBeLessThanOrEqual(0.5);
+
+        // Ideal ratio is closer to 1/5 (20%). Usage above 25% is logged as a warning but does not fail the test.
+        if (ratio > 0.25) {
+          console.warn(
+            `[Efficiency Warning] update_topic usage is high: ${(ratio * 100).toFixed(1)}% (Goal: ~20%)`,
+          );
+        }
+      }
+    },
+  });
+});