2026-03-30 22:02:53 +00:00
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe , expect } from 'vitest' ;
2026-04-03 16:52:24 -07:00
import fs from 'node:fs' ;
import path from 'node:path' ;
2026-03-30 22:02:53 +00:00
import { evalTest } from './test-helper.js' ;
describe ( 'update_topic_behavior' , ( ) = > {
// Name of the tool under test, kept in a single constant so every assertion below stays in sync.
const UPDATE_TOPIC_TOOL_NAME = 'update_topic' ;
/**
 * Verifies the desired behavior of the update_topic tool on a complex, multi-step task.
 * update_topic is used by the agent to share periodic, concise updates about what the
 * agent is working on, independent of the regular model output and/or thoughts.
 *
 * Checks performed on the recorded tool logs:
 * - update_topic is called at least 3 times, and the very first tool call is update_topic;
 * - the last update_topic call sits at or beyond 70% of the way through the tool-call list;
 * - at least one update_topic call happens between the first and the last one;
 * - for sessions of more than 5 turns (distinct prompt_id values), update_topic appears in
 *   at most 1/2 of the turns (hard limit); usage above 1/4 of the turns only logs a warning.
 */
evalTest('USUALLY_PASSES', {
  name: 'update_topic should be used at start, end and middle for complex tasks',
  prompt: ` Create a simple users REST API using Express.
1. Initialize a new npm project and install express.
2. Create src/app.ts as the main entry point.
3. Create src/routes/userRoutes.ts for user routes.
4. Create src/controllers/userController.ts for user logic.
5. Implement GET /users, POST /users, and GET /users/:id using an in-memory array.
6. Add a 'start' script to package.json.
7. Finally, run a quick grep to verify the routes are in src/app.ts. `,
  files: {
    'package.json': JSON.stringify(
      {
        name: 'users-api',
        version: '1.0.0',
        private: true,
      },
      null,
      2,
    ),
    '.gemini/settings.json': JSON.stringify({
      experimental: {
        topicUpdateNarration: true,
      },
    }),
  },
  assert: async (rig) => {
    // Collects the distinct, defined prompt_id values of the given tool log entries.
    // Each distinct prompt_id is counted as one turn.
    const distinctPromptIds = (logs) =>
      new Set(
        logs
          .map((l) => l.toolRequest.prompt_id)
          .filter((id) => id !== undefined),
      );

    const toolLogs = rig.readToolLogs();
    const topicCalls = toolLogs.filter(
      (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    // 1. Assert that update_topic is called at least 3 times (start, middle, end)
    expect(
      topicCalls.length,
      ` Expected at least 3 update_topic calls, but found ${topicCalls.length} `,
    ).toBeGreaterThanOrEqual(3);

    // 2. Assert update_topic is called at the very beginning (first tool call)
    expect(
      toolLogs[0].toolRequest.name,
      'First tool call should be update_topic',
    ).toBe(UPDATE_TOPIC_TOOL_NAME);

    // 3. Assert update_topic is called near the end
    const lastTopicCallIndex = toolLogs
      .map((l) => l.toolRequest.name)
      .lastIndexOf(UPDATE_TOPIC_TOOL_NAME);
    expect(
      lastTopicCallIndex,
      'Expected update_topic to be used near the end of the task',
    ).toBeGreaterThanOrEqual(toolLogs.length * 0.7);

    // 4. Assert there is at least one update_topic call in the middle.
    // slice(1, -1) drops the first and last update_topic calls; whatever remains
    // happened between them.
    const middleTopicCalls = topicCalls.slice(1, -1);
    expect(
      middleTopicCalls.length,
      'Expected at least one update_topic call in the middle of the task',
    ).toBeGreaterThanOrEqual(1);

    // 5. Turn Ratio Assertion: update_topic should be <= 1/2 of total turns.
    // We only enforce this for tasks that take more than 5 turns, as shorter tasks
    // naturally have a higher ratio when following the "start, middle, end" rule.
    const totalTurns = distinctPromptIds(toolLogs).size;
    if (totalTurns > 5) {
      const topicTurnCount = distinctPromptIds(topicCalls).size;
      const ratio = topicTurnCount / totalTurns;
      const percent = (ratio * 100).toFixed(1);

      expect(
        ratio,
        ` update_topic was used in ${topicTurnCount} out of ${totalTurns} turns ( ${percent} %). Expected <= 50%. `,
      ).toBeLessThanOrEqual(0.5);

      // Ideal ratio is closer to 1/5 (20%). We log high usage as a warning.
      if (ratio > 0.25) {
        console.warn(
          ` [Efficiency Warning] update_topic usage is high: ${percent} % (Goal: ~20%) `,
        );
      }
    }
  },
});
2026-04-03 16:52:24 -07:00
/**
 * An explanation-only prompt: the recorded tool logs must contain no update_topic call.
 */
evalTest('USUALLY_PASSES', {
  name: 'update_topic should NOT be used for informational coding tasks (Obvious)',
  approvalMode: 'default',
  prompt:
    'Explain the difference between Map and Object in JavaScript and provide a performance-focused code snippet for each.',
  files: {
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    // Tally update_topic requests in a single pass over the tool logs.
    const topicCallCount = rig
      .readToolLogs()
      .reduce(
        (count, entry) =>
          entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME ? count + 1 : count,
        0,
      );

    expect(
      topicCallCount,
      ` Expected 0 update_topic calls for an informational task, but found ${topicCallCount} `,
    ).toBe(0);
  },
});
/**
 * A single-symbol lookup prompt against a one-file fixture: the recorded tool logs
 * must contain no update_topic call.
 */
evalTest('USUALLY_PASSES', {
  name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',
  approvalMode: 'default',
  prompt:
    "Find the file where the 'UPDATE_TOPIC_TOOL_NAME' constant is defined.",
  files: {
    'packages/core/src/tools/tool-names.ts':
      "export const UPDATE_TOPIC_TOOL_NAME = 'update_topic';",
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    // Count every logged tool request whose name is the update_topic tool.
    let topicCallCount = 0;
    for (const entry of rig.readToolLogs()) {
      if (entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME) {
        topicCallCount += 1;
      }
    }

    expect(
      topicCallCount,
      ` Expected 0 update_topic calls for a surgical symbol search, but found ${topicCallCount} `,
    ).toBe(0);
  },
});
/**
 * A medium-complexity refactor (move routing out of src/app.ts into src/routes.ts).
 * Expects at least two update_topic calls and checks that src/routes.ts exists in the
 * test directory afterwards, so a run that fails immediately does not pass.
 */
evalTest('USUALLY_PASSES', {
  name: 'update_topic should be used for medium complexity multi-step tasks',
  prompt:
    'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
  files: {
    'package.json': JSON.stringify(
      {
        name: 'users-api',
        version: '1.0.0',
      },
      null,
      2,
    ),
    'src/app.ts': `
import express from 'express';
const app = express();
app.get('/users', (req, res) => {
res.json([{id: 1, name: 'Alice'}]);
});
app.post('/users', (req, res) => {
res.status(201).send();
});
export default app;
`,
    '.gemini/settings.json': JSON.stringify({
      experimental: {
        topicUpdateNarration: true,
      },
    }),
  },
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();
    const topicCalls = toolLogs.filter(
      (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    // This is a multi-step task (read, create new file, edit old file).
    // It should clear the bar and use update_topic at least at the start and end.
    expect(
      topicCalls.length,
      `Expected at least 2 update_topic calls, but found ${topicCalls.length}`,
    ).toBeGreaterThanOrEqual(2);

    // Verify it actually did the refactoring to ensure it didn't just fail immediately
    const routesPath = path.join(rig.testDir, 'src/routes.ts');
    expect(
      fs.existsSync(routesPath),
      `Expected the refactor to create ${routesPath}`,
    ).toBe(true);
  },
});
2026-04-06 18:36:22 +00:00
/**
 * Regression test for update_topic being invoked several times in a row. This was
 * observed with earlier versions of the update_topic system instruction, prior to
 * https://github.com/google-gemini/gemini-cli/pull/24640. The prompt below is a case
 * where the behavior could still occur; the test validates the prompt change that
 * improves it by failing whenever two adjacent tool log entries are both update_topic.
 */
evalTest('USUALLY_PASSES', {
  name: 'update_topic should not be called twice in a row',
  prompt: `
We need to build a C compiler.
Before you write any code, you must formally declare your strategy.
First, declare that you will build a Lexer.
Then, immediately realize that is wrong and declare that you will actually build a Parser instead.
Finally, create 'parser.c'.
`,
  files: {
    'package.json': JSON.stringify({ name: 'test-project' }),
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();
    const isTopicCall = (entry) =>
      entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME;

    // Locate the first entry that, together with its predecessor, forms a
    // back-to-back pair of update_topic calls (-1 when there is none).
    const repeatIndex = toolLogs.findIndex(
      (entry, index) =>
        index > 0 && isTopicCall(toolLogs[index - 1]) && isTopicCall(entry),
    );

    if (repeatIndex !== -1) {
      throw new Error(
        ` Detected back-to-back ${UPDATE_TOPIC_TOOL_NAME} calls at index ${repeatIndex - 1} and ${repeatIndex} `,
      );
    }
  },
});
2026-03-30 22:02:53 +00:00
} ) ;