2026-03-30 22:02:53 +00:00
/ * *
* @license
* Copyright 2026 Google LLC
* SPDX - License - Identifier : Apache - 2.0
* /
import { describe , expect } from 'vitest' ;
2026-04-03 16:52:24 -07:00
import fs from 'node:fs' ;
import path from 'node:path' ;
2026-03-30 22:02:53 +00:00
import { evalTest } from './test-helper.js' ;
describe ( 'update_topic_behavior' , ( ) = > {
// Name of the tool under test, kept in a single constant so every assertion compares against the same string.
const UPDATE_TOPIC_TOOL_NAME = 'update_topic' ;
/**
 * Verifies the desired behavior of the update_topic tool. update_topic is used by the
 * agent to share periodic, concise updates about what it is working on, independent
 * of the regular model output and/or thoughts.
 *
 * Checks made against the recorded tool log:
 *  1. update_topic is called at least 3 times.
 *  2. The first tool call is update_topic.
 *  3. The index of the last update_topic call is at least 70% of the log length.
 *  4. At least one update_topic call lies between the first and the last one.
 *  5. When more than 5 distinct prompt_ids ("turns") are present, update_topic may
 *     appear in at most 1/2 of them; above 1/4 a warning is logged but the test
 *     still passes.
 */
evalTest('USUALLY_PASSES', {
  name: 'update_topic should be used at start, end and middle for complex tasks',
  // Assembled line by line so that paths and routes carry no stray whitespace.
  prompt: [
    'Create a simple users REST API using Express.',
    '1. Initialize a new npm project and install express.',
    '2. Create src/app.ts as the main entry point.',
    '3. Create src/routes/userRoutes.ts for user routes.',
    '4. Create src/controllers/userController.ts for user logic.',
    '5. Implement GET /users, POST /users, and GET /users/:id using an in-memory array.',
    "6. Add a 'start' script to package.json.",
    '7. Finally, run a quick grep to verify the routes are in src/app.ts.',
  ].join('\n'),
  files: {
    'package.json': JSON.stringify(
      {
        name: 'users-api',
        version: '1.0.0',
        private: true,
      },
      null,
      2,
    ),
    '.gemini/settings.json': JSON.stringify({
      experimental: {
        topicUpdateNarration: true,
      },
    }),
  },
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();
    const topicCalls = toolLogs.filter(
      (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    // Distinct prompt_id values seen in a list of tool-log entries; entries
    // without a prompt_id are ignored.
    const distinctPromptIds = (logs) =>
      new Set(
        logs
          .map((l) => l.toolRequest.prompt_id)
          .filter((id) => id !== undefined),
      );

    // 1. Assert that update_topic is called at least 3 times (start, middle, end)
    expect(
      topicCalls.length,
      ` Expected at least 3 update_topic calls, but found ${topicCalls.length} `,
    ).toBeGreaterThanOrEqual(3);

    // 2. Assert update_topic is called at the very beginning (first tool call).
    // Check 1 guarantees the log is non-empty by the time we get here.
    expect(
      toolLogs[0].toolRequest.name,
      'First tool call should be update_topic',
    ).toBe(UPDATE_TOPIC_TOOL_NAME);

    // 3. Assert update_topic is called near the end
    const lastTopicCallIndex = toolLogs
      .map((l) => l.toolRequest.name)
      .lastIndexOf(UPDATE_TOPIC_TOOL_NAME);
    expect(
      lastTopicCallIndex,
      'Expected update_topic to be used near the end of the task',
    ).toBeGreaterThanOrEqual(toolLogs.length * 0.7);

    // 4. Assert there is at least one update_topic call in the middle (between start and end phases).
    // NOTE(review): slice(1, -1) is non-empty exactly when there are >= 3 topic calls,
    // so this restates check 1; it is kept for its more specific failure message.
    const middleTopicCalls = topicCalls.slice(1, -1);
    expect(
      middleTopicCalls.length,
      'Expected at least one update_topic call in the middle of the task',
    ).toBeGreaterThanOrEqual(1);

    // 5. Turn Ratio Assertion: update_topic should be <= 1/2 of total turns.
    // We only enforce this for tasks that take more than 5 turns, as shorter tasks
    // naturally have a higher ratio when following the "start, middle, end" rule.
    const totalTurns = distinctPromptIds(toolLogs).size;
    if (totalTurns > 5) {
      const topicTurnCount = distinctPromptIds(topicCalls).size;
      const ratio = topicTurnCount / totalTurns;
      expect(
        ratio,
        ` update_topic was used in ${topicTurnCount} out of ${totalTurns} turns ( ${(ratio * 100).toFixed(1)} %). Expected <= 50%. `,
      ).toBeLessThanOrEqual(0.5);

      // Ideal ratio is closer to 1/5 (20%). We log high usage as a warning.
      if (ratio > 0.25) {
        console.warn(
          ` [Efficiency Warning] update_topic usage is high: ${(ratio * 100).toFixed(1)} % (Goal: ~20%) `,
        );
      }
    }
  },
});
2026-04-03 16:52:24 -07:00
// Negative case: for a purely explanatory request the tool log must contain
// no update_topic calls at all.
evalTest('USUALLY_PASSES', {
  name: 'update_topic should NOT be used for informational coding tasks (Obvious)',
  approvalMode: 'default',
  prompt:
    'Explain the difference between Map and Object in JavaScript and provide a performance-focused code snippet for each.',
  files: {
    // Enables the experimental topicUpdateNarration setting for this run.
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    // Count the update_topic entries straight off the recorded tool log.
    const topicCallCount = rig
      .readToolLogs()
      .filter(({ toolRequest }) => toolRequest.name === UPDATE_TOPIC_TOOL_NAME)
      .length;

    expect(
      topicCallCount,
      ` Expected 0 update_topic calls for an informational task, but found ${topicCallCount} `,
    ).toBe(0);
  },
});
// Grey-area negative case: locating a single constant definition is expected
// to finish without any update_topic call.
evalTest('USUALLY_PASSES', {
  name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',
  approvalMode: 'default',
  prompt:
    "Find the file where the 'UPDATE_TOPIC_TOOL_NAME' constant is defined.",
  files: {
    // The one file that holds the definition the prompt asks about.
    'packages/core/src/tools/tool-names.ts':
      "export const UPDATE_TOPIC_TOOL_NAME = 'update_topic';",
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    const isTopicUpdate = (entry) =>
      entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME;
    const recordedCalls = rig.readToolLogs();
    const topicUpdates = recordedCalls.filter(isTopicUpdate);

    expect(
      topicUpdates.length,
      ` Expected 0 update_topic calls for a surgical symbol search, but found ${topicUpdates.length} `,
    ).toBe(0);
  },
});
// Positive case for a medium-sized refactor: the agent is expected to call
// update_topic at least twice and to actually produce src/routes.ts.
evalTest('USUALLY_PASSES', {
  name: 'update_topic should be used for medium complexity multi-step tasks',
  prompt:
    'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
  files: {
    'package.json': JSON.stringify(
      {
        name: 'users-api',
        version: '1.0.0',
      },
      null,
      2,
    ),
    // Seed file the agent has to refactor. Assembled line by line so the
    // fixture is valid code (arrow tokens intact); the leading and trailing
    // empty entries keep the surrounding newlines.
    'src/app.ts': [
      '',
      "import express from 'express';",
      'const app = express();',
      "app.get('/users', (req, res) => {",
      "  res.json([{ id: 1, name: 'Alice' }]);",
      '});',
      "app.post('/users', (req, res) => {",
      '  res.status(201).send();',
      '});',
      'export default app;',
      '',
    ].join('\n'),
    '.gemini/settings.json': JSON.stringify({
      experimental: {
        topicUpdateNarration: true,
      },
    }),
  },
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();
    const topicCalls = toolLogs.filter(
      (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    // This is a multi-step task (read, create new file, edit old file).
    // It should clear the bar and use update_topic at least at the start and end.
    expect(
      topicCalls.length,
      `Expected at least 2 update_topic calls (start and end), but found ${topicCalls.length}`,
    ).toBeGreaterThanOrEqual(2);

    // Verify it actually did the refactoring to ensure it didn't just fail immediately
    expect(
      fs.existsSync(path.join(rig.testDir, 'src/routes.ts')),
      'Expected the refactor to create src/routes.ts in the test directory',
    ).toBe(true);
  },
});
2026-04-06 18:36:22 +00:00
/**
 * Regression test: update_topic must never appear in two consecutive tool-log
 * entries. Repeated calls were observed with earlier versions of the
 * update_topic system instruction, before
 * https://github.com/google-gemini/gemini-cli/pull/24640. The prompt below
 * asks the agent to announce a strategy and then immediately revise it, a
 * scenario in which the repetition can still occur, so the test validates
 * the prompt change that improves the behavior.
 */
evalTest('USUALLY_PASSES', {
  name: 'update_topic should not be called twice in a row',
  prompt: `
We need to build a C compiler .
Before you write any code , you must formally declare your strategy .
First , declare that you will build a Lexer .
Then , immediately realize that is wrong and declare that you will actually build a Parser instead .
Finally , create 'parser.c' .
`,
  files: {
    'package.json': JSON.stringify({ name: 'test-project' }),
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    const entries = rig.readToolLogs();

    // Position of the second entry of the first adjacent update_topic pair,
    // or -1 when no two neighbouring entries are both update_topic.
    const repeatAt = entries.findIndex(
      (entry, pos) =>
        pos > 0 &&
        entries[pos - 1].toolRequest.name === UPDATE_TOPIC_TOOL_NAME &&
        entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    if (repeatAt !== -1) {
      throw new Error(
        ` Detected back-to-back ${UPDATE_TOPIC_TOOL_NAME} calls at index ${repeatAt - 1} and ${repeatAt} `,
      );
    }
  },
});
2026-03-30 22:02:53 +00:00
} ) ;