diff --git a/evals/update_topic.eval.ts b/evals/update_topic.eval.ts index ce895d5ad7..8a6f3f75ac 100644 --- a/evals/update_topic.eval.ts +++ b/evals/update_topic.eval.ts @@ -215,4 +215,47 @@ export default app; expect(fs.existsSync(path.join(rig.testDir, 'src/routes.ts'))).toBe(true); }, }); + + /** + * Regression test for a bug where update_topic was called multiple times in a + * row. We have seen cases of this occurring in earlier versions of the update_topic + * system instruction, prior to https://github.com/google-gemini/gemini-cli/pull/24640. + * This test demonstrates that there are cases where it can still occur and validates + * the prompt change that improves the behavior. + */ + evalTest('USUALLY_PASSES', { + name: 'update_topic should not be called twice in a row', + prompt: ` + We need to build a C compiler. + + Before you write any code, you must formally declare your strategy. + First, declare that you will build a Lexer. + Then, immediately realize that is wrong and declare that you will actually build a Parser instead. + + Finally, create 'parser.c'. 
+ `, + files: { + 'package.json': JSON.stringify({ name: 'test-project' }), + '.gemini/settings.json': JSON.stringify({ + experimental: { + topicUpdateNarration: true, + }, + }), + }, + assert: async (rig) => { + const toolLogs = rig.readToolLogs(); + + // Check for back-to-back update_topic calls + for (let i = 1; i < toolLogs.length; i++) { + if ( + toolLogs[i - 1].toolRequest.name === UPDATE_TOPIC_TOOL_NAME && + toolLogs[i].toolRequest.name === UPDATE_TOPIC_TOOL_NAME + ) { + throw new Error( + `Detected back-to-back ${UPDATE_TOPIC_TOOL_NAME} calls at index ${i - 1} and ${i}`, + ); + } + } + }, + }); }); diff --git a/packages/core/src/prompts/snippets.legacy.ts b/packages/core/src/prompts/snippets.legacy.ts index 16734b9e08..d80d096049 100644 --- a/packages/core/src/prompts/snippets.legacy.ts +++ b/packages/core/src/prompts/snippets.legacy.ts @@ -518,7 +518,9 @@ function mandateTopicUpdateModel(): string { ## Topic Updates As you work, the user follows along by reading topic updates that you publish with ${UPDATE_TOPIC_TOOL_NAME}. Keep them informed by doing the following: -- Usage Exception: NEVER use ${UPDATE_TOPIC_TOOL_NAME} for answering questions, providing explanations, or performing isolated lookup tasks (e.g. reading a single file, running a quick search, or checking a version). It is STRICTLY for orchestrating multi-step codebase modifications or complex investigations involving 3 or more tool calls.\n- Always call ${UPDATE_TOPIC_TOOL_NAME} in your first and last turn for tasks that require 3 or more tool calls. The final turn should always recap what was done. +- Usage Exception: NEVER use ${UPDATE_TOPIC_TOOL_NAME} for answering questions, providing explanations, or performing isolated lookup tasks (e.g. reading a single file, running a quick search, or checking a version). It is STRICTLY for orchestrating multi-step codebase modifications or complex investigations involving 3 or more tool calls. 
+- Always call ${UPDATE_TOPIC_TOOL_NAME} in your first turn. +- For tasks taking multiple turns, also call ${UPDATE_TOPIC_TOOL_NAME} in your last turn to recap what was done. - Each topic update should give a concise description of what you are doing for the next few turns in the \`${TOPIC_PARAM_SUMMARY}\` parameter. - Provide topic updates whenever you change "topics". A topic is typically a discrete subgoal and will be every 3 to 10 turns. Do not use ${UPDATE_TOPIC_TOOL_NAME} on every turn. - The typical complex user message should call ${UPDATE_TOPIC_TOOL_NAME} 3 or more times. Each corresponds to a distinct phase of the task, such as "Researching X", "Researching Y", "Implementing Z with X", and "Testing Z". diff --git a/packages/core/src/prompts/snippets.ts b/packages/core/src/prompts/snippets.ts index b26cba3b44..7f6aa2ee30 100644 --- a/packages/core/src/prompts/snippets.ts +++ b/packages/core/src/prompts/snippets.ts @@ -629,7 +629,9 @@ function mandateTopicUpdateModel(): string { ## Topic Updates As you work, the user follows along by reading topic updates that you publish with ${UPDATE_TOPIC_TOOL_NAME}. Keep them informed by doing the following: -- Usage Exception: NEVER use ${UPDATE_TOPIC_TOOL_NAME} for answering questions, providing explanations, or performing isolated lookup tasks (e.g. reading a single file, running a quick search, or checking a version). It is STRICTLY for orchestrating multi-step codebase modifications or complex investigations involving 3 or more tool calls.\n- Always call ${UPDATE_TOPIC_TOOL_NAME} in your first and last turn for tasks that require 3 or more tool calls. The final turn should always recap what was done. +- Usage Exception: NEVER use ${UPDATE_TOPIC_TOOL_NAME} for answering questions, providing explanations, or performing isolated lookup tasks (e.g. reading a single file, running a quick search, or checking a version). 
It is STRICTLY for orchestrating multi-step codebase modifications or complex investigations involving 3 or more tool calls. +- Always call ${UPDATE_TOPIC_TOOL_NAME} in your first turn. +- For tasks taking multiple turns, also call ${UPDATE_TOPIC_TOOL_NAME} in your last turn to recap what was done. - Each topic update should give a concise description of what you are doing for the next few turns in the \`${TOPIC_PARAM_SUMMARY}\` parameter. - Provide topic updates whenever you change "topics". A topic is typically a discrete subgoal and will be every 3 to 10 turns. Do not use ${UPDATE_TOPIC_TOOL_NAME} on every turn. - The typical complex user message should call ${UPDATE_TOPIC_TOOL_NAME} 3 or more times. Each corresponds to a distinct phase of the task, such as "Researching X", "Researching Y", "Implementing Z with X", and "Testing Z".