mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-08 04:10:35 -07:00
feat(core): discourage update topic tool for simple tasks (#24640)
Co-authored-by: Samee Zahid <sameez@google.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -5,6 +5,8 @@
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { evalTest } from './test-helper.js';
|
||||
|
||||
describe('update_topic_behavior', () => {
|
||||
@@ -113,4 +115,104 @@ describe('update_topic_behavior', () => {
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
// Eval case: a purely informational request (an explanation plus code
// snippets). With topic narration enabled in the workspace settings, the run
// is expected to log zero calls to the update_topic tool.
evalTest('USUALLY_PASSES', {
  name: 'update_topic should NOT be used for informational coding tasks (Obvious)',
  approvalMode: 'default',
  prompt:
    'Explain the difference between Map and Object in JavaScript and provide a performance-focused code snippet for each.',
  files: {
    // Workspace settings file that switches on experimental.topicUpdateNarration.
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  async assert(rig) {
    // Count the logged tool requests whose name is the update_topic tool.
    const topicCallCount = rig
      .readToolLogs()
      .filter(({ toolRequest }) => toolRequest.name === UPDATE_TOPIC_TOOL_NAME)
      .length;

    expect(
      topicCallCount,
      `Expected 0 update_topic calls for an informational task, but found ${topicCallCount}`,
    ).toBe(0);
  },
});
|
||||
|
||||
// Eval case: a single targeted lookup ("where is this constant defined?")
// against a workspace seeded with the file that defines it. The run is
// expected to log zero calls to the update_topic tool.
evalTest('USUALLY_PASSES', {
  name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',
  approvalMode: 'default',
  prompt:
    "Find the file where the 'UPDATE_TOPIC_TOOL_NAME' constant is defined.",
  files: {
    // The file the prompt asks the agent to locate.
    'packages/core/src/tools/tool-names.ts':
      "export const UPDATE_TOPIC_TOOL_NAME = 'update_topic';",
    // Workspace settings file that switches on experimental.topicUpdateNarration.
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  async assert(rig) {
    // Tally the logged tool requests that target the update_topic tool.
    const topicCallCount = rig
      .readToolLogs()
      .reduce(
        (count, entry) =>
          entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME ? count + 1 : count,
        0,
      );

    expect(
      topicCallCount,
      `Expected 0 update_topic calls for a surgical symbol search, but found ${topicCallCount}`,
    ).toBe(0);
  },
});
|
||||
|
||||
// Eval case: a refactor prompt (move the routing logic out of src/app.ts into
// a new src/routes.ts) run against a seeded `users-api` workspace. The
// assertion requires at least two logged update_topic tool calls and that
// src/routes.ts exists under rig.testDir afterwards.
// NOTE(review): 'USUALLY_PASSES' is presumably a pass-rate policy understood
// by evalTest — confirm in ./test-helper.js.
evalTest('USUALLY_PASSES', {
|
||||
name: 'update_topic should be used for medium complexity multi-step tasks',
|
||||
prompt:
|
||||
'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
|
||||
// Seed files for the workspace: a minimal package.json, the app.ts to be
// refactored, and settings that enable experimental.topicUpdateNarration.
files: {
|
||||
'package.json': JSON.stringify(
|
||||
{
|
||||
name: 'users-api',
|
||||
version: '1.0.0',
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
'src/app.ts': `
|
||||
import express from 'express';
|
||||
const app = express();
|
||||
|
||||
app.get('/users', (req, res) => {
|
||||
res.json([{id: 1, name: 'Alice'}]);
|
||||
});
|
||||
|
||||
app.post('/users', (req, res) => {
|
||||
res.status(201).send();
|
||||
});
|
||||
|
||||
export default app;
|
||||
`,
|
||||
'.gemini/settings.json': JSON.stringify({
|
||||
experimental: {
|
||||
topicUpdateNarration: true,
|
||||
},
|
||||
}),
|
||||
},
|
||||
// Inspects the rig's tool logs and the test directory after the run.
assert: async (rig) => {
|
||||
const toolLogs = rig.readToolLogs();
|
||||
// Keep only the log entries whose requested tool is update_topic.
const topicCalls = toolLogs.filter(
|
||||
(l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
|
||||
);
|
||||
|
||||
// This is a multi-step task (read, create new file, edit old file).
|
||||
// It should clear the bar and use update_topic at least at the start and end.
|
||||
expect(topicCalls.length).toBeGreaterThanOrEqual(2);
|
||||
|
||||
// Verify it actually did the refactoring to ensure it didn't just fail immediately
|
||||
expect(fs.existsSync(path.join(rig.testDir, 'src/routes.ts'))).toBe(true);
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
@@ -518,10 +518,10 @@ function mandateTopicUpdateModel(): string {
|
||||
## Topic Updates
|
||||
As you work, the user follows along by reading topic updates that you publish with ${UPDATE_TOPIC_TOOL_NAME}. Keep them informed by doing the following:
|
||||
|
||||
- Always call ${UPDATE_TOPIC_TOOL_NAME} in your first and last turn. The final turn should always recap what was done.
|
||||
- Usage Exception: NEVER use ${UPDATE_TOPIC_TOOL_NAME} for answering questions, providing explanations, or performing isolated lookup tasks (e.g. reading a single file, running a quick search, or checking a version). It is STRICTLY for orchestrating multi-step codebase modifications or complex investigations involving 3 or more tool calls.\n- Always call ${UPDATE_TOPIC_TOOL_NAME} in your first and last turn for tasks that require 3 or more tool calls. The final turn should always recap what was done.
|
||||
- Each topic update should give a concise description of what you are doing for the next few turns in the \`${TOPIC_PARAM_SUMMARY}\` parameter.
|
||||
- Provide topic updates whenever you change "topics". A topic is typically a discrete subgoal and will be every 3 to 10 turns. Do not use ${UPDATE_TOPIC_TOOL_NAME} on every turn.
|
||||
- The typical user message should call ${UPDATE_TOPIC_TOOL_NAME} 3 or more times. Each corresponds to a distinct phase of the task, such as "Researching X", "Researching Y", "Implementing Z with X", and "Testing Z".
|
||||
- The typical complex user message should call ${UPDATE_TOPIC_TOOL_NAME} 3 or more times. Each corresponds to a distinct phase of the task, such as "Researching X", "Researching Y", "Implementing Z with X", and "Testing Z".
|
||||
- Remember to call ${UPDATE_TOPIC_TOOL_NAME} when you experience an unexpected event (e.g., a test failure, compilation error, environment issue, or unexpected learning) that requires a strategic detour.
|
||||
- **Examples:**
|
||||
- ${UPDATE_TOPIC_TOOL_NAME}(${TOPIC_PARAM_TITLE}="Researching Parser", ${TOPIC_PARAM_SUMMARY}="I am starting an investigation into the parser timeout bug. My goal is to first understand the current test coverage and then attempt to reproduce the failure. This phase will focus on identifying the bottleneck in the main loop before we move to implementation.")
|
||||
|
||||
@@ -629,10 +629,10 @@ function mandateTopicUpdateModel(): string {
|
||||
## Topic Updates
|
||||
As you work, the user follows along by reading topic updates that you publish with ${UPDATE_TOPIC_TOOL_NAME}. Keep them informed by doing the following:
|
||||
|
||||
- Always call ${UPDATE_TOPIC_TOOL_NAME} in your first and last turn. The final turn should always recap what was done.
|
||||
- Usage Exception: NEVER use ${UPDATE_TOPIC_TOOL_NAME} for answering questions, providing explanations, or performing isolated lookup tasks (e.g. reading a single file, running a quick search, or checking a version). It is STRICTLY for orchestrating multi-step codebase modifications or complex investigations involving 3 or more tool calls.\n- Always call ${UPDATE_TOPIC_TOOL_NAME} in your first and last turn for tasks that require 3 or more tool calls. The final turn should always recap what was done.
|
||||
- Each topic update should give a concise description of what you are doing for the next few turns in the \`${TOPIC_PARAM_SUMMARY}\` parameter.
|
||||
- Provide topic updates whenever you change "topics". A topic is typically a discrete subgoal and will be every 3 to 10 turns. Do not use ${UPDATE_TOPIC_TOOL_NAME} on every turn.
|
||||
- The typical user message should call ${UPDATE_TOPIC_TOOL_NAME} 3 or more times. Each corresponds to a distinct phase of the task, such as "Researching X", "Researching Y", "Implementing Z with X", and "Testing Z".
|
||||
- The typical complex user message should call ${UPDATE_TOPIC_TOOL_NAME} 3 or more times. Each corresponds to a distinct phase of the task, such as "Researching X", "Researching Y", "Implementing Z with X", and "Testing Z".
|
||||
- Remember to call ${UPDATE_TOPIC_TOOL_NAME} when you experience an unexpected event (e.g., a test failure, compilation error, environment issue, or unexpected learning) that requires a strategic detour.
|
||||
- **Examples:**
|
||||
- \`update_topic(${TOPIC_PARAM_TITLE}="Researching Parser", ${TOPIC_PARAM_SUMMARY}="I am starting an investigation into the parser timeout bug. My goal is to first understand the current test coverage and then attempt to reproduce the failure. This phase will focus on identifying the bottleneck in the main loop before we move to implementation.")\`
|
||||
|
||||
Reference in New Issue
Block a user