mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-07 03:40:36 -07:00
262 lines
8.7 KiB
TypeScript
262 lines
8.7 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2026 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
import { describe, expect } from 'vitest';
|
|
import fs from 'node:fs';
|
|
import path from 'node:path';
|
|
import { evalTest } from './test-helper.js';
|
|
|
|
describe('update_topic_behavior', () => {
|
|
// Name of the tool under test, kept in a single constant so every assertion below stays in sync.
|
|
const UPDATE_TOPIC_TOOL_NAME = 'update_topic';
|
|
|
|
/**
 * Verifies the desired cadence of the update_topic tool on a long, multi-step task.
 * update_topic is used by the agent to share periodic, concise updates about what it
 * is working on, independent of the regular model output and/or thoughts.
 *
 * Checks performed, in order:
 *   1. update_topic is called at least 3 times (start, middle, end).
 *   2. The very first logged tool call is update_topic.
 *   3. The last update_topic call sits at or beyond 70% of the way through the tool log.
 *   4. At least one update_topic call lies between the first and the last one.
 *   5. For sessions of more than 5 turns, update_topic appears in at most 1/2 of the
 *      turns (hard failure). Usage above 1/4 of the turns only logs a warning; the
 *      stated goal is roughly 1/5. A "turn" is a distinct `prompt_id` among the
 *      logged tool calls.
 */
evalTest('USUALLY_PASSES', {
  name: 'update_topic should be used at start, end and middle for complex tasks',
  prompt: `Create a simple users REST API using Express.
1. Initialize a new npm project and install express.
2. Create src/app.ts as the main entry point.
3. Create src/routes/userRoutes.ts for user routes.
4. Create src/controllers/userController.ts for user logic.
5. Implement GET /users, POST /users, and GET /users/:id using an in-memory array.
6. Add a 'start' script to package.json.
7. Finally, run a quick grep to verify the routes are in src/app.ts.`,
  files: {
    'package.json': JSON.stringify(
      {
        name: 'users-api',
        version: '1.0.0',
        private: true,
      },
      null,
      2,
    ),
    // Enables the experimental topic-update narration feature for this run.
    '.gemini/settings.json': JSON.stringify({
      experimental: {
        topicUpdateNarration: true,
      },
    }),
  },
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();
    const topicCalls = toolLogs.filter(
      (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    // 1. Assert that update_topic is called at least 3 times (start, middle, end).
    // This also guarantees toolLogs is non-empty before toolLogs[0] is read below.
    expect(
      topicCalls.length,
      `Expected at least 3 update_topic calls, but found ${topicCalls.length}`,
    ).toBeGreaterThanOrEqual(3);

    // 2. Assert update_topic is called at the very beginning (first tool call)
    expect(
      toolLogs[0].toolRequest.name,
      'First tool call should be update_topic',
    ).toBe(UPDATE_TOPIC_TOOL_NAME);

    // 3. Assert update_topic is called near the end: the index of the last
    // update_topic call must be at least 70% of the total number of tool calls.
    const lastTopicCallIndex = toolLogs
      .map((l) => l.toolRequest.name)
      .lastIndexOf(UPDATE_TOPIC_TOOL_NAME);
    expect(
      lastTopicCallIndex,
      'Expected update_topic to be used near the end of the task',
    ).toBeGreaterThanOrEqual(toolLogs.length * 0.7);

    // 4. Assert there is at least one update_topic call in the middle, i.e. one
    // that is neither the first nor the last update_topic call.
    // NOTE(review): this is implied by assertion 1 (>= 3 calls); it is kept as an
    // explicit statement of intent rather than as an independent check.
    const middleTopicCalls = topicCalls.slice(1, -1);

    expect(
      middleTopicCalls.length,
      'Expected at least one update_topic call in the middle of the task',
    ).toBeGreaterThanOrEqual(1);

    // 5. Turn Ratio Assertion: update_topic should be <= 1/2 of total turns.
    // We only enforce this for tasks that take more than 5 turns, as shorter tasks
    // naturally have a higher ratio when following the "start, middle, end" rule.
    // Turns are counted as distinct prompt_id values; entries without one are ignored.
    const uniquePromptIds = new Set(
      toolLogs
        .map((l) => l.toolRequest.prompt_id)
        .filter((id) => id !== undefined),
    );
    const totalTurns = uniquePromptIds.size;

    if (totalTurns > 5) {
      const topicTurns = new Set(
        topicCalls
          .map((l) => l.toolRequest.prompt_id)
          .filter((id) => id !== undefined),
      );
      const topicTurnCount = topicTurns.size;

      const ratio = topicTurnCount / totalTurns;

      expect(
        ratio,
        `update_topic was used in ${topicTurnCount} out of ${totalTurns} turns (${(ratio * 100).toFixed(1)}%). Expected <= 50%.`,
      ).toBeLessThanOrEqual(0.5);

      // Ideal ratio is closer to 1/5 (20%). Anything above 1/4 is reported as a
      // warning only; it does not fail the test.
      if (ratio > 0.25) {
        console.warn(
          `[Efficiency Warning] update_topic usage is high: ${(ratio * 100).toFixed(1)}% (Goal: ~20%)`,
        );
      }
    }
  },
});
|
|
|
|
// Negative case: a purely informational question should be answered without any
// update_topic call, even with topic-update narration switched on.
evalTest('USUALLY_PASSES', {
  name: 'update_topic should NOT be used for informational coding tasks (Obvious)',
  approvalMode: 'default',
  prompt:
    'Explain the difference between Map and Object in JavaScript and provide a performance-focused code snippet for each.',
  files: {
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  async assert(rig) {
    // Count the logged tool calls whose requested tool is update_topic.
    const topicCallCount = rig
      .readToolLogs()
      .filter(
        ({ toolRequest }) => toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
      ).length;

    expect(
      topicCallCount,
      `Expected 0 update_topic calls for an informational task, but found ${topicCallCount}`,
    ).toBe(0);
  },
});
|
|
|
|
// Negative case (grey area): locating a single symbol in a one-file workspace
// should not trigger any update_topic call.
evalTest('USUALLY_PASSES', {
  name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',
  approvalMode: 'default',
  prompt:
    "Find the file where the 'UPDATE_TOPIC_TOOL_NAME' constant is defined.",
  files: {
    // The only source file in the workspace; it holds the symbol being searched for.
    'packages/core/src/tools/tool-names.ts':
      "export const UPDATE_TOPIC_TOOL_NAME = 'update_topic';",
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  async assert(rig) {
    // Tally update_topic requests across the whole tool log.
    const topicCallCount = rig
      .readToolLogs()
      .reduce(
        (count, entry) =>
          entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME ? count + 1 : count,
        0,
      );

    expect(
      topicCallCount,
      `Expected 0 update_topic calls for a surgical symbol search, but found ${topicCallCount}`,
    ).toBe(0);
  },
});
|
|
|
|
// Positive case: a small refactor (read, create a new file, edit the old one) is
// expected to use update_topic at least twice and to actually produce src/routes.ts.
evalTest('USUALLY_PASSES', {
  name: 'update_topic should be used for medium complexity multi-step tasks',
  prompt:
    'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
  files: {
    'package.json': JSON.stringify(
      { name: 'users-api', version: '1.0.0' },
      null,
      2,
    ),
    'src/app.ts': `
import express from 'express';
const app = express();

app.get('/users', (req, res) => {
res.json([{id: 1, name: 'Alice'}]);
});

app.post('/users', (req, res) => {
res.status(201).send();
});

export default app;
`,
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  async assert(rig) {
    const topicCallCount = rig
      .readToolLogs()
      .filter(
        ({ toolRequest }) => toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
      ).length;

    // A multi-step task should clear the bar for topic updates: at least one at
    // the start and one at the end.
    expect(topicCallCount).toBeGreaterThanOrEqual(2);

    // Guard against a run that failed immediately: the refactor must have
    // created the new routes file inside the test directory.
    const routesFile = path.join(rig.testDir, 'src/routes.ts');
    expect(fs.existsSync(routesFile)).toBe(true);
  },
});
|
|
|
|
/**
 * Regression test: update_topic must not be invoked twice in a row.
 * Consecutive calls were observed with earlier versions of the update_topic system
 * instruction, prior to https://github.com/google-gemini/gemini-cli/pull/24640.
 * The prompt below showed that the behavior can still occur, and the test validates
 * the prompt change that improves it.
 */
evalTest('USUALLY_PASSES', {
  name: 'update_topic should not be called twice in a row',
  prompt: `
We need to build a C compiler.

Before you write any code, you must formally declare your strategy.
First, declare that you will build a Lexer.
Then, immediately realize that is wrong and declare that you will actually build a Parser instead.

Finally, create 'parser.c'.
`,
  files: {
    'package.json': JSON.stringify({ name: 'test-project' }),
    '.gemini/settings.json': JSON.stringify({
      experimental: { topicUpdateNarration: true },
    }),
  },
  assert: async (rig) => {
    const toolLogs = rig.readToolLogs();

    // Locate the first entry that, together with its predecessor, forms a pair of
    // adjacent update_topic calls. Index 0 has no predecessor and is skipped.
    const repeatedAt = toolLogs.findIndex(
      (entry, position) =>
        position > 0 &&
        toolLogs[position - 1].toolRequest.name === UPDATE_TOPIC_TOOL_NAME &&
        entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
    );

    if (repeatedAt !== -1) {
      throw new Error(
        `Detected back-to-back ${UPDATE_TOPIC_TOOL_NAME} calls at index ${repeatedAt - 1} and ${repeatedAt}`,
      );
    }
  },
});
|
|
});
|