mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-02 01:11:24 -07:00
117 lines
4.1 KiB
TypeScript
117 lines
4.1 KiB
TypeScript
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
|
|
|
|
import { describe, expect } from 'vitest';
|
|
import { evalTest } from './test-helper.js';
|
|
|
|
describe('update_topic_behavior', () => {
  // Single source of truth for the tool name every assertion matches against.
  const UPDATE_TOPIC_TOOL_NAME = 'update_topic';

  /**
   * Eval for how often and where the agent calls the update_topic tool while
   * working through a multi-step task.
   *
   * The assertions below require, in order:
   *   1. at least three update_topic calls overall;
   *   2. update_topic as the very first tool call;
   *   3. the last update_topic call positioned in the final 30% of tool calls;
   *   4. at least one update_topic call between the first and the last one;
   *   5. for runs with more than 5 turns, update_topic appearing in at most
   *      50% of turns (a warning is logged above 25%; the stated goal is ~20%).
   */
  evalTest('USUALLY_PASSES', {
    name: 'update_topic should be used at start, end and middle for complex tasks',
    prompt: `Create a simple users REST API using Express.
1. Initialize a new npm project and install express.
2. Create src/app.ts as the main entry point.
3. Create src/routes/userRoutes.ts for user routes.
4. Create src/controllers/userController.ts for user logic.
5. Implement GET /users, POST /users, and GET /users/:id using an in-memory array.
6. Add a 'start' script to package.json.
7. Finally, run a quick grep to verify the routes are in src/app.ts.`,
    files: {
      'package.json': JSON.stringify(
        {
          name: 'users-api',
          version: '1.0.0',
          private: true,
        },
        null,
        2,
      ),
      '.gemini/settings.json': JSON.stringify({
        experimental: {
          topicUpdateNarration: true,
        },
      }),
    },
    assert: async (rig, result) => {
      const toolLogs = rig.readToolLogs();
      const requestedToolNames = toolLogs.map(
        ({ toolRequest }) => toolRequest.name,
      );
      const topicCalls = toolLogs.filter(
        ({ toolRequest }) => toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
      );
      const topicCallCount = topicCalls.length;

      // 1. Start + middle + end implies a minimum of three calls.
      expect(
        topicCallCount,
        `Expected at least 3 update_topic calls, but found ${topicCallCount}`,
      ).toBeGreaterThanOrEqual(3);

      // 2. The session must open with update_topic. Safe to index here: the
      //    check above guarantees the log holds at least three entries.
      const firstToolName = toolLogs[0].toolRequest.name;
      expect(firstToolName, 'First tool call should be update_topic').toBe(
        UPDATE_TOPIC_TOOL_NAME,
      );

      // 3. "Near the end" means the final update_topic call sits at or beyond
      //    70% of the way through the tool-call log.
      const endPhaseStart = toolLogs.length * 0.7;
      const lastTopicCallIndex = requestedToolNames.lastIndexOf(
        UPDATE_TOPIC_TOOL_NAME,
      );
      expect(
        lastTopicCallIndex,
        'Expected update_topic to be used near the end of the task',
      ).toBeGreaterThanOrEqual(endPhaseStart);

      // 4. Everything except the first and last update_topic call counts as a
      //    "middle" call. With three or more calls (check 1) this is always
      //    non-empty; it is kept as an explicit statement of intent.
      const middleTopicCalls = topicCalls.slice(1, -1);
      expect(
        middleTopicCalls.length,
        'Expected at least one update_topic call in the middle of the task',
      ).toBeGreaterThanOrEqual(1);

      // 5. Turn-ratio check. A "turn" is approximated by a distinct prompt_id
      //    among the logged tool requests.
      //    NOTE(review): this relies on prompt_id differing between turns; if
      //    it is constant for a whole run, totalTurns is 1 and this check is
      //    skipped — confirm against the tool-log producer.
      const allPromptIds = toolLogs
        .map(({ toolRequest }) => toolRequest.prompt_id)
        .filter((promptId) => promptId !== undefined);
      const totalTurns = new Set(allPromptIds).size;

      // Short runs naturally have a high ratio when following the
      // "start, middle, end" rule, so the ratio is only enforced past 5 turns.
      if (totalTurns <= 5) {
        return;
      }

      const topicPromptIds = topicCalls
        .map(({ toolRequest }) => toolRequest.prompt_id)
        .filter((promptId) => promptId !== undefined);
      const topicTurnCount = new Set(topicPromptIds).size;

      const ratio = topicTurnCount / totalTurns;
      const ratioPercent = (ratio * 100).toFixed(1);

      // Hard limit: update_topic may appear in at most half of the turns.
      expect(
        ratio,
        `update_topic was used in ${topicTurnCount} out of ${totalTurns} turns (${ratioPercent}%). Expected <= 50%.`,
      ).toBeLessThanOrEqual(0.5);

      // Soft limit: above 25% is tolerated but reported, since the target is
      // roughly one turn in five.
      if (ratio > 0.25) {
        console.warn(
          `[Efficiency Warning] update_topic usage is high: ${ratioPercent}% (Goal: ~20%)`,
        );
      }
    },
  });
});
|