Files
gemini-cli/evals/topic_grouping.eval.ts
Abhijit Balaji 400da42cf3 feat(core): implement linear semantic topic grouping system
Introduces a 'create_new_topic' tool to organize agent work into logical chapters in the UI.

Key changes:
- Added TopicManager singleton to track active session phase.
- Updated Scheduler to prioritize topic markers at the start of batches.
- Injected active topic context into prompt footers for model alignment.
- Removed 'Explain Before Acting' mandate in favor of semantic headers.
- Added behavioral eval to ensure model compliance with grouping.
2026-03-13 09:16:25 -07:00

86 lines
3.0 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { CREATE_NEW_TOPIC_TOOL_NAME } from '@google/gemini-cli-core';
import { evalTest, assertModelHasOutput } from './test-helper.js';
/**
 * Behavioral evals for the `create_new_topic` tool: the model should use it to
 * mark phase transitions, and a topic marker should come first in the tool log
 * when it is requested together with another tool.
 */
describe('topic_grouping', () => {
  evalTest('USUALLY_PASSES', {
    name: 'should use create_new_topic to mark major phase transitions',
    prompt:
      'I want you to fix a bug in src/utils.js. First, read the file to understand the bug, then research if there are any related tests, and finally fix it. Use create_new_topic to organize your work into logical chapters (e.g., Researching, Fixing).',
    files: {
      'src/utils.js':
        'export function add(a, b) { return a - b; } // BUG: should be +',
      'tests/utils.test.js': '// test file',
    },
    assert: async (rig, result) => {
      // Substrings that identify a research-phase or a fix-phase topic title.
      const researchKeywords = ['research', 'analyz', 'understand'];
      const fixKeywords = ['fix', 'implement', 'apply'];

      /**
       * Decodes logged tool arguments without throwing. String payloads are
       * parsed as JSON; malformed JSON yields `undefined` so the descriptive
       * assertions below report the failure instead of a SyntaxError.
       */
      const parseToolArgs = (rawArgs: unknown): unknown => {
        if (typeof rawArgs !== 'string') {
          return rawArgs;
        }
        try {
          return JSON.parse(rawArgs);
        } catch {
          return undefined;
        }
      };

      /**
       * Returns the lower-cased `title` of decoded arguments, or '' when the
       * arguments are not an object or the title is missing / not a string.
       */
      const titleOf = (args: unknown): string => {
        if (typeof args !== 'object' || args === null) {
          return '';
        }
        // Safe after the runtime object check above; the field stays `unknown`.
        const { title } = args as { title?: unknown };
        return typeof title === 'string' ? title.toLowerCase() : '';
      };

      // Expect at least two topic changes: one for research, one for fixing
      const topicCalls = rig
        .readToolLogs()
        .filter((log) => log.toolRequest.name === CREATE_NEW_TOPIC_TOOL_NAME);
      expect(
        topicCalls.length,
        `Expected at least 2 topic calls, but got ${topicCalls.length}`,
      ).toBeGreaterThanOrEqual(2);

      // Verify that the topics are distinct and descriptive
      const titles = topicCalls.map((call) => {
        const args = parseToolArgs(call.toolRequest.args);
        console.log('Topic call args:', args);
        return titleOf(args);
      });
      console.log('Observed topic titles:', titles);

      const hasResearch = titles.some((t) =>
        researchKeywords.some((keyword) => t.includes(keyword)),
      );
      const hasFix = titles.some((t) =>
        fixKeywords.some((keyword) => t.includes(keyword)),
      );
      expect(
        hasResearch,
        'Should have a topic call for research/analysis',
      ).toBe(true);
      expect(hasFix, 'Should have a topic call for fixing/implementation').toBe(
        true,
      );
      assertModelHasOutput(result);
    },
  });

  evalTest('USUALLY_PASSES', {
    name: 'should sort create_new_topic to the top of the turn',
    prompt:
      'Immediately start by creating a new topic called "Deep Research" and then list the contents of the current directory. Do both in the same turn if possible.',
    assert: async (rig, result) => {
      // In the same turn, create_new_topic should be sorted to index 0.
      // NOTE(review): only the order of the log array is inspected here, so
      // this cannot prove both calls were issued in the same batch — confirm
      // whether the tool log exposes turn boundaries.
      const toolLogs = rig.readToolLogs();
      const observedOrder = toolLogs
        .map((log) => log.toolRequest.name)
        .join(', ');

      // Since it's a simple prompt, both calls should be in the first turn.
      expect(
        toolLogs.length,
        `Expected at least 2 tool calls, but got ${toolLogs.length}`,
      ).toBeGreaterThanOrEqual(2);

      // The first tool call in the logs should be create_new_topic
      // if they were executed in the same batch.
      expect(
        toolLogs[0]?.toolRequest.name,
        `Expected ${CREATE_NEW_TOPIC_TOOL_NAME} first, observed order: ${observedOrder}`,
      ).toBe(CREATE_NEW_TOPIC_TOOL_NAME);
      assertModelHasOutput(result);
    },
  });
});