feat(core): implement linear semantic topic grouping system

Introduces a 'create_new_topic' tool to organize agent work into logical chapters in the UI. Key changes: - Added TopicManager singleton to track active session phase. - Updated Scheduler to prioritize topic markers at the start of batches. - Injected active topic context into prompt footers for model alignment. - Removed 'Explain Before Acting' mandate in favor of semantic headers. - Added behavioral eval to ensure model compliance with grouping.
2026-04-22 19:14:33 -07:00 · 2026-03-13 09:16:25 -07:00
parent f368e80baf
commit 400da42cf3
19 changed files with 480 additions and 44 deletions
@@ -0,0 +1,85 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { CREATE_NEW_TOPIC_TOOL_NAME } from '@google/gemini-cli-core';
+import { evalTest, assertModelHasOutput } from './test-helper.js';
+
+describe('topic_grouping', () => {
+  evalTest('USUALLY_PASSES', {
+    name: 'should use create_new_topic to mark major phase transitions',
+    prompt:
+      'I want you to fix a bug in src/utils.js. First, read the file to understand the bug, then research if there are any related tests, and finally fix it. Use create_new_topic to organize your work into logical chapters (e.g., Researching, Fixing).',
+    files: {
+      'src/utils.js':
+        'export function add(a, b) { return a - b; } // BUG: should be +',
+      'tests/utils.test.js': '// test file',
+    },
+    assert: async (rig, result) => {
+      // Expect at least two topic changes: one for research, one for fixing
+      const toolLogs = rig.readToolLogs();
+      const topicCalls = toolLogs.filter(
+        (log) => log.toolRequest.name === CREATE_NEW_TOPIC_TOOL_NAME,
+      );
+
+      expect(
+        topicCalls.length,
+        `Expected at least 2 topic calls, but got ${topicCalls.length}`,
+      ).toBeGreaterThanOrEqual(2);
+
+      // Verify that the topics are distinct and descriptive
+      const titles = topicCalls.map((call) => {
+        const args = JSON.parse(call.toolRequest.args);
+        console.log('Topic call args:', args);
+        const title = args.title || '';
+        return title.toLowerCase();
+      });
+
+      console.log('Observed topic titles:', titles);
+
+      const hasResearch = titles.some(
+        (t) =>
+          t.includes('research') ||
+          t.includes('analyz') ||
+          t.includes('understand'),
+      );
+      const hasFix = titles.some(
+        (t) =>
+          t.includes('fix') || t.includes('implement') || t.includes('apply'),
+      );
+
+      expect(
+        hasResearch,
+        'Should have a topic call for research/analysis',
+      ).toBe(true);
+      expect(hasFix, 'Should have a topic call for fixing/implementation').toBe(
+        true,
+      );
+
+      assertModelHasOutput(result);
+    },
+  });
+
+  evalTest('USUALLY_PASSES', {
+    name: 'should sort create_new_topic to the top of the turn',
+    prompt:
+      'Immediately start by creating a new topic called "Deep Research" and then list the contents of the current directory. Do both in the same turn if possible.',
+    assert: async (rig, result) => {
+      // In the same turn, create_new_topic should be sorted to index 0
+      const toolLogs = rig.readToolLogs();
+
+      // We look for a turn where both were called
+      // Since it's a simple prompt, they should both be in the first turn
+      expect(toolLogs.length).toBeGreaterThanOrEqual(2);
+
+      // The first tool call in the logs should be create_new_topic
+      // if they were executed in the same batch.
+      expect(toolLogs[0].toolRequest.name).toBe(CREATE_NEW_TOPIC_TOOL_NAME);
+
+      assertModelHasOutput(result);
+    },
+  });
+});