gemini-cli/evals/topic_grouping.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { CREATE_NEW_TOPIC_TOOL_NAME } from '@google/gemini-cli-core';
import { evalTest, assertModelHasOutput } from './test-helper.js';

describe('topic_grouping', () => {
  evalTest('USUALLY_PASSES', {
    name: 'should use create_new_topic to mark major phase transitions',
    prompt:
      'I want you to fix a bug in src/utils.js. First, read the file to understand the bug, then research if there are any related tests, and finally fix it. Use create_new_topic to organize your work into logical chapters (e.g., Researching, Fixing).',
    files: {
      'src/utils.js':
        'export function add(a, b) { return a - b; } // BUG: should be +',
      'tests/utils.test.js': '// test file',
    },
    assert: async (rig, result) => {
      // Expect at least two topic changes: one for research, one for fixing
      const toolLogs = rig.readToolLogs();
      const topicCalls = toolLogs.filter(
        (log) => log.toolRequest.name === CREATE_NEW_TOPIC_TOOL_NAME,
      );

      expect(
        topicCalls.length,
        `Expected at least 2 topic calls, but got ${topicCalls.length}`,
      ).toBeGreaterThanOrEqual(2);

      // Verify that the topics are distinct and descriptive
      const titles = topicCalls.map((call) => {
        const args = JSON.parse(call.toolRequest.args);
        console.log('Topic call args:', args);
        const title = args.title || '';
        return title.toLowerCase();
      });

      console.log('Observed topic titles:', titles);

      const hasResearch = titles.some(
        (t) =>
          t.includes('research') ||
          t.includes('analyz') ||
          t.includes('understand'),
      );
      const hasFix = titles.some(
        (t) =>
          t.includes('fix') || t.includes('implement') || t.includes('apply'),
      );

      expect(
        hasResearch,
        'Should have a topic call for research/analysis',
      ).toBe(true);
      expect(hasFix, 'Should have a topic call for fixing/implementation').toBe(
        true,
      );

      assertModelHasOutput(result);
    },
  });

  evalTest('USUALLY_PASSES', {
    name: 'should sort create_new_topic to the top of the turn',
    prompt:
      'Immediately start by creating a new topic called "Deep Research" and then list the contents of the current directory. Do both in the same turn if possible.',
    assert: async (rig, result) => {
      // In the same turn, create_new_topic should be sorted to index 0
      const toolLogs = rig.readToolLogs();

      // We look for a turn where both were called
      // Since it's a simple prompt, they should both be in the first turn
      expect(toolLogs.length).toBeGreaterThanOrEqual(2);

      // The first tool call in the logs should be create_new_topic
      // if they were executed in the same batch.
      expect(toolLogs[0].toolRequest.name).toBe(CREATE_NEW_TOPIC_TOOL_NAME);

      assertModelHasOutput(result);
    },
  });
});