evals/update_topic.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { evalTest } from './test-helper.js';

describe('update_topic_behavior', () => {
  // Name of the tool under test, defined once so every assertion below compares against the same value.
  const UPDATE_TOPIC_TOOL_NAME = 'update_topic';

  /**
   * Verifies the desired behavior of the update_topic tool on a complex, multi-step task.
   * update_topic is used by the agent to share periodic, concise updates about what the
   * agent is working on, independent of the regular model output and/or thoughts.
   *
   * Checks performed on the recorded tool logs:
   *  1. update_topic is called at least 3 times (start, middle, end).
   *  2. The very first tool call is update_topic.
   *  3. The last update_topic call sits in the final 30% of the tool calls.
   *  4. At least one update_topic call lies between the first and the last one.
   *  5. For runs with more than 5 turns (distinct prompt ids), update_topic appears in at
   *     most 1/2 of the turns. This is the hard limit; usage above 1/4 only logs a
   *     warning, the stated goal being roughly 1/5.
   */
  evalTest('USUALLY_PASSES', {
    name: 'update_topic should be used at start, end and middle for complex tasks',
    prompt: `Create a simple users REST API using Express. 
1. Initialize a new npm project and install express.
2. Create src/app.ts as the main entry point.
3. Create src/routes/userRoutes.ts for user routes.
4. Create src/controllers/userController.ts for user logic.
5. Implement GET /users, POST /users, and GET /users/:id using an in-memory array.
6. Add a 'start' script to package.json.
7. Finally, run a quick grep to verify the routes are in src/app.ts.`,
    files: {
      'package.json': JSON.stringify(
        {
          name: 'users-api',
          version: '1.0.0',
          private: true,
        },
        null,
        2,
      ),
      '.gemini/settings.json': JSON.stringify({
        experimental: {
          topicUpdateNarration: true,
        },
      }),
    },
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();
      const topicCalls = toolLogs.filter(
        (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
      );

      // 1. Assert that update_topic is called at least 3 times (start, middle, end).
      // This also guarantees toolLogs is non-empty for the index access in check 2.
      expect(
        topicCalls.length,
        `Expected at least 3 update_topic calls, but found ${topicCalls.length}`,
      ).toBeGreaterThanOrEqual(3);

      // 2. Assert update_topic is called at the very beginning (first tool call)
      expect(
        toolLogs[0].toolRequest.name,
        'First tool call should be update_topic',
      ).toBe(UPDATE_TOPIC_TOOL_NAME);

      // 3. Assert update_topic is called near the end: the last call must sit in
      // the final 30% of all recorded tool calls.
      const lastTopicCallIndex = toolLogs
        .map((l) => l.toolRequest.name)
        .lastIndexOf(UPDATE_TOPIC_TOOL_NAME);
      expect(
        lastTopicCallIndex,
        'Expected update_topic to be used near the end of the task',
      ).toBeGreaterThanOrEqual(toolLogs.length * 0.7);

      // 4. Assert there is at least one update_topic call in the middle (between start and end phases).
      // Dropping the first and last topic call leaves the "middle" ones. Once check 1
      // has passed this always holds; it is kept as an explicit statement of intent.
      const middleTopicCalls = topicCalls.slice(1, -1);

      expect(
        middleTopicCalls.length,
        'Expected at least one update_topic call in the middle of the task',
      ).toBeGreaterThanOrEqual(1);

      // 5. Turn Ratio Assertion: update_topic should be <= 1/2 of total turns.
      // A "turn" is a distinct prompt_id seen in the tool logs.
      // We only enforce this for tasks that take more than 5 turns, as shorter tasks
      // naturally have a higher ratio when following the "start, middle, end" rule.
      const uniquePromptIds = new Set(
        toolLogs
          .map((l) => l.toolRequest.prompt_id)
          .filter((id) => id !== undefined),
      );
      const totalTurns = uniquePromptIds.size;

      if (totalTurns > 5) {
        const topicTurns = new Set(
          topicCalls
            .map((l) => l.toolRequest.prompt_id)
            .filter((id) => id !== undefined),
        );
        const topicTurnCount = topicTurns.size;

        const ratio = topicTurnCount / totalTurns;
        // Formatted once and reused by the failure message and the warning.
        const ratioPercent = (ratio * 100).toFixed(1);

        expect(
          ratio,
          `update_topic was used in ${topicTurnCount} out of ${totalTurns} turns (${ratioPercent}%). Expected <= 50%.`,
        ).toBeLessThanOrEqual(0.5);

        // Ideal ratio is closer to 1/5 (20%). Anything above 1/4 is logged as a
        // warning only; it does not fail the eval.
        if (ratio > 0.25) {
          console.warn(
            `[Efficiency Warning] update_topic usage is high: ${ratioPercent}% (Goal: ~20%)`,
          );
        }
      }
    },
  });

  // Informational prompt (explain + code snippet): the eval expects the agent
  // to answer without issuing a single update_topic call.
  evalTest('USUALLY_PASSES', {
    name: 'update_topic should NOT be used for informational coding tasks (Obvious)',
    approvalMode: 'default',
    prompt:
      'Explain the difference between Map and Object in JavaScript and provide a performance-focused code snippet for each.',
    files: {
      '.gemini/settings.json': JSON.stringify({
        experimental: { topicUpdateNarration: true },
      }),
    },
    assert: async (rig) => {
      // Count the update_topic entries among all recorded tool calls.
      const topicCallCount = rig
        .readToolLogs()
        .filter(
          (entry) => entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
        ).length;

      expect(
        topicCallCount,
        `Expected 0 update_topic calls for an informational task, but found ${topicCallCount}`,
      ).toBe(0);
    },
  });

  // Single-symbol lookup in a one-file fixture: the eval expects the search to
  // finish without any update_topic call.
  evalTest('USUALLY_PASSES', {
    name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',
    approvalMode: 'default',
    prompt:
      "Find the file where the 'UPDATE_TOPIC_TOOL_NAME' constant is defined.",
    files: {
      // The only source file in the workspace; it holds the constant being searched for.
      'packages/core/src/tools/tool-names.ts':
        "export const UPDATE_TOPIC_TOOL_NAME = 'update_topic';",
      '.gemini/settings.json': JSON.stringify({
        experimental: { topicUpdateNarration: true },
      }),
    },
    assert: async (rig) => {
      // Tally update_topic entries while walking the tool log once.
      const topicCallCount = rig
        .readToolLogs()
        .reduce(
          (count, entry) =>
            entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME
              ? count + 1
              : count,
          0,
        );

      expect(
        topicCallCount,
        `Expected 0 update_topic calls for a surgical symbol search, but found ${topicCallCount}`,
      ).toBe(0);
    },
  });

  // Medium-complexity refactor: the agent has to split routing out of src/app.ts
  // into src/routes.ts. The eval expects update_topic to be used for this kind of
  // multi-step work and checks that the new file was actually created.
  evalTest('USUALLY_PASSES', {
    name: 'update_topic should be used for medium complexity multi-step tasks',
    prompt:
      'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
    files: {
      'package.json': JSON.stringify(
        {
          name: 'users-api',
          version: '1.0.0',
        },
        null,
        2,
      ),
      'src/app.ts': `
import express from 'express';
const app = express();

app.get('/users', (req, res) => {
  res.json([{id: 1, name: 'Alice'}]);
});

app.post('/users', (req, res) => {
  res.status(201).send();
});

export default app;
      `,
      '.gemini/settings.json': JSON.stringify({
        experimental: {
          topicUpdateNarration: true,
        },
      }),
    },
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();
      const topicCalls = toolLogs.filter(
        (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
      );

      // This is a multi-step task (read, create new file, edit old file).
      // It should clear the bar and use update_topic at least at the start and end.
      expect(
        topicCalls.length,
        `Expected at least 2 update_topic calls for a multi-step refactor, but found ${topicCalls.length}`,
      ).toBeGreaterThanOrEqual(2);

      // Verify it actually did the refactoring to ensure it didn't just fail immediately
      expect(
        fs.existsSync(path.join(rig.testDir, 'src/routes.ts')),
        'Expected the refactor to create src/routes.ts in the test directory',
      ).toBe(true);
    },
  });

  /**
   * Regression test guarding against consecutive update_topic calls.
   *
   * Earlier versions of the update_topic system instruction (before
   * https://github.com/google-gemini/gemini-cli/pull/24640) produced such
   * repeats. The prompt below shows they can still be provoked, and the test
   * validates the prompt change that improves the behavior.
   */
  evalTest('USUALLY_PASSES', {
    name: 'update_topic should not be called twice in a row',
    prompt: `
      We need to build a C compiler.

      Before you write any code, you must formally declare your strategy.
      First, declare that you will build a Lexer.
      Then, immediately realize that is wrong and declare that you will actually build a Parser instead.

      Finally, create 'parser.c'.
    `,
    files: {
      'package.json': JSON.stringify({ name: 'test-project' }),
      '.gemini/settings.json': JSON.stringify({
        experimental: { topicUpdateNarration: true },
      }),
    },
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();

      // Locate the second entry of the earliest adjacent update_topic pair.
      // findIndex stops at the first match, so the earliest pair is reported.
      const repeatIndex = toolLogs.findIndex(
        (entry, idx) =>
          idx > 0 &&
          toolLogs[idx - 1].toolRequest.name === UPDATE_TOPIC_TOOL_NAME &&
          entry.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
      );

      if (repeatIndex !== -1) {
        throw new Error(
          `Detected back-to-back ${UPDATE_TOPIC_TOOL_NAME} calls at index ${repeatIndex - 1} and ${repeatIndex}`,
        );
      }
    },
  });
});
fix(evals): add update_topic behavioral eval (#24223 ) 2026-03-30 22:02:53 +00:00			`/**`
			`* @license`
			`* Copyright 2026 Google LLC`
			`* SPDX-License-Identifier: Apache-2.0`
			`*/`

			`import { describe, expect } from 'vitest';`
feat(core): discourage update topic tool for simple tasks (#24640 ) 2026-04-03 16:52:24 -07:00			`import fs from 'node:fs';`
			`import path from 'node:path';`
fix(evals): add update_topic behavioral eval (#24223 ) 2026-03-30 22:02:53 +00:00			`import { evalTest } from './test-helper.js';`

			`describe('update_topic_behavior', () => {`
			`// Constants for tool names and params for robustness`
			`const UPDATE_TOPIC_TOOL_NAME = 'update_topic';`

			`/**`
			`* Verifies the desired behavior of the update_topic tool. update_topic is used by the`
			`* agent to share periodic, concise updates about what the agent is working on, independent`
			`* of the regular model output and/or thoughts. This tool is expected to be called at least`
			`* at the start and end of the session, and typically at least once in the middle, but no`
			`* more than 1/4 turns.`
			`*/`
			`evalTest('USUALLY_PASSES', {`
			`name: 'update_topic should be used at start, end and middle for complex tasks',`
			prompt: `Create a simple users REST API using Express.
			`1. Initialize a new npm project and install express.`
			`2. Create src/app.ts as the main entry point.`
			`3. Create src/routes/userRoutes.ts for user routes.`
			`4. Create src/controllers/userController.ts for user logic.`
			`5. Implement GET /users, POST /users, and GET /users/:id using an in-memory array.`
			`6. Add a 'start' script to package.json.`
			7. Finally, run a quick grep to verify the routes are in src/app.ts.`,
			`files: {`
			`'package.json': JSON.stringify(`
			`{`
			`name: 'users-api',`
			`version: '1.0.0',`
			`private: true,`
			`},`
			`null,`
			`2,`
			`),`
			`'.gemini/settings.json': JSON.stringify({`
			`experimental: {`
			`topicUpdateNarration: true,`
			`},`
			`}),`
			`},`
			`assert: async (rig, result) => {`
			`const toolLogs = rig.readToolLogs();`
			`const topicCalls = toolLogs.filter(`
			`(l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,`
			`);`

			`// 1. Assert that update_topic is called at least 3 times (start, middle, end)`
			`expect(`
			`topicCalls.length,`
			`Expected at least 3 update_topic calls, but found ${topicCalls.length}`,
			`).toBeGreaterThanOrEqual(3);`

			`// 2. Assert update_topic is called at the very beginning (first tool call)`
			`expect(`
			`toolLogs[0].toolRequest.name,`
			`'First tool call should be update_topic',`
			`).toBe(UPDATE_TOPIC_TOOL_NAME);`

			`// 3. Assert update_topic is called near the end`
			`const lastTopicCallIndex = toolLogs`
			`.map((l) => l.toolRequest.name)`
			`.lastIndexOf(UPDATE_TOPIC_TOOL_NAME);`
			`expect(`
			`lastTopicCallIndex,`
			`'Expected update_topic to be used near the end of the task',`
			`).toBeGreaterThanOrEqual(toolLogs.length * 0.7);`

			`// 4. Assert there is at least one update_topic call in the middle (between start and end phases)`
			`const middleTopicCalls = topicCalls.slice(1, -1);`

			`expect(`
			`middleTopicCalls.length,`
			`'Expected at least one update_topic call in the middle of the task',`
			`).toBeGreaterThanOrEqual(1);`

			`// 5. Turn Ratio Assertion: update_topic should be <= 1/2 of total turns.`
			`// We only enforce this for tasks that take more than 5 turns, as shorter tasks`
			`// naturally have a higher ratio when following the "start, middle, end" rule.`
			`const uniquePromptIds = new Set(`
			`toolLogs`
			`.map((l) => l.toolRequest.prompt_id)`
			`.filter((id) => id !== undefined),`
			`);`
			`const totalTurns = uniquePromptIds.size;`

			`if (totalTurns > 5) {`
			`const topicTurns = new Set(`
			`topicCalls`
			`.map((l) => l.toolRequest.prompt_id)`
			`.filter((id) => id !== undefined),`
			`);`
			`const topicTurnCount = topicTurns.size;`

			`const ratio = topicTurnCount / totalTurns;`

			`expect(`
			`ratio,`
			`update_topic was used in ${topicTurnCount} out of ${totalTurns} turns (${(ratio * 100).toFixed(1)}%). Expected <= 50%.`,
			`).toBeLessThanOrEqual(0.5);`

			`// Ideal ratio is closer to 1/5 (20%). We log high usage as a warning.`
			`if (ratio > 0.25) {`
			`console.warn(`
			`[Efficiency Warning] update_topic usage is high: ${(ratio * 100).toFixed(1)}% (Goal: ~20%)`,
			`);`
			`}`
			`}`
			`},`
			`});`
feat(core): discourage update topic tool for simple tasks (#24640 ) 2026-04-03 16:52:24 -07:00
			`evalTest('USUALLY_PASSES', {`
			`name: 'update_topic should NOT be used for informational coding tasks (Obvious)',`
			`approvalMode: 'default',`
			`prompt:`
			`'Explain the difference between Map and Object in JavaScript and provide a performance-focused code snippet for each.',`
			`files: {`
			`'.gemini/settings.json': JSON.stringify({`
			`experimental: {`
			`topicUpdateNarration: true,`
			`},`
			`}),`
			`},`
			`assert: async (rig) => {`
			`const toolLogs = rig.readToolLogs();`
			`const topicCalls = toolLogs.filter(`
			`(l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,`
			`);`

			`expect(`
			`topicCalls.length,`
			`Expected 0 update_topic calls for an informational task, but found ${topicCalls.length}`,
			`).toBe(0);`
			`},`
			`});`

			`evalTest('USUALLY_PASSES', {`
			`name: 'update_topic should NOT be used for surgical symbol searches (Grey Area)',`
			`approvalMode: 'default',`
			`prompt:`
			`"Find the file where the 'UPDATE_TOPIC_TOOL_NAME' constant is defined.",`
			`files: {`
			`'packages/core/src/tools/tool-names.ts':`
			`"export const UPDATE_TOPIC_TOOL_NAME = 'update_topic';",`
			`'.gemini/settings.json': JSON.stringify({`
			`experimental: {`
			`topicUpdateNarration: true,`
			`},`
			`}),`
			`},`
			`assert: async (rig) => {`
			`const toolLogs = rig.readToolLogs();`
			`const topicCalls = toolLogs.filter(`
			`(l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,`
			`);`

			`expect(`
			`topicCalls.length,`
			`Expected 0 update_topic calls for a surgical symbol search, but found ${topicCalls.length}`,
			`).toBe(0);`
			`},`
			`});`

			`evalTest('USUALLY_PASSES', {`
			`name: 'update_topic should be used for medium complexity multi-step tasks',`
			`prompt:`
			'Refactor the `users-api` project. Move the routing logic from src/app.ts into a new file src/routes.ts, and update app.ts to use the new routes file.',
			`files: {`
			`'package.json': JSON.stringify(`
			`{`
			`name: 'users-api',`
			`version: '1.0.0',`
			`},`
			`null,`
			`2,`
			`),`
			'src/app.ts': `
			`import express from 'express';`
			`const app = express();`

			`app.get('/users', (req, res) => {`
			`res.json([{id: 1, name: 'Alice'}]);`
			`});`

			`app.post('/users', (req, res) => {`
			`res.status(201).send();`
			`});`

			`export default app;`
			`,
			`'.gemini/settings.json': JSON.stringify({`
			`experimental: {`
			`topicUpdateNarration: true,`
			`},`
			`}),`
			`},`
			`assert: async (rig) => {`
			`const toolLogs = rig.readToolLogs();`
			`const topicCalls = toolLogs.filter(`
			`(l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,`
			`);`

			`// This is a multi-step task (read, create new file, edit old file).`
			`// It should clear the bar and use update_topic at least at the start and end.`
			`expect(topicCalls.length).toBeGreaterThanOrEqual(2);`

			`// Verify it actually did the refactoring to ensure it didn't just fail immediately`
			`expect(fs.existsSync(path.join(rig.testDir, 'src/routes.ts'))).toBe(true);`
			`},`
			`});`
Fix issue where topic headers can be posted back to back (#24759 ) 2026-04-06 18:36:22 +00:00
			`/**`
			`* Regression test for a bug where update_topic was called multiple times in a`
			`* row. We have seen cases of this occurring in earlier versions of the update_topic`
			`* system instruction, prior to https://github.com/google-gemini/gemini-cli/pull/24640.`
			`* This test demonstrated that there are cases where it can still occur and validates`
			`* the prompt change that improves the behavior.`
			`*/`
			`evalTest('USUALLY_PASSES', {`
			`name: 'update_topic should not be called twice in a row',`
			prompt: `
			`We need to build a C compiler.`

			`Before you write any code, you must formally declare your strategy.`
			`First, declare that you will build a Lexer.`
			`Then, immediately realize that is wrong and declare that you will actually build a Parser instead.`

			`Finally, create 'parser.c'.`
			`,
			`files: {`
			`'package.json': JSON.stringify({ name: 'test-project' }),`
			`'.gemini/settings.json': JSON.stringify({`
			`experimental: {`
			`topicUpdateNarration: true,`
			`},`
			`}),`
			`},`
			`assert: async (rig) => {`
			`const toolLogs = rig.readToolLogs();`

			`// Check for back-to-back update_topic calls`
			`for (let i = 1; i < toolLogs.length; i++) {`
			`if (`
			`toolLogs[i - 1].toolRequest.name === UPDATE_TOPIC_TOOL_NAME &&`
			`toolLogs[i].toolRequest.name === UPDATE_TOPIC_TOOL_NAME`
			`) {`
			`throw new Error(`
			`Detected back-to-back ${UPDATE_TOPIC_TOOL_NAME} calls at index ${i - 1} and ${i}`,
			`);`
			`}`
			`}`
			`},`
			`});`
fix(evals): add update_topic behavioral eval (#24223 ) 2026-03-30 22:02:53 +00:00			`});`