fix(evals): add update_topic behavioral eval (#24223)

2026-06-13 04:48:09 -07:00 · 2026-03-30 22:02:53 +00:00
parent 4b20d93e1d
commit 117a2d3844
1 changed files with 116 additions and 0 deletions
@@ -0,0 +1,116 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('update_topic_behavior', () => {
+  // Constants for tool names and params for robustness
+  const UPDATE_TOPIC_TOOL_NAME = 'update_topic';
+
+  /**
+   * Verifies the desired behavior of the update_topic tool. update_topic is used by the
+   * agent to share periodic, concise updates about what the agent is working on, independent
+   * of the regular model output and/or thoughts. This tool is expected to be called at least
+   * at the start and end of the session, and typically at least once in the middle, but no
+   * more than 1/4 turns.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'update_topic should be used at start, end and middle for complex tasks',
+    prompt: `Create a simple users REST API using Express. 
+1. Initialize a new npm project and install express.
+2. Create src/app.ts as the main entry point.
+3. Create src/routes/userRoutes.ts for user routes.
+4. Create src/controllers/userController.ts for user logic.
+5. Implement GET /users, POST /users, and GET /users/:id using an in-memory array.
+6. Add a 'start' script to package.json.
+7. Finally, run a quick grep to verify the routes are in src/app.ts.`,
+    files: {
+      'package.json': JSON.stringify(
+        {
+          name: 'users-api',
+          version: '1.0.0',
+          private: true,
+        },
+        null,
+        2,
+      ),
+      '.gemini/settings.json': JSON.stringify({
+        experimental: {
+          topicUpdateNarration: true,
+        },
+      }),
+    },
+    assert: async (rig, result) => {
+      const toolLogs = rig.readToolLogs();
+      const topicCalls = toolLogs.filter(
+        (l) => l.toolRequest.name === UPDATE_TOPIC_TOOL_NAME,
+      );
+
+      // 1. Assert that update_topic is called at least 3 times (start, middle, end)
+      expect(
+        topicCalls.length,
+        `Expected at least 3 update_topic calls, but found ${topicCalls.length}`,
+      ).toBeGreaterThanOrEqual(3);
+
+      // 2. Assert update_topic is called at the very beginning (first tool call)
+      expect(
+        toolLogs[0].toolRequest.name,
+        'First tool call should be update_topic',
+      ).toBe(UPDATE_TOPIC_TOOL_NAME);
+
+      // 3. Assert update_topic is called near the end
+      const lastTopicCallIndex = toolLogs
+        .map((l) => l.toolRequest.name)
+        .lastIndexOf(UPDATE_TOPIC_TOOL_NAME);
+      expect(
+        lastTopicCallIndex,
+        'Expected update_topic to be used near the end of the task',
+      ).toBeGreaterThanOrEqual(toolLogs.length * 0.7);
+
+      // 4. Assert there is at least one update_topic call in the middle (between start and end phases)
+      const middleTopicCalls = topicCalls.slice(1, -1);
+
+      expect(
+        middleTopicCalls.length,
+        'Expected at least one update_topic call in the middle of the task',
+      ).toBeGreaterThanOrEqual(1);
+
+      // 5. Turn Ratio Assertion: update_topic should be <= 1/2 of total turns.
+      // We only enforce this for tasks that take more than 5 turns, as shorter tasks
+      // naturally have a higher ratio when following the "start, middle, end" rule.
+      const uniquePromptIds = new Set(
+        toolLogs
+          .map((l) => l.toolRequest.prompt_id)
+          .filter((id) => id !== undefined),
+      );
+      const totalTurns = uniquePromptIds.size;
+
+      if (totalTurns > 5) {
+        const topicTurns = new Set(
+          topicCalls
+            .map((l) => l.toolRequest.prompt_id)
+            .filter((id) => id !== undefined),
+        );
+        const topicTurnCount = topicTurns.size;
+
+        const ratio = topicTurnCount / totalTurns;
+
+        expect(
+          ratio,
+          `update_topic was used in ${topicTurnCount} out of ${totalTurns} turns (${(ratio * 100).toFixed(1)}%). Expected <= 50%.`,
+        ).toBeLessThanOrEqual(0.5);
+
+        // Ideal ratio is closer to 1/5 (20%). We log high usage as a warning.
+        if (ratio > 0.25) {
+          console.warn(
+            `[Efficiency Warning] update_topic usage is high: ${(ratio * 100).toFixed(1)}% (Goal: ~20%)`,
+          );
+        }
+      }
+    },
+  });
+});