/** * @license * Copyright 2026 Google LLC * SPDX-License-Identifier: Apache-2.0 */ import fs from 'node:fs'; import path from 'node:path'; import { describe, expect } from 'vitest'; import { evalTest, TEST_AGENTS } from './test-helper.js'; const INDEX_TS = 'export const add = (a: number, b: number) => a + b;\n'; // A minimal package.json is used to provide a realistic workspace anchor. // This prevents the agent from making incorrect assumptions about the environment // and helps it properly navigate or act as if it is in a standard Node.js project. const MOCK_PACKAGE_JSON = JSON.stringify( { name: 'subagent-eval-project', version: '1.0.0', type: 'module', }, null, 2, ); function readProjectFile( rig: { testDir: string | null }, relativePath: string, ): string { return fs.readFileSync(path.join(rig.testDir!, relativePath), 'utf8'); } describe('subagent eval test cases', () => { /** * Checks whether the outer agent reliably utilizes an expert subagent to * accomplish a task when one is available. * * Note that the test is intentionally crafted to avoid the word "document" * or "docs". We want to see the outer agent make the connection even when * the prompt indirectly implies need of expertise. * * This tests the system prompt's subagent specific clauses. */ evalTest('USUALLY_PASSES', { name: 'should delegate to user provided agent with relevant expertise', params: { settings: { experimental: { enableAgents: true, }, }, }, prompt: 'Please update README.md with a description of this library.', files: { ...TEST_AGENTS.DOCS_AGENT.asFile(), 'index.ts': INDEX_TS, 'README.md': 'TODO: update the README.\n', }, assert: async (rig, _result) => { await rig.expectToolCallSuccess([TEST_AGENTS.DOCS_AGENT.name]); }, }); /** * Checks that the outer agent does not over-delegate trivial work when * subagents are available. This helps catch orchestration overuse. */ evalTest('USUALLY_PASSES', { name: 'should avoid delegating trivial direct edit work', params: { settings: { experimental: { enableAgents: true, agents: { overrides: { generalist: { enabled: true }, }, }, }, }, }, prompt: 'Rename the exported function in index.ts from add to sum and update the file directly.', files: { ...TEST_AGENTS.DOCS_AGENT.asFile(), 'index.ts': INDEX_TS, }, assert: async (rig, _result) => { const updatedIndex = readProjectFile(rig, 'index.ts'); const toolLogs = rig.readToolLogs() as Array<{ toolRequest: { name: string }; }>; expect(updatedIndex).toContain('export const sum ='); expect( toolLogs.some( (l) => l.toolRequest.name === TEST_AGENTS.DOCS_AGENT.name, ), ).toBe(false); expect(toolLogs.some((l) => l.toolRequest.name === 'generalist')).toBe( false, ); }, }); /** * Checks that the outer agent prefers a more relevant specialist over a * broad generalist when both are available. * * This is meant to codify the "overusing Generalist" failure mode. */ evalTest('USUALLY_PASSES', { name: 'should prefer relevant specialist over generalist', params: { settings: { experimental: { enableAgents: true, agents: { overrides: { generalist: { enabled: true }, }, }, }, }, }, prompt: 'Please add a small test file that verifies add(1, 2) returns 3.', files: { ...TEST_AGENTS.TESTING_AGENT.asFile(), 'index.ts': INDEX_TS, 'package.json': MOCK_PACKAGE_JSON, }, assert: async (rig, _result) => { const toolLogs = rig.readToolLogs() as Array<{ toolRequest: { name: string }; }>; await rig.expectToolCallSuccess([TEST_AGENTS.TESTING_AGENT.name]); expect(toolLogs.some((l) => l.toolRequest.name === 'generalist')).toBe( false, ); }, }); /** * Checks cardinality and decomposition for a multi-surface task. The task * naturally spans docs and tests, so multiple specialists should be used. */ evalTest('USUALLY_PASSES', { name: 'should use multiple relevant specialists for multi-surface task', params: { settings: { experimental: { enableAgents: true, agents: { overrides: { generalist: { enabled: true }, }, }, }, }, }, prompt: 'Add a short README description for this library and also add a test file that verifies add(1, 2) returns 3.', files: { ...TEST_AGENTS.DOCS_AGENT.asFile(), ...TEST_AGENTS.TESTING_AGENT.asFile(), 'index.ts': INDEX_TS, 'README.md': 'TODO: update the README.\n', 'package.json': MOCK_PACKAGE_JSON, }, assert: async (rig, _result) => { const toolLogs = rig.readToolLogs() as Array<{ toolRequest: { name: string }; }>; const readme = readProjectFile(rig, 'README.md'); await rig.expectToolCallSuccess([ TEST_AGENTS.DOCS_AGENT.name, TEST_AGENTS.TESTING_AGENT.name, ]); expect(readme).not.toContain('TODO: update the README.'); expect(toolLogs.some((l) => l.toolRequest.name === 'generalist')).toBe( false, ); }, }); /** * Checks that the main agent can correctly select the appropriate subagent * from a large pool of available subagents (10 total). */ evalTest('USUALLY_PASSES', { name: 'should select the correct subagent from a pool of 10 different agents', prompt: 'Please add a new SQL table migration for a user profile.', files: { ...TEST_AGENTS.DOCS_AGENT.asFile(), ...TEST_AGENTS.TESTING_AGENT.asFile(), ...TEST_AGENTS.DATABASE_AGENT.asFile(), ...TEST_AGENTS.CSS_AGENT.asFile(), ...TEST_AGENTS.I18N_AGENT.asFile(), ...TEST_AGENTS.SECURITY_AGENT.asFile(), ...TEST_AGENTS.DEVOPS_AGENT.asFile(), ...TEST_AGENTS.ANALYTICS_AGENT.asFile(), ...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(), ...TEST_AGENTS.MOBILE_AGENT.asFile(), 'package.json': MOCK_PACKAGE_JSON, }, assert: async (rig, _result) => { const toolLogs = rig.readToolLogs() as Array<{ toolRequest: { name: string }; }>; await rig.expectToolCallSuccess(['database-agent']); // Ensure the generalist and other irrelevant specialists were not invoked const uncalledAgents = [ 'generalist', TEST_AGENTS.DOCS_AGENT.name, TEST_AGENTS.TESTING_AGENT.name, TEST_AGENTS.CSS_AGENT.name, TEST_AGENTS.I18N_AGENT.name, TEST_AGENTS.SECURITY_AGENT.name, TEST_AGENTS.DEVOPS_AGENT.name, TEST_AGENTS.ANALYTICS_AGENT.name, TEST_AGENTS.ACCESSIBILITY_AGENT.name, TEST_AGENTS.MOBILE_AGENT.name, ]; for (const agentName of uncalledAgents) { expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe( false, ); } }, }); /** * Checks that the main agent can correctly select the appropriate subagent * from a large pool of available subagents, even when many irrelevant MCP tools are present. * * This test includes stress tests the subagent delegation with ~80 tools. */ evalTest('USUALLY_PASSES', { name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present', prompt: 'Please add a new SQL table migration for a user profile.', setup: async (rig) => { rig.addTestMcpServer('workspace-server', 'google-workspace'); }, files: { ...TEST_AGENTS.DOCS_AGENT.asFile(), ...TEST_AGENTS.TESTING_AGENT.asFile(), ...TEST_AGENTS.DATABASE_AGENT.asFile(), ...TEST_AGENTS.CSS_AGENT.asFile(), ...TEST_AGENTS.I18N_AGENT.asFile(), ...TEST_AGENTS.SECURITY_AGENT.asFile(), ...TEST_AGENTS.DEVOPS_AGENT.asFile(), ...TEST_AGENTS.ANALYTICS_AGENT.asFile(), ...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(), ...TEST_AGENTS.MOBILE_AGENT.asFile(), 'package.json': MOCK_PACKAGE_JSON, }, assert: async (rig, _result) => { const toolLogs = rig.readToolLogs() as Array<{ toolRequest: { name: string }; }>; await rig.expectToolCallSuccess(['database-agent']); // Ensure the generalist and other irrelevant specialists were not invoked const uncalledAgents = [ 'generalist', TEST_AGENTS.DOCS_AGENT.name, TEST_AGENTS.TESTING_AGENT.name, TEST_AGENTS.CSS_AGENT.name, TEST_AGENTS.I18N_AGENT.name, TEST_AGENTS.SECURITY_AGENT.name, TEST_AGENTS.DEVOPS_AGENT.name, TEST_AGENTS.ANALYTICS_AGENT.name, TEST_AGENTS.ACCESSIBILITY_AGENT.name, TEST_AGENTS.MOBILE_AGENT.name, ]; for (const agentName of uncalledAgents) { expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe( false, ); } }, }); });