From d9d2ce36f2a7f4fa84cff1cfc20425675fbd6f91 Mon Sep 17 00:00:00 2001 From: Abhi <43648792+abhipatel12@users.noreply.github.com> Date: Sun, 29 Mar 2026 19:13:50 -0400 Subject: [PATCH] test(evals): add comprehensive subagent delegation evaluations (#24132) --- evals/subagents.eval.ts | 136 ++++++++++++++++++--- evals/test-helper.ts | 5 + packages/test-utils/src/fixtures/agents.ts | 80 ++++++++++++ 3 files changed, 202 insertions(+), 19 deletions(-) diff --git a/evals/subagents.eval.ts b/evals/subagents.eval.ts index 140925964b..7053290fba 100644 --- a/evals/subagents.eval.ts +++ b/evals/subagents.eval.ts @@ -13,8 +13,21 @@ import { evalTest, TEST_AGENTS } from './test-helper.js'; const INDEX_TS = 'export const add = (a: number, b: number) => a + b;\n'; +// A minimal package.json is used to provide a realistic workspace anchor. +// This prevents the agent from making incorrect assumptions about the environment +// and helps it properly navigate or act as if it is in a standard Node.js project. 
+const MOCK_PACKAGE_JSON = JSON.stringify( + { + name: 'subagent-eval-project', + version: '1.0.0', + type: 'module', + }, + null, + 2, +); + function readProjectFile( - rig: { testDir?: string }, + rig: { testDir: string | null }, relativePath: string, ): string { return fs.readFileSync(path.join(rig.testDir!, relativePath), 'utf8'); @@ -117,15 +130,7 @@ describe('subagent eval test cases', () => { files: { ...TEST_AGENTS.TESTING_AGENT.asFile(), 'index.ts': INDEX_TS, - 'package.json': JSON.stringify( - { - name: 'subagent-eval-project', - version: '1.0.0', - type: 'module', - }, - null, - 2, - ), + 'package.json': MOCK_PACKAGE_JSON, }, assert: async (rig, _result) => { const toolLogs = rig.readToolLogs() as Array<{ @@ -164,15 +169,7 @@ describe('subagent eval test cases', () => { ...TEST_AGENTS.TESTING_AGENT.asFile(), 'index.ts': INDEX_TS, 'README.md': 'TODO: update the README.\n', - 'package.json': JSON.stringify( - { - name: 'subagent-eval-project', - version: '1.0.0', - type: 'module', - }, - null, - 2, - ), + 'package.json': MOCK_PACKAGE_JSON, }, assert: async (rig, _result) => { const toolLogs = rig.readToolLogs() as Array<{ @@ -190,4 +187,105 @@ describe('subagent eval test cases', () => { ); }, }); + + /** + * Checks that the main agent can correctly select the appropriate subagent + * from a large pool of available subagents (10 total). 
+ */ + evalTest('USUALLY_PASSES', { + name: 'should select the correct subagent from a pool of 10 different agents', + prompt: 'Please add a new SQL table migration for a user profile.', + files: { + ...TEST_AGENTS.DOCS_AGENT.asFile(), + ...TEST_AGENTS.TESTING_AGENT.asFile(), + ...TEST_AGENTS.DATABASE_AGENT.asFile(), + ...TEST_AGENTS.CSS_AGENT.asFile(), + ...TEST_AGENTS.I18N_AGENT.asFile(), + ...TEST_AGENTS.SECURITY_AGENT.asFile(), + ...TEST_AGENTS.DEVOPS_AGENT.asFile(), + ...TEST_AGENTS.ANALYTICS_AGENT.asFile(), + ...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(), + ...TEST_AGENTS.MOBILE_AGENT.asFile(), + 'package.json': MOCK_PACKAGE_JSON, + }, + assert: async (rig, _result) => { + const toolLogs = rig.readToolLogs() as Array<{ + toolRequest: { name: string }; + }>; + await rig.expectToolCallSuccess(['database-agent']); + + // Ensure the generalist and other irrelevant specialists were not invoked + const uncalledAgents = [ + 'generalist', + TEST_AGENTS.DOCS_AGENT.name, + TEST_AGENTS.TESTING_AGENT.name, + TEST_AGENTS.CSS_AGENT.name, + TEST_AGENTS.I18N_AGENT.name, + TEST_AGENTS.SECURITY_AGENT.name, + TEST_AGENTS.DEVOPS_AGENT.name, + TEST_AGENTS.ANALYTICS_AGENT.name, + TEST_AGENTS.ACCESSIBILITY_AGENT.name, + TEST_AGENTS.MOBILE_AGENT.name, + ]; + + for (const agentName of uncalledAgents) { + expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe( + false, + ); + } + }, + }); + + /** + * Checks that the main agent can correctly select the appropriate subagent + * from a large pool of available subagents, even when many irrelevant MCP tools are present. + * + * This test stress-tests subagent delegation with ~80 tools present. 
+ */ + evalTest('USUALLY_PASSES', { + name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present', + prompt: 'Please add a new SQL table migration for a user profile.', + setup: async (rig) => { + rig.addTestMcpServer('workspace-server', 'google-workspace'); + }, + files: { + ...TEST_AGENTS.DOCS_AGENT.asFile(), + ...TEST_AGENTS.TESTING_AGENT.asFile(), + ...TEST_AGENTS.DATABASE_AGENT.asFile(), + ...TEST_AGENTS.CSS_AGENT.asFile(), + ...TEST_AGENTS.I18N_AGENT.asFile(), + ...TEST_AGENTS.SECURITY_AGENT.asFile(), + ...TEST_AGENTS.DEVOPS_AGENT.asFile(), + ...TEST_AGENTS.ANALYTICS_AGENT.asFile(), + ...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(), + ...TEST_AGENTS.MOBILE_AGENT.asFile(), + 'package.json': MOCK_PACKAGE_JSON, + }, + assert: async (rig, _result) => { + const toolLogs = rig.readToolLogs() as Array<{ + toolRequest: { name: string }; + }>; + await rig.expectToolCallSuccess(['database-agent']); + + // Ensure the generalist and other irrelevant specialists were not invoked + const uncalledAgents = [ + 'generalist', + TEST_AGENTS.DOCS_AGENT.name, + TEST_AGENTS.TESTING_AGENT.name, + TEST_AGENTS.CSS_AGENT.name, + TEST_AGENTS.I18N_AGENT.name, + TEST_AGENTS.SECURITY_AGENT.name, + TEST_AGENTS.DEVOPS_AGENT.name, + TEST_AGENTS.ANALYTICS_AGENT.name, + TEST_AGENTS.ACCESSIBILITY_AGENT.name, + TEST_AGENTS.MOBILE_AGENT.name, + ]; + + for (const agentName of uncalledAgents) { + expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe( + false, + ); + } + }, + }); }); diff --git a/evals/test-helper.ts b/evals/test-helper.ts index f79a78779a..2bf9188eee 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -61,6 +61,10 @@ export async function internalEvalTest(evalCase: EvalCase) { try { rig.setup(evalCase.name, evalCase.params); + if (evalCase.setup) { + await evalCase.setup(rig); + } + if (evalCase.files) { await setupTestFiles(rig, evalCase.files); } @@ -371,6 +375,7 @@ export interface EvalCase { prompt: string; 
timeout?: number; files?: Record<string, string>; + setup?: (rig: TestRig) => Promise<void> | void; /** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */ messages?: Record<string, unknown>[]; /** Session ID for the resumed session. Auto-generated if not provided. */ diff --git a/packages/test-utils/src/fixtures/agents.ts b/packages/test-utils/src/fixtures/agents.ts index 9469457227..b105be404e 100644 --- a/packages/test-utils/src/fixtures/agents.ts +++ b/packages/test-utils/src/fixtures/agents.ts @@ -69,4 +69,84 @@ export const TEST_AGENTS = { tools: ['read_file', 'write_file'], body: 'You are the test agent. Add or update tests.', }), + /** + * An agent with expertise in database schemas, SQL, and creating database migrations. + */ + DATABASE_AGENT: createAgent({ + name: 'database-agent', + description: + 'An expert in database schemas, SQL, and creating database migrations.', + tools: ['read_file', 'write_file'], + body: 'You are the database agent. Create and update SQL migrations.', + }), + + /** + * An agent with expertise in CSS, styling, and UI design. + */ + CSS_AGENT: createAgent({ + name: 'css-agent', + description: 'An expert in CSS, styling, and UI design.', + tools: ['read_file', 'write_file'], + body: 'You are the CSS agent.', + }), + + /** + * An agent with expertise in internationalization and translations. + */ + I18N_AGENT: createAgent({ + name: 'i18n-agent', + description: 'An expert in internationalization and translations.', + tools: ['read_file', 'write_file'], + body: 'You are the i18n agent.', + }), + + /** + * An agent with expertise in security audits and vulnerability patches. + */ + SECURITY_AGENT: createAgent({ + name: 'security-agent', + description: 'An expert in security audits and vulnerability patches.', + tools: ['read_file', 'write_file'], + body: 'You are the security agent.', + }), + + /** + * An agent with expertise in CI/CD, Docker, and deployment scripts. 
+ */ + DEVOPS_AGENT: createAgent({ + name: 'devops-agent', + description: 'An expert in CI/CD, Docker, and deployment scripts.', + tools: ['read_file', 'write_file'], + body: 'You are the devops agent.', + }), + + /** + * An agent with expertise in tracking, analytics, and metrics. + */ + ANALYTICS_AGENT: createAgent({ + name: 'analytics-agent', + description: 'An expert in tracking, analytics, and metrics.', + tools: ['read_file', 'write_file'], + body: 'You are the analytics agent.', + }), + + /** + * An agent with expertise in web accessibility and ARIA roles. + */ + ACCESSIBILITY_AGENT: createAgent({ + name: 'accessibility-agent', + description: 'An expert in web accessibility and ARIA roles.', + tools: ['read_file', 'write_file'], + body: 'You are the accessibility agent.', + }), + + /** + * An agent with expertise in React Native and mobile app development. + */ + MOBILE_AGENT: createAgent({ + name: 'mobile-agent', + description: 'An expert in React Native and mobile app development.', + tools: ['read_file', 'write_file'], + body: 'You are the mobile agent.', + }), } as const;