test(evals): add comprehensive subagent delegation evaluations (#24132)

This commit is contained in:
Abhi
2026-03-29 19:13:50 -04:00
committed by GitHub
parent da8c841ef4
commit d9d2ce36f2
3 changed files with 202 additions and 19 deletions

View File

@@ -13,8 +13,21 @@ import { evalTest, TEST_AGENTS } from './test-helper.js';
// Source text written to `index.ts` in the eval workspaces below.
const INDEX_TS = 'export const add = (a: number, b: number) => a + b;\n';

// Serialized contents of a bare-bones package.json shared by the eval cases.
// Having it in the workspace gives the agent a realistic project anchor, so it
// does not make incorrect assumptions about the environment and can navigate
// or act as if it were in a standard Node.js project.
const MOCK_PACKAGE_JSON = JSON.stringify(
  { name: 'subagent-eval-project', version: '1.0.0', type: 'module' },
  undefined, // no replacer: serialize every property as-is
  2, // pretty-print with two-space indentation
);
function readProjectFile(
rig: { testDir?: string },
rig: { testDir: string | null },
relativePath: string,
): string {
return fs.readFileSync(path.join(rig.testDir!, relativePath), 'utf8');
@@ -117,15 +130,7 @@ describe('subagent eval test cases', () => {
files: {
...TEST_AGENTS.TESTING_AGENT.asFile(),
'index.ts': INDEX_TS,
'package.json': JSON.stringify(
{
name: 'subagent-eval-project',
version: '1.0.0',
type: 'module',
},
null,
2,
),
'package.json': MOCK_PACKAGE_JSON,
},
assert: async (rig, _result) => {
const toolLogs = rig.readToolLogs() as Array<{
@@ -164,15 +169,7 @@ describe('subagent eval test cases', () => {
...TEST_AGENTS.TESTING_AGENT.asFile(),
'index.ts': INDEX_TS,
'README.md': 'TODO: update the README.\n',
'package.json': JSON.stringify(
{
name: 'subagent-eval-project',
version: '1.0.0',
type: 'module',
},
null,
2,
),
'package.json': MOCK_PACKAGE_JSON,
},
assert: async (rig, _result) => {
const toolLogs = rig.readToolLogs() as Array<{
@@ -190,4 +187,105 @@ describe('subagent eval test cases', () => {
);
},
});
/**
 * Delegation accuracy with a large pool: ten specialist subagents are
 * registered in the workspace and the prompt asks for a SQL migration.
 * The run passes only if the database agent was called successfully and
 * neither the generalist nor any of the other nine specialists appears in
 * the tool logs.
 */
evalTest('USUALLY_PASSES', {
  name: 'should select the correct subagent from a pool of 10 different agents',
  prompt: 'Please add a new SQL table migration for a user profile.',
  files: {
    ...TEST_AGENTS.DOCS_AGENT.asFile(),
    ...TEST_AGENTS.TESTING_AGENT.asFile(),
    ...TEST_AGENTS.DATABASE_AGENT.asFile(),
    ...TEST_AGENTS.CSS_AGENT.asFile(),
    ...TEST_AGENTS.I18N_AGENT.asFile(),
    ...TEST_AGENTS.SECURITY_AGENT.asFile(),
    ...TEST_AGENTS.DEVOPS_AGENT.asFile(),
    ...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
    ...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
    ...TEST_AGENTS.MOBILE_AGENT.asFile(),
    'package.json': MOCK_PACKAGE_JSON,
  },
  assert: async (rig, _result) => {
    // Read the tool logs first, then check the expected delegation.
    const toolLogs = rig.readToolLogs() as Array<{
      toolRequest: { name: string };
    }>;
    await rig.expectToolCallSuccess(['database-agent']);

    // True when any logged tool request targeted the given agent.
    const wasInvoked = (agentName: string): boolean =>
      toolLogs.some(({ toolRequest }) => toolRequest.name === agentName);

    // The generalist and every irrelevant specialist must stay unused.
    const agentsExpectedIdle = [
      'generalist',
      TEST_AGENTS.DOCS_AGENT.name,
      TEST_AGENTS.TESTING_AGENT.name,
      TEST_AGENTS.CSS_AGENT.name,
      TEST_AGENTS.I18N_AGENT.name,
      TEST_AGENTS.SECURITY_AGENT.name,
      TEST_AGENTS.DEVOPS_AGENT.name,
      TEST_AGENTS.ANALYTICS_AGENT.name,
      TEST_AGENTS.ACCESSIBILITY_AGENT.name,
      TEST_AGENTS.MOBILE_AGENT.name,
    ];
    agentsExpectedIdle.forEach((agentName) => {
      expect(wasInvoked(agentName)).toBe(false);
    });
  },
});
/**
 * Same selection scenario as the ten-agent pool, but with many irrelevant
 * MCP tools present as well: the setup hook registers the
 * 'workspace-server' test MCP server on the rig.
 *
 * This stress-tests subagent delegation with ~80 tools available. The run
 * passes only if the database agent was called successfully and neither the
 * generalist nor any other specialist appears in the tool logs.
 */
evalTest('USUALLY_PASSES', {
  name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
  prompt: 'Please add a new SQL table migration for a user profile.',
  setup: async (rig) => {
    rig.addTestMcpServer('workspace-server', 'google-workspace');
  },
  files: {
    ...TEST_AGENTS.DOCS_AGENT.asFile(),
    ...TEST_AGENTS.TESTING_AGENT.asFile(),
    ...TEST_AGENTS.DATABASE_AGENT.asFile(),
    ...TEST_AGENTS.CSS_AGENT.asFile(),
    ...TEST_AGENTS.I18N_AGENT.asFile(),
    ...TEST_AGENTS.SECURITY_AGENT.asFile(),
    ...TEST_AGENTS.DEVOPS_AGENT.asFile(),
    ...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
    ...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
    ...TEST_AGENTS.MOBILE_AGENT.asFile(),
    'package.json': MOCK_PACKAGE_JSON,
  },
  assert: async (rig, _result) => {
    // Read the tool logs first, then check the expected delegation.
    const toolLogs = rig.readToolLogs() as Array<{
      toolRequest: { name: string };
    }>;
    await rig.expectToolCallSuccess(['database-agent']);

    // True when any logged tool request targeted the given agent.
    const wasInvoked = (agentName: string): boolean =>
      toolLogs.some(({ toolRequest }) => toolRequest.name === agentName);

    // The generalist and every irrelevant specialist must stay unused.
    const agentsExpectedIdle = [
      'generalist',
      TEST_AGENTS.DOCS_AGENT.name,
      TEST_AGENTS.TESTING_AGENT.name,
      TEST_AGENTS.CSS_AGENT.name,
      TEST_AGENTS.I18N_AGENT.name,
      TEST_AGENTS.SECURITY_AGENT.name,
      TEST_AGENTS.DEVOPS_AGENT.name,
      TEST_AGENTS.ANALYTICS_AGENT.name,
      TEST_AGENTS.ACCESSIBILITY_AGENT.name,
      TEST_AGENTS.MOBILE_AGENT.name,
    ];
    agentsExpectedIdle.forEach((agentName) => {
      expect(wasInvoked(agentName)).toBe(false);
    });
  },
});
});

View File

@@ -61,6 +61,10 @@ export async function internalEvalTest(evalCase: EvalCase) {
try {
rig.setup(evalCase.name, evalCase.params);
if (evalCase.setup) {
await evalCase.setup(rig);
}
if (evalCase.files) {
await setupTestFiles(rig, evalCase.files);
}
@@ -371,6 +375,7 @@ export interface EvalCase {
prompt: string;
timeout?: number;
files?: Record<string, string>;
setup?: (rig: TestRig) => Promise<void> | void;
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
messages?: Record<string, unknown>[];
/** Session ID for the resumed session. Auto-generated if not provided. */