mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-25 12:34:38 -07:00
test(evals): add comprehensive subagent delegation evaluations (#24132)
This commit is contained in:
+117
-19
@@ -13,8 +13,21 @@ import { evalTest, TEST_AGENTS } from './test-helper.js';
|
|||||||
|
|
||||||
const INDEX_TS = 'export const add = (a: number, b: number) => a + b;\n';
|
const INDEX_TS = 'export const add = (a: number, b: number) => a + b;\n';
|
||||||
|
|
||||||
|
// A minimal package.json is used to provide a realistic workspace anchor.
|
||||||
|
// This prevents the agent from making incorrect assumptions about the environment
|
||||||
|
// and helps it properly navigate or act as if it is in a standard Node.js project.
|
||||||
|
const MOCK_PACKAGE_JSON = JSON.stringify(
|
||||||
|
{
|
||||||
|
name: 'subagent-eval-project',
|
||||||
|
version: '1.0.0',
|
||||||
|
type: 'module',
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
2,
|
||||||
|
);
|
||||||
|
|
||||||
function readProjectFile(
|
function readProjectFile(
|
||||||
rig: { testDir?: string },
|
rig: { testDir: string | null },
|
||||||
relativePath: string,
|
relativePath: string,
|
||||||
): string {
|
): string {
|
||||||
return fs.readFileSync(path.join(rig.testDir!, relativePath), 'utf8');
|
return fs.readFileSync(path.join(rig.testDir!, relativePath), 'utf8');
|
||||||
@@ -117,15 +130,7 @@ describe('subagent eval test cases', () => {
|
|||||||
files: {
|
files: {
|
||||||
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
||||||
'index.ts': INDEX_TS,
|
'index.ts': INDEX_TS,
|
||||||
'package.json': JSON.stringify(
|
'package.json': MOCK_PACKAGE_JSON,
|
||||||
{
|
|
||||||
name: 'subagent-eval-project',
|
|
||||||
version: '1.0.0',
|
|
||||||
type: 'module',
|
|
||||||
},
|
|
||||||
null,
|
|
||||||
2,
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
assert: async (rig, _result) => {
|
assert: async (rig, _result) => {
|
||||||
const toolLogs = rig.readToolLogs() as Array<{
|
const toolLogs = rig.readToolLogs() as Array<{
|
||||||
@@ -164,15 +169,7 @@ describe('subagent eval test cases', () => {
|
|||||||
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
||||||
'index.ts': INDEX_TS,
|
'index.ts': INDEX_TS,
|
||||||
'README.md': 'TODO: update the README.\n',
|
'README.md': 'TODO: update the README.\n',
|
||||||
'package.json': JSON.stringify(
|
'package.json': MOCK_PACKAGE_JSON,
|
||||||
{
|
|
||||||
name: 'subagent-eval-project',
|
|
||||||
version: '1.0.0',
|
|
||||||
type: 'module',
|
|
||||||
},
|
|
||||||
null,
|
|
||||||
2,
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
assert: async (rig, _result) => {
|
assert: async (rig, _result) => {
|
||||||
const toolLogs = rig.readToolLogs() as Array<{
|
const toolLogs = rig.readToolLogs() as Array<{
|
||||||
@@ -190,4 +187,105 @@ describe('subagent eval test cases', () => {
|
|||||||
);
|
);
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks that the main agent can correctly select the appropriate subagent
|
||||||
|
* from a large pool of available subagents (10 total).
|
||||||
|
*/
|
||||||
|
evalTest('USUALLY_PASSES', {
|
||||||
|
name: 'should select the correct subagent from a pool of 10 different agents',
|
||||||
|
prompt: 'Please add a new SQL table migration for a user profile.',
|
||||||
|
files: {
|
||||||
|
...TEST_AGENTS.DOCS_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.DATABASE_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.CSS_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.I18N_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.SECURITY_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.DEVOPS_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.MOBILE_AGENT.asFile(),
|
||||||
|
'package.json': MOCK_PACKAGE_JSON,
|
||||||
|
},
|
||||||
|
assert: async (rig, _result) => {
|
||||||
|
const toolLogs = rig.readToolLogs() as Array<{
|
||||||
|
toolRequest: { name: string };
|
||||||
|
}>;
|
||||||
|
await rig.expectToolCallSuccess(['database-agent']);
|
||||||
|
|
||||||
|
// Ensure the generalist and other irrelevant specialists were not invoked
|
||||||
|
const uncalledAgents = [
|
||||||
|
'generalist',
|
||||||
|
TEST_AGENTS.DOCS_AGENT.name,
|
||||||
|
TEST_AGENTS.TESTING_AGENT.name,
|
||||||
|
TEST_AGENTS.CSS_AGENT.name,
|
||||||
|
TEST_AGENTS.I18N_AGENT.name,
|
||||||
|
TEST_AGENTS.SECURITY_AGENT.name,
|
||||||
|
TEST_AGENTS.DEVOPS_AGENT.name,
|
||||||
|
TEST_AGENTS.ANALYTICS_AGENT.name,
|
||||||
|
TEST_AGENTS.ACCESSIBILITY_AGENT.name,
|
||||||
|
TEST_AGENTS.MOBILE_AGENT.name,
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const agentName of uncalledAgents) {
|
||||||
|
expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe(
|
||||||
|
false,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks that the main agent can correctly select the appropriate subagent
|
||||||
|
* from a large pool of available subagents, even when many irrelevant MCP tools are present.
|
||||||
|
*
|
||||||
|
* This test stress-tests subagent delegation with ~80 tools present.
|
||||||
|
*/
|
||||||
|
evalTest('USUALLY_PASSES', {
|
||||||
|
name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
|
||||||
|
prompt: 'Please add a new SQL table migration for a user profile.',
|
||||||
|
setup: async (rig) => {
|
||||||
|
rig.addTestMcpServer('workspace-server', 'google-workspace');
|
||||||
|
},
|
||||||
|
files: {
|
||||||
|
...TEST_AGENTS.DOCS_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.DATABASE_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.CSS_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.I18N_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.SECURITY_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.DEVOPS_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
|
||||||
|
...TEST_AGENTS.MOBILE_AGENT.asFile(),
|
||||||
|
'package.json': MOCK_PACKAGE_JSON,
|
||||||
|
},
|
||||||
|
assert: async (rig, _result) => {
|
||||||
|
const toolLogs = rig.readToolLogs() as Array<{
|
||||||
|
toolRequest: { name: string };
|
||||||
|
}>;
|
||||||
|
await rig.expectToolCallSuccess(['database-agent']);
|
||||||
|
|
||||||
|
// Ensure the generalist and other irrelevant specialists were not invoked
|
||||||
|
const uncalledAgents = [
|
||||||
|
'generalist',
|
||||||
|
TEST_AGENTS.DOCS_AGENT.name,
|
||||||
|
TEST_AGENTS.TESTING_AGENT.name,
|
||||||
|
TEST_AGENTS.CSS_AGENT.name,
|
||||||
|
TEST_AGENTS.I18N_AGENT.name,
|
||||||
|
TEST_AGENTS.SECURITY_AGENT.name,
|
||||||
|
TEST_AGENTS.DEVOPS_AGENT.name,
|
||||||
|
TEST_AGENTS.ANALYTICS_AGENT.name,
|
||||||
|
TEST_AGENTS.ACCESSIBILITY_AGENT.name,
|
||||||
|
TEST_AGENTS.MOBILE_AGENT.name,
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const agentName of uncalledAgents) {
|
||||||
|
expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe(
|
||||||
|
false,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -61,6 +61,10 @@ export async function internalEvalTest(evalCase: EvalCase) {
|
|||||||
try {
|
try {
|
||||||
rig.setup(evalCase.name, evalCase.params);
|
rig.setup(evalCase.name, evalCase.params);
|
||||||
|
|
||||||
|
if (evalCase.setup) {
|
||||||
|
await evalCase.setup(rig);
|
||||||
|
}
|
||||||
|
|
||||||
if (evalCase.files) {
|
if (evalCase.files) {
|
||||||
await setupTestFiles(rig, evalCase.files);
|
await setupTestFiles(rig, evalCase.files);
|
||||||
}
|
}
|
||||||
@@ -371,6 +375,7 @@ export interface EvalCase {
|
|||||||
prompt: string;
|
prompt: string;
|
||||||
timeout?: number;
|
timeout?: number;
|
||||||
files?: Record<string, string>;
|
files?: Record<string, string>;
|
||||||
|
setup?: (rig: TestRig) => Promise<void> | void;
|
||||||
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
|
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
|
||||||
messages?: Record<string, unknown>[];
|
messages?: Record<string, unknown>[];
|
||||||
/** Session ID for the resumed session. Auto-generated if not provided. */
|
/** Session ID for the resumed session. Auto-generated if not provided. */
|
||||||
|
|||||||
@@ -69,4 +69,84 @@ export const TEST_AGENTS = {
|
|||||||
tools: ['read_file', 'write_file'],
|
tools: ['read_file', 'write_file'],
|
||||||
body: 'You are the test agent. Add or update tests.',
|
body: 'You are the test agent. Add or update tests.',
|
||||||
}),
|
}),
|
||||||
|
/**
|
||||||
|
* An agent with expertise in database schemas, SQL, and creating database migrations.
|
||||||
|
*/
|
||||||
|
DATABASE_AGENT: createAgent({
|
||||||
|
name: 'database-agent',
|
||||||
|
description:
|
||||||
|
'An expert in database schemas, SQL, and creating database migrations.',
|
||||||
|
tools: ['read_file', 'write_file'],
|
||||||
|
body: 'You are the database agent. Create and update SQL migrations.',
|
||||||
|
}),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An agent with expertise in CSS, styling, and UI design.
|
||||||
|
*/
|
||||||
|
CSS_AGENT: createAgent({
|
||||||
|
name: 'css-agent',
|
||||||
|
description: 'An expert in CSS, styling, and UI design.',
|
||||||
|
tools: ['read_file', 'write_file'],
|
||||||
|
body: 'You are the CSS agent.',
|
||||||
|
}),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An agent with expertise in internationalization and translations.
|
||||||
|
*/
|
||||||
|
I18N_AGENT: createAgent({
|
||||||
|
name: 'i18n-agent',
|
||||||
|
description: 'An expert in internationalization and translations.',
|
||||||
|
tools: ['read_file', 'write_file'],
|
||||||
|
body: 'You are the i18n agent.',
|
||||||
|
}),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An agent with expertise in security audits and vulnerability patches.
|
||||||
|
*/
|
||||||
|
SECURITY_AGENT: createAgent({
|
||||||
|
name: 'security-agent',
|
||||||
|
description: 'An expert in security audits and vulnerability patches.',
|
||||||
|
tools: ['read_file', 'write_file'],
|
||||||
|
body: 'You are the security agent.',
|
||||||
|
}),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An agent with expertise in CI/CD, Docker, and deployment scripts.
|
||||||
|
*/
|
||||||
|
DEVOPS_AGENT: createAgent({
|
||||||
|
name: 'devops-agent',
|
||||||
|
description: 'An expert in CI/CD, Docker, and deployment scripts.',
|
||||||
|
tools: ['read_file', 'write_file'],
|
||||||
|
body: 'You are the devops agent.',
|
||||||
|
}),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An agent with expertise in tracking, analytics, and metrics.
|
||||||
|
*/
|
||||||
|
ANALYTICS_AGENT: createAgent({
|
||||||
|
name: 'analytics-agent',
|
||||||
|
description: 'An expert in tracking, analytics, and metrics.',
|
||||||
|
tools: ['read_file', 'write_file'],
|
||||||
|
body: 'You are the analytics agent.',
|
||||||
|
}),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An agent with expertise in web accessibility and ARIA roles.
|
||||||
|
*/
|
||||||
|
ACCESSIBILITY_AGENT: createAgent({
|
||||||
|
name: 'accessibility-agent',
|
||||||
|
description: 'An expert in web accessibility and ARIA roles.',
|
||||||
|
tools: ['read_file', 'write_file'],
|
||||||
|
body: 'You are the accessibility agent.',
|
||||||
|
}),
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An agent with expertise in React Native and mobile app development.
|
||||||
|
*/
|
||||||
|
MOBILE_AGENT: createAgent({
|
||||||
|
name: 'mobile-agent',
|
||||||
|
description: 'An expert in React Native and mobile app development.',
|
||||||
|
tools: ['read_file', 'write_file'],
|
||||||
|
body: 'You are the mobile agent.',
|
||||||
|
}),
|
||||||
} as const;
|
} as const;
|
||||||
|
|||||||
Reference in New Issue
Block a user