mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-13 06:40:33 -07:00
323 lines
10 KiB
TypeScript
323 lines
10 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2026 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
import fs from 'node:fs';
|
|
import path from 'node:path';
|
|
|
|
import { describe, expect } from 'vitest';
|
|
|
|
import { AGENT_TOOL_NAME } from '@google/gemini-cli-core';
|
|
import { evalTest, TEST_AGENTS, TestRig } from './test-helper.js';
|
|
|
|
const INDEX_TS = 'export const add = (a: number, b: number) => a + b;\n';
|
|
|
|
/**
|
|
* Helper to verify that a specific subagent was successfully invoked via the unified tool.
|
|
*/
|
|
async function expectSubagentCall(rig: TestRig, agentName: string) {
|
|
await rig.expectToolCallSuccess(
|
|
[AGENT_TOOL_NAME],
|
|
undefined,
|
|
(args: string) => {
|
|
try {
|
|
const parsed = JSON.parse(args);
|
|
return parsed.agent_name === agentName;
|
|
} catch {
|
|
return false;
|
|
}
|
|
},
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Helper to check if a subagent (either via unified tool or direct name) was called.
|
|
*/
|
|
function isSubagentCalled(toolLogs: any[], agentName: string): boolean {
|
|
return toolLogs.some((l) => {
|
|
if (l.toolRequest.name === AGENT_TOOL_NAME) {
|
|
try {
|
|
const args = JSON.parse(l.toolRequest.args);
|
|
return args.agent_name === agentName;
|
|
} catch {
|
|
return false;
|
|
}
|
|
}
|
|
return l.toolRequest.name === agentName;
|
|
});
|
|
}
|
|
|
|
// A minimal package.json is used to provide a realistic workspace anchor.
|
|
// This prevents the agent from making incorrect assumptions about the environment
|
|
// and helps it properly navigate or act as if it is in a standard Node.js project.
|
|
const MOCK_PACKAGE_JSON = JSON.stringify(
|
|
{
|
|
name: 'subagent-eval-project',
|
|
version: '1.0.0',
|
|
type: 'module',
|
|
},
|
|
null,
|
|
2,
|
|
);
|
|
|
|
function readProjectFile(
|
|
rig: { testDir: string | null },
|
|
relativePath: string,
|
|
): string {
|
|
return fs.readFileSync(path.join(rig.testDir!, relativePath), 'utf8');
|
|
}
|
|
|
|
describe('subagent eval test cases', () => {
|
|
/**
|
|
* Checks whether the outer agent reliably utilizes an expert subagent to
|
|
* accomplish a task when one is available.
|
|
*
|
|
* Note that the test is intentionally crafted to avoid the word "document"
|
|
* or "docs". We want to see the outer agent make the connection even when
|
|
* the prompt indirectly implies need of expertise.
|
|
*
|
|
* This tests the system prompt's subagent specific clauses.
|
|
*/
|
|
evalTest('USUALLY_PASSES', {
|
|
suiteName: 'default',
|
|
suiteType: 'behavioral',
|
|
name: 'should delegate to user provided agent with relevant expertise',
|
|
params: {
|
|
settings: {
|
|
experimental: {
|
|
enableAgents: true,
|
|
},
|
|
},
|
|
},
|
|
prompt: 'Please update README.md with a description of this library.',
|
|
files: {
|
|
...TEST_AGENTS.DOCS_AGENT.asFile(),
|
|
'index.ts': INDEX_TS,
|
|
'README.md': 'TODO: update the README.\n',
|
|
},
|
|
assert: async (rig, _result) => {
|
|
await expectSubagentCall(rig, TEST_AGENTS.DOCS_AGENT.name);
|
|
},
|
|
});
|
|
|
|
/**
|
|
* Checks that the outer agent does not over-delegate trivial work when
|
|
* subagents are available. This helps catch orchestration overuse.
|
|
*/
|
|
evalTest('USUALLY_PASSES', {
|
|
suiteName: 'default',
|
|
suiteType: 'behavioral',
|
|
name: 'should avoid delegating trivial direct edit work',
|
|
params: {
|
|
settings: {
|
|
experimental: {
|
|
enableAgents: true,
|
|
agents: {
|
|
overrides: {
|
|
generalist: { enabled: true },
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
prompt:
|
|
'Rename the exported function in index.ts from add to sum and update the file directly.',
|
|
files: {
|
|
...TEST_AGENTS.DOCS_AGENT.asFile(),
|
|
'index.ts': INDEX_TS,
|
|
},
|
|
assert: async (rig, _result) => {
|
|
const updatedIndex = readProjectFile(rig, 'index.ts');
|
|
const toolLogs = rig.readToolLogs() as Array<{
|
|
toolRequest: { name: string };
|
|
}>;
|
|
|
|
expect(updatedIndex).toContain('export const sum =');
|
|
expect(isSubagentCalled(toolLogs, TEST_AGENTS.DOCS_AGENT.name)).toBe(
|
|
false,
|
|
);
|
|
expect(isSubagentCalled(toolLogs, 'generalist')).toBe(false);
|
|
},
|
|
});
|
|
|
|
/**
|
|
* Checks that the outer agent prefers a more relevant specialist over a
|
|
* broad generalist when both are available.
|
|
*
|
|
* This is meant to codify the "overusing Generalist" failure mode.
|
|
*/
|
|
evalTest('USUALLY_PASSES', {
|
|
suiteName: 'default',
|
|
suiteType: 'behavioral',
|
|
name: 'should prefer relevant specialist over generalist',
|
|
params: {
|
|
settings: {
|
|
experimental: {
|
|
enableAgents: true,
|
|
agents: {
|
|
overrides: {
|
|
generalist: { enabled: true },
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
prompt: 'Please add a small test file that verifies add(1, 2) returns 3.',
|
|
files: {
|
|
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
|
'index.ts': INDEX_TS,
|
|
'package.json': MOCK_PACKAGE_JSON,
|
|
},
|
|
assert: async (rig, _result) => {
|
|
const toolLogs = rig.readToolLogs() as Array<{
|
|
toolRequest: { name: string; args: string };
|
|
}>;
|
|
|
|
await expectSubagentCall(rig, TEST_AGENTS.TESTING_AGENT.name);
|
|
expect(isSubagentCalled(toolLogs, 'generalist')).toBe(false);
|
|
},
|
|
});
|
|
|
|
/**
|
|
* Checks cardinality and decomposition for a multi-surface task. The task
|
|
* naturally spans docs and tests, so multiple specialists should be used.
|
|
*/
|
|
evalTest('USUALLY_PASSES', {
|
|
suiteName: 'default',
|
|
suiteType: 'behavioral',
|
|
name: 'should use multiple relevant specialists for multi-surface task',
|
|
params: {
|
|
settings: {
|
|
experimental: {
|
|
enableAgents: true,
|
|
agents: {
|
|
overrides: {
|
|
generalist: { enabled: true },
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
prompt:
|
|
'Add a short README description for this library and also add a test file that verifies add(1, 2) returns 3.',
|
|
files: {
|
|
...TEST_AGENTS.DOCS_AGENT.asFile(),
|
|
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
|
'index.ts': INDEX_TS,
|
|
'README.md': 'TODO: update the README.\n',
|
|
'package.json': MOCK_PACKAGE_JSON,
|
|
},
|
|
assert: async (rig, _result) => {
|
|
const toolLogs = rig.readToolLogs() as Array<{
|
|
toolRequest: { name: string; args: string };
|
|
}>;
|
|
const readme = readProjectFile(rig, 'README.md');
|
|
|
|
await expectSubagentCall(rig, TEST_AGENTS.DOCS_AGENT.name);
|
|
await expectSubagentCall(rig, TEST_AGENTS.TESTING_AGENT.name);
|
|
|
|
expect(readme).not.toContain('TODO: update the README.');
|
|
expect(isSubagentCalled(toolLogs, 'generalist')).toBe(false);
|
|
},
|
|
});
|
|
|
|
/**
|
|
* Checks that the main agent can correctly select the appropriate subagent
|
|
* from a large pool of available subagents (10 total).
|
|
*/
|
|
evalTest('USUALLY_PASSES', {
|
|
suiteName: 'default',
|
|
suiteType: 'behavioral',
|
|
name: 'should select the correct subagent from a pool of 10 different agents',
|
|
prompt: 'Please add a new SQL table migration for a user profile.',
|
|
files: {
|
|
...TEST_AGENTS.DOCS_AGENT.asFile(),
|
|
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
|
...TEST_AGENTS.DATABASE_AGENT.asFile(),
|
|
...TEST_AGENTS.CSS_AGENT.asFile(),
|
|
...TEST_AGENTS.I18N_AGENT.asFile(),
|
|
...TEST_AGENTS.SECURITY_AGENT.asFile(),
|
|
...TEST_AGENTS.DEVOPS_AGENT.asFile(),
|
|
...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
|
|
...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
|
|
...TEST_AGENTS.MOBILE_AGENT.asFile(),
|
|
'package.json': MOCK_PACKAGE_JSON,
|
|
},
|
|
assert: async (rig, _result) => {
|
|
const toolLogs = rig.readToolLogs();
|
|
await expectSubagentCall(rig, TEST_AGENTS.DATABASE_AGENT.name);
|
|
|
|
// Ensure the generalist and other irrelevant specialists were not invoked
|
|
const uncalledAgents = [
|
|
TEST_AGENTS.DOCS_AGENT.name,
|
|
TEST_AGENTS.TESTING_AGENT.name,
|
|
TEST_AGENTS.CSS_AGENT.name,
|
|
TEST_AGENTS.I18N_AGENT.name,
|
|
TEST_AGENTS.SECURITY_AGENT.name,
|
|
TEST_AGENTS.DEVOPS_AGENT.name,
|
|
TEST_AGENTS.ANALYTICS_AGENT.name,
|
|
TEST_AGENTS.ACCESSIBILITY_AGENT.name,
|
|
TEST_AGENTS.MOBILE_AGENT.name,
|
|
];
|
|
|
|
for (const agentName of uncalledAgents) {
|
|
expect(isSubagentCalled(toolLogs, agentName)).toBe(false);
|
|
}
|
|
expect(isSubagentCalled(toolLogs, 'generalist')).toBe(false);
|
|
},
|
|
});
|
|
|
|
/**
|
|
* Checks that the main agent can correctly select the appropriate subagent
|
|
* from a large pool of available subagents, even when many irrelevant MCP tools are present.
|
|
*
|
|
* This test includes stress tests the subagent delegation with ~80 tools.
|
|
*/
|
|
evalTest('USUALLY_PASSES', {
|
|
suiteName: 'default',
|
|
suiteType: 'behavioral',
|
|
name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
|
|
prompt: 'Please add a new SQL table migration for a user profile.',
|
|
setup: async (rig) => {
|
|
rig.addTestMcpServer('workspace-server', 'google-workspace');
|
|
},
|
|
files: {
|
|
...TEST_AGENTS.DOCS_AGENT.asFile(),
|
|
...TEST_AGENTS.TESTING_AGENT.asFile(),
|
|
...TEST_AGENTS.DATABASE_AGENT.asFile(),
|
|
...TEST_AGENTS.CSS_AGENT.asFile(),
|
|
...TEST_AGENTS.I18N_AGENT.asFile(),
|
|
...TEST_AGENTS.SECURITY_AGENT.asFile(),
|
|
...TEST_AGENTS.DEVOPS_AGENT.asFile(),
|
|
...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
|
|
...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
|
|
...TEST_AGENTS.MOBILE_AGENT.asFile(),
|
|
'package.json': MOCK_PACKAGE_JSON,
|
|
},
|
|
assert: async (rig, _result) => {
|
|
const toolLogs = rig.readToolLogs();
|
|
await expectSubagentCall(rig, TEST_AGENTS.DATABASE_AGENT.name);
|
|
|
|
// Ensure the generalist and other irrelevant specialists were not invoked
|
|
const uncalledAgents = [
|
|
TEST_AGENTS.DOCS_AGENT.name,
|
|
TEST_AGENTS.TESTING_AGENT.name,
|
|
TEST_AGENTS.CSS_AGENT.name,
|
|
TEST_AGENTS.I18N_AGENT.name,
|
|
TEST_AGENTS.SECURITY_AGENT.name,
|
|
TEST_AGENTS.DEVOPS_AGENT.name,
|
|
TEST_AGENTS.ANALYTICS_AGENT.name,
|
|
TEST_AGENTS.ACCESSIBILITY_AGENT.name,
|
|
TEST_AGENTS.MOBILE_AGENT.name,
|
|
];
|
|
|
|
for (const agentName of uncalledAgents) {
|
|
expect(isSubagentCalled(toolLogs, agentName)).toBe(false);
|
|
}
|
|
expect(isSubagentCalled(toolLogs, 'generalist')).toBe(false);
|
|
},
|
|
});
|
|
});
|