mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-03 16:34:31 -07:00
feat(evals): add behavioral evaluations for subagent routing (#23272)
Co-authored-by: Samee Zahid <sameez@google.com>
This commit is contained in:
+162
-7
@@ -4,21 +4,41 @@
|
|||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { describe } from 'vitest';
|
import fs from 'node:fs';
|
||||||
|
import path from 'node:path';
|
||||||
|
|
||||||
|
import { describe, expect } from 'vitest';
|
||||||
|
|
||||||
import { evalTest } from './test-helper.js';
|
import { evalTest } from './test-helper.js';
|
||||||
|
|
||||||
const AGENT_DEFINITION = `---
|
const DOCS_AGENT_DEFINITION = `---
|
||||||
name: docs-agent
|
name: docs-agent
|
||||||
description: An agent with expertise in updating documentation.
|
description: An agent with expertise in updating documentation.
|
||||||
tools:
|
tools:
|
||||||
- read_file
|
- read_file
|
||||||
- write_file
|
- write_file
|
||||||
---
|
---
|
||||||
|
You are the docs agent. Update documentation clearly and accurately.
|
||||||
You are the docs agent. Update the documentation.
|
|
||||||
`;
|
`;
|
||||||
|
|
||||||
const INDEX_TS = 'export const add = (a: number, b: number) => a + b;';
|
const TEST_AGENT_DEFINITION = `---
|
||||||
|
name: test-agent
|
||||||
|
description: An agent with expertise in writing and updating tests.
|
||||||
|
tools:
|
||||||
|
- read_file
|
||||||
|
- write_file
|
||||||
|
---
|
||||||
|
You are the test agent. Add or update tests.
|
||||||
|
`;
|
||||||
|
|
||||||
|
const INDEX_TS = 'export const add = (a: number, b: number) => a + b;\n';
|
||||||
|
|
||||||
|
/**
 * Reads a file from the eval rig's working directory as UTF-8 text.
 *
 * @param rig - Object exposing the rig's `testDir`. The field is optional in
 *   the type, so it is checked at runtime instead of being asserted non-null.
 * @param relativePath - Path of the file to read, joined onto `rig.testDir`.
 * @returns The file contents decoded as UTF-8.
 * @throws Error when `rig.testDir` is null or undefined. Errors raised by
 *   `fs.readFileSync` (for example a missing file) propagate unchanged.
 */
function readProjectFile(
  rig: { testDir?: string },
  relativePath: string,
): string {
  const { testDir } = rig;
  // Fail with a message that names the real problem rather than letting
  // path.join reject an undefined segment with a generic TypeError.
  if (testDir == null) {
    throw new Error(
      `Cannot read '${relativePath}': rig.testDir is not set.`,
    );
  }
  return fs.readFileSync(path.join(testDir, relativePath), 'utf8');
}
|
||||||
|
|
||||||
describe('subagent eval test cases', () => {
|
describe('subagent eval test cases', () => {
|
||||||
/**
|
/**
|
||||||
@@ -42,12 +62,147 @@ describe('subagent eval test cases', () => {
|
|||||||
},
|
},
|
||||||
prompt: 'Please update README.md with a description of this library.',
|
prompt: 'Please update README.md with a description of this library.',
|
||||||
files: {
|
files: {
|
||||||
'.gemini/agents/test-agent.md': AGENT_DEFINITION,
|
'.gemini/agents/docs-agent.md': DOCS_AGENT_DEFINITION,
|
||||||
'index.ts': INDEX_TS,
|
'index.ts': INDEX_TS,
|
||||||
'README.md': 'TODO: update the README.',
|
'README.md': 'TODO: update the README.\n',
|
||||||
},
|
},
|
||||||
assert: async (rig, _result) => {
|
assert: async (rig, _result) => {
|
||||||
await rig.expectToolCallSuccess(['docs-agent']);
|
await rig.expectToolCallSuccess(['docs-agent']);
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
 * Guards against orchestration overuse: with subagents available, a trivial
 * single-file rename should be performed by the outer agent itself rather
 * than being handed to a subagent.
 */
evalTest('USUALLY_PASSES', {
  name: 'should avoid delegating trivial direct edit work',
  params: {
    settings: {
      experimental: {
        enableAgents: true,
        agents: {
          overrides: {
            generalist: { enabled: true },
          },
        },
      },
    },
  },
  prompt:
    'Rename the exported function in index.ts from add to sum and update the file directly.',
  files: {
    '.gemini/agents/docs-agent.md': DOCS_AGENT_DEFINITION,
    'index.ts': INDEX_TS,
  },
  assert: async (rig, _result) => {
    const indexSource = readProjectFile(rig, 'index.ts');
    const recordedCalls = rig.readToolLogs() as Array<{
      toolRequest: { name: string };
    }>;
    // True when any recorded tool request carries the given tool name.
    const wasInvoked = (toolName: string): boolean =>
      recordedCalls.some((entry) => entry.toolRequest.name === toolName);

    // The rename must have landed in the file on disk.
    expect(indexSource).toContain('export const sum =');
    // Neither the custom docs agent nor the generalist should appear in the logs.
    expect(wasInvoked('docs-agent')).toBe(false);
    expect(wasInvoked('generalist')).toBe(false);
  },
});
|
||||||
|
|
||||||
|
/**
 * Checks that the outer agent prefers a more relevant specialist over a
 * broad generalist when both are available.
 *
 * This is meant to codify the "overusing Generalist" failure mode.
 *
 * The tool-log snapshot used for the negative `generalist` assertion is taken
 * after the awaited `test-agent` check, so it is never older than the logs
 * that check observed.
 */
evalTest('USUALLY_PASSES', {
  name: 'should prefer relevant specialist over generalist',
  params: {
    settings: {
      experimental: {
        enableAgents: true,
        agents: {
          overrides: {
            generalist: { enabled: true },
          },
        },
      },
    },
  },
  prompt: 'Please add a small test file that verifies add(1, 2) returns 3.',
  files: {
    '.gemini/agents/test-agent.md': TEST_AGENT_DEFINITION,
    'index.ts': INDEX_TS,
    'package.json': JSON.stringify(
      {
        name: 'subagent-eval-project',
        version: '1.0.0',
        type: 'module',
      },
      null,
      2,
    ),
  },
  assert: async (rig, _result) => {
    // The specialist defined in .gemini/agents/test-agent.md must have been used.
    await rig.expectToolCallSuccess(['test-agent']);

    // Read the logs only after the awaited check above.
    // NOTE(review): presumably the awaited helper waits for tool logs to be
    // available; confirm against the rig implementation.
    const toolLogs = rig.readToolLogs() as Array<{
      toolRequest: { name: string };
    }>;
    expect(toolLogs.some((l) => l.toolRequest.name === 'generalist')).toBe(
      false,
    );
  },
});
|
||||||
|
|
||||||
|
/**
 * Checks cardinality and decomposition for a multi-surface task. The task
 * naturally spans docs and tests, so multiple specialists should be used.
 *
 * Each specialist is asserted with its own `expectToolCallSuccess` call so
 * the test requires both, and the README and tool logs are read only after
 * those awaited checks have completed.
 */
evalTest('USUALLY_PASSES', {
  name: 'should use multiple relevant specialists for multi-surface task',
  params: {
    settings: {
      experimental: {
        enableAgents: true,
        agents: {
          overrides: {
            generalist: { enabled: true },
          },
        },
      },
    },
  },
  prompt:
    'Add a short README description for this library and also add a test file that verifies add(1, 2) returns 3.',
  files: {
    '.gemini/agents/docs-agent.md': DOCS_AGENT_DEFINITION,
    '.gemini/agents/test-agent.md': TEST_AGENT_DEFINITION,
    'index.ts': INDEX_TS,
    'README.md': 'TODO: update the README.\n',
    'package.json': JSON.stringify(
      {
        name: 'subagent-eval-project',
        version: '1.0.0',
        type: 'module',
      },
      null,
      2,
    ),
  },
  assert: async (rig, _result) => {
    // NOTE(review): a single call with both names may be satisfied by either
    // one, depending on how the rig helper treats the list; confirm. Separate
    // calls require a successful call from each specialist either way.
    await rig.expectToolCallSuccess(['docs-agent']);
    await rig.expectToolCallSuccess(['test-agent']);

    // Inspect on-disk state and the tool logs after the awaited checks.
    const readme = readProjectFile(rig, 'README.md');
    const toolLogs = rig.readToolLogs() as Array<{
      toolRequest: { name: string };
    }>;

    // The placeholder text must have been replaced by the docs work.
    expect(readme).not.toContain('TODO: update the README.');
    expect(toolLogs.some((l) => l.toolRequest.name === 'generalist')).toBe(
      false,
    );
  },
});
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user