Files
gemini-cli/evals/subagents.eval.ts
Christian Gunderman 02bf9db95c Enable generalist.
2026-02-16 12:19:18 -08:00

129 lines
3.7 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe } from 'vitest';
import { evalTest } from './test-helper.js';
/**
 * Contents of the custom agent file used by the delegation eval: a front-matter
 * header (name, description and permitted tools) followed by the agent's
 * prompt. The final empty entry produces the trailing newline.
 */
const AGENT_DEFINITION = [
  '---',
  'name: docs-agent',
  'description: An agent with expertise in updating documentation.',
  'tools:',
  '- read_file',
  '- write_file',
  '---',
  'You are the docs agent. Update the documentation.',
  '',
].join('\n');

/** One-line library source supplied as the workspace's `index.ts` fixture. */
const INDEX_TS = 'export const add = (a: number, b: number) => a + b;';
/**
 * Builds the text of an ESLint config module that turns on exactly one rule,
 * at "error" severity, for files matching `**\/*.js`.
 *
 * @param rule - Name of the ESLint rule to enable (e.g. `no-var`).
 * @returns The config file contents, including leading and trailing newline.
 */
const eslintConfigFor = (rule: string): string => `
module.exports = [
{
files: ["**/*.js"],
rules: {
"${rule}": "error"
}
}
];
`;

/**
 * Reports whether `source` still contains a `console.log` that has been
 * neither commented out nor silenced with an ESLint directive.
 *
 * A line mentioning `console.log` is considered neutralized when any of the
 * following holds:
 * - after trimming, it starts with `//`;
 * - it contains `eslint-disable-line`;
 * - the line directly above it contains `eslint-disable`.
 *
 * @param source - Full text of the file to inspect.
 * @returns `true` if at least one un-neutralized `console.log` line exists.
 */
function hasActiveConsoleLog(source: string): boolean {
  const lines = source.split('\n');
  return lines.some((line, index) => {
    if (!line.includes('console.log')) {
      return false;
    }
    if (line.trim().startsWith('//') || line.includes('eslint-disable-line')) {
      return false;
    }
    // The first line has no predecessor; guard the lookup so the check also
    // type-checks when indexed access may yield `undefined`.
    const previous = index > 0 ? lines[index - 1] : undefined;
    const disabledAbove = previous?.includes('eslint-disable') ?? false;
    return !disabledAbove;
  });
}

/**
 * Reports whether the JSON-encoded arguments of a `delegate_to_agent` tool
 * call select the `generalist` agent.
 *
 * The payload is parsed as `unknown` and narrowed before use; anything that is
 * not valid JSON, or not an object whose `agent_name` is `'generalist'`,
 * yields `false`.
 *
 * @param rawArgs - The tool request's serialized arguments.
 */
function targetsGeneralist(rawArgs: string): boolean {
  let parsed: unknown;
  try {
    parsed = JSON.parse(rawArgs);
  } catch {
    return false;
  }
  return (
    typeof parsed === 'object' &&
    parsed !== null &&
    'agent_name' in parsed &&
    parsed.agent_name === 'generalist'
  );
}

describe('subagent eval test cases', () => {
  /**
   * Verifies that the outer agent hands work to a user-provided expert
   * subagent when one is available.
   *
   * The prompt deliberately avoids the words "document" and "docs": the outer
   * agent has to infer that the docs agent's expertise is relevant from an
   * indirect request.
   *
   * This exercises the subagent-specific clauses of the system prompt.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should delegate to user provided agent with relevant expertise',
    params: {
      settings: {
        experimental: {
          enableAgents: true,
        },
      },
    },
    prompt: 'Please update README.md with a description of this library.',
    files: {
      '.gemini/agents/test-agent.md': AGENT_DEFINITION,
      'index.ts': INDEX_TS,
      'README.md': 'TODO: update the README.',
    },
    assert: async (rig, _result) => {
      await rig.expectToolCallSuccess(['docs-agent']);
    },
  });

  /**
   * Verifies that, given two projects that each enable a different lint rule,
   * the agent fixes the offending file in both and records at least two
   * `generalist` subagent invocations (called directly or through
   * `delegate_to_agent`).
   */
  evalTest('ALWAYS_PASSES', {
    name: 'should fix linter errors in multiple projects using implicit parallelism',
    prompt: 'Fix all linter errors.',
    // NOTE(review): presumably milliseconds, i.e. ten minutes; confirm
    // against the helper that consumes this option.
    timeout: 600000,
    files: {
      'project-a/eslint.config.js': eslintConfigFor('no-var'),
      'project-a/index.js': 'var x = 1;',
      'project-b/eslint.config.js': eslintConfigFor('no-console'),
      'project-b/main.js': 'console.log("hello");',
    },
    assert: async (rig) => {
      // Read both files up front so a missing file surfaces before any
      // content check runs.
      const fileA = rig.readFile('project-a/index.js');
      const fileB = rig.readFile('project-b/main.js');

      if (fileA.includes('var x')) {
        throw new Error(`project-a/index.js was not fixed. Content:\n${fileA}`);
      }

      // A console.log may remain only if it is commented out or disabled.
      if (hasActiveConsoleLog(fileB)) {
        throw new Error(
          `project-b/main.js was not fixed (console.log present without disable/comment). Content:\n${fileB}`,
        );
      }

      // The agent is expected to delegate to a subagent for each project.
      const subagentCalls = rig
        .readToolLogs()
        .filter(
          ({ toolRequest }) =>
            toolRequest.name === 'generalist' ||
            (toolRequest.name === 'delegate_to_agent' &&
              targetsGeneralist(toolRequest.args)),
        );

      if (subagentCalls.length < 2) {
        throw new Error(
          `Expected at least 2 generalist calls, but found ${subagentCalls.length}`,
        );
      }
    },
  });
});