Steer outer agent to use expert subagents when present (#16763)

This commit is contained in:
Christian Gunderman
2026-01-16 16:51:10 +00:00
committed by GitHub
parent 4bb817de22
commit a15978593a
6 changed files with 122 additions and 5 deletions
+7
View File
@@ -88,6 +88,13 @@ describe('my_feature', () => {
## Running Evaluations
First, build the bundled Gemini CLI. You must do this after every code change.
```bash
npm run build
npm run bundle
```
### Always Passing Evals
To run the evaluations that are expected to always pass (CI safe):
+64
View File
@@ -0,0 +1,64 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe } from 'vitest';
import { evalTest } from './test-helper.js';
const AGENT_DEFINITION = `---
name: docs-agent
description: An agent with expertise in updating documentation.
tools:
- read_file
- write_file
---
You are the docs agent. Update the documentation.
`;
const INDEX_TS = 'export const add = (a: number, b: number) => a + b;';
describe('subagent eval test cases', () => {
/**
* Checks whether the outer agent reliably utilizes an expert subagent to
* accomplish a task when one is available.
*
* Note that the test is intentionally crafted to avoid the word "document"
* or "docs". We want to see the outer agent make the connection even when
* the prompt indirectly implies need of expertise.
*
* This tests the system prompt's subagent specific clauses.
*/
evalTest('ALWAYS_PASSES', {
name: 'should delegate to user provided agent with relevant expertise',
params: {
settings: {
experimental: {
enableAgents: true,
},
},
},
prompt: 'Please update README.md with a description of this library.',
files: {
'.gemini/agents/test-agent.md': AGENT_DEFINITION,
'index.ts': INDEX_TS,
'README.md': 'TODO: update the README.',
},
assert: async (rig, _result) => {
await rig.expectToolCallSuccess(
['delegate_to_agent'],
undefined,
(args) => {
try {
const parsed = JSON.parse(args);
return parsed.agent_name === 'docs-agent';
} catch {
return false;
}
},
);
},
});
});
+31 -2
View File
@@ -6,7 +6,10 @@
import { it } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import { execSync } from 'node:child_process';
import { TestRig } from '@google/gemini-cli-test-utils';
import { createUnauthorizedToolError } from '@google/gemini-cli-core';
export * from '@google/gemini-cli-test-utils';
@@ -32,8 +35,33 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
const fn = async () => {
const rig = new TestRig();
try {
await rig.setup(evalCase.name, evalCase.params);
rig.setup(evalCase.name, evalCase.params);
if (evalCase.files) {
for (const [filePath, content] of Object.entries(evalCase.files)) {
const fullPath = path.join(rig.testDir!, filePath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content);
}
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
execSync('git init', execOptions);
execSync('git config user.email "test@example.com"', execOptions);
execSync('git config user.name "Test User"', execOptions);
execSync('git add .', execOptions);
execSync('git commit --allow-empty -m "Initial commit"', execOptions);
}
const result = await rig.run({ args: evalCase.prompt });
const unauthorizedErrorPrefix =
createUnauthorizedToolError('').split("'")[0];
if (result.includes(unauthorizedErrorPrefix)) {
throw new Error(
'Test failed due to unauthorized tool call in output: ' + result,
);
}
await evalCase.assert(rig, result);
} finally {
await logToFile(
@@ -44,7 +72,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
}
};
if (policy === 'USUALLY_PASSES' && !process.env.RUN_EVALS) {
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
it.skip(evalCase.name, fn);
} else {
it(evalCase.name, fn);
@@ -55,6 +83,7 @@ export interface EvalCase {
name: string;
params?: Record<string, any>;
prompt: string;
files?: Record<string, string>;
assert: (rig: TestRig, result: string) => Promise<void>;
}