mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-17 08:41:19 -07:00
test(evals): add subagent delegation evaluation tests (#24619)
This commit is contained in:
130
evals/subtask_delegation.eval.ts
Normal file
130
evals/subtask_delegation.eval.ts
Normal file
@@ -0,0 +1,130 @@
|
||||
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import {
  TRACKER_CREATE_TASK_TOOL_NAME,
  TRACKER_UPDATE_TASK_TOOL_NAME,
} from '@google/gemini-cli-core';
import { evalTest, TEST_AGENTS } from './test-helper.js';

/**
 * Evals covering how the main agent splits a multi-part request into tracker
 * tasks and hands each part to a subagent. Both cases enable the experimental
 * `enableAgents` and `taskTracker` settings.
 */
describe('subtask delegation eval test cases', () => {
  /**
   * Checks that the main agent can correctly decompose a complex, sequential
   * task into subtasks using the task tracker and delegate each to the
   * appropriate expert subagent.
   *
   * The task requires:
   * 1. Reading requirements (researcher)
   * 2. Implementing logic (developer)
   * 3. Documenting (doc expert)
   *
   * Asserted outcome: the tracker create-task tool was called at least three
   * times, each of the three subagents was invoked, `src/logic.ts` mentions
   * `calculateSum`, and `docs/logic.md` has non-empty content.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should delegate sequential subtasks to relevant experts using the task tracker',
    params: {
      settings: {
        experimental: {
          enableAgents: true,
          taskTracker: true,
        },
      },
    },
    prompt:
      'Please read the requirements in requirements.txt using a researcher, then implement the requested logic in src/logic.ts using a developer, and finally document the implementation in docs/logic.md using a documentation expert.',
    files: {
      // Agent definitions: frontmatter (name, description, tools) followed by
      // the agent's system prompt. Content is kept flush-left on purpose so
      // the frontmatter delimiters start at column 0 of the written file.
      '.gemini/agents/researcher.md': `---
name: researcher
description: Expert in reading files and extracting requirements.
tools:
- read_file
---
You are the researcher. Read the provided file and extract requirements.`,
      '.gemini/agents/developer.md': `---
name: developer
description: Expert in implementing logic in TypeScript.
tools:
- write_file
---
You are the developer. Implement the requested logic in the specified file.`,
      '.gemini/agents/doc-expert.md': `---
name: doc-expert
description: Expert in writing technical documentation.
tools:
- write_file
---
You are the doc expert. Document the provided implementation clearly.`,
      'requirements.txt':
        'Implement a function named "calculateSum" that adds two numbers.',
    },
    assert: async (rig, _result) => {
      // Verify tracker tasks were created
      const wasCreateCalled = await rig.waitForToolCall(
        TRACKER_CREATE_TASK_TOOL_NAME,
      );
      expect(
        wasCreateCalled,
        `Expected ${TRACKER_CREATE_TASK_TOOL_NAME} to be called at least once`,
      ).toBe(true);

      // One tracker task is expected per requested step (read, implement,
      // document), hence the lower bound of 3.
      const toolLogs = rig.readToolLogs();
      const createCalls = toolLogs.filter(
        (l) => l.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME,
      );
      expect(
        createCalls.length,
        `Expected at least 3 ${TRACKER_CREATE_TASK_TOOL_NAME} calls, one per subtask`,
      ).toBeGreaterThanOrEqual(3);

      // Subagents are looked up in the tool log by their agent name.
      await rig.expectToolCallSuccess([
        'researcher',
        'developer',
        'doc-expert',
      ]);

      const logicFile = rig.readFile('src/logic.ts');
      const docFile = rig.readFile('docs/logic.md');

      expect(
        logicFile,
        'Expected src/logic.ts to contain the requested calculateSum function',
      ).toContain('calculateSum');
      expect(
        docFile,
        'Expected docs/logic.md to have non-empty content',
      ).toBeTruthy();
    },
  });

  /**
   * Checks that the main agent can delegate a batch of independent subtasks
   * to multiple subagents using the task tracker to manage state.
   *
   * Asserted outcome: the tracker create-task tool was called at least three
   * times and each of the i18n, security, and CSS agents was invoked. No
   * explicit ordering or concurrency check is made here.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should delegate independent subtasks to specialists using the task tracker',
    params: {
      settings: {
        experimental: {
          enableAgents: true,
          taskTracker: true,
        },
      },
    },
    prompt:
      'Please update the project for internationalization (i18n), audit the security of the current code, and update the CSS to use a blue theme. Use specialized experts for each task.',
    files: {
      ...TEST_AGENTS.I18N_AGENT.asFile(),
      ...TEST_AGENTS.SECURITY_AGENT.asFile(),
      ...TEST_AGENTS.CSS_AGENT.asFile(),
      'index.ts': 'console.log("Hello World");',
    },
    assert: async (rig, _result) => {
      // Verify tracker tasks were created
      const wasCreateCalled = await rig.waitForToolCall(
        TRACKER_CREATE_TASK_TOOL_NAME,
      );
      expect(
        wasCreateCalled,
        `Expected ${TRACKER_CREATE_TASK_TOOL_NAME} to be called at least once`,
      ).toBe(true);

      // One tracker task is expected per independent request (i18n, security
      // audit, CSS theme), hence the lower bound of 3.
      const toolLogs = rig.readToolLogs();
      const createCalls = toolLogs.filter(
        (l) => l.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME,
      );
      expect(
        createCalls.length,
        `Expected at least 3 ${TRACKER_CREATE_TASK_TOOL_NAME} calls, one per subtask`,
      ).toBeGreaterThanOrEqual(3);

      await rig.expectToolCallSuccess([
        TEST_AGENTS.I18N_AGENT.name,
        TEST_AGENTS.SECURITY_AGENT.name,
        TEST_AGENTS.CSS_AGENT.name,
      ]);
    },
  });
});
|
||||
Reference in New Issue
Block a user