Files
gemini-cli/evals/subtask_delegation.eval.ts

131 lines
4.1 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import {
TRACKER_CREATE_TASK_TOOL_NAME,
TRACKER_UPDATE_TASK_TOOL_NAME,
} from '@google/gemini-cli-core';
import { evalTest, TEST_AGENTS } from './test-helper.js';
describe('subtask delegation eval test cases', () => {
/**
 * Checks that the main agent can decompose a complex, sequential task into
 * subtasks using the task tracker and delegate each one to the appropriate
 * expert subagent.
 *
 * The task requires:
 * 1. Reading requirements (researcher)
 * 2. Implementing logic (developer)
 * 3. Documenting (doc expert)
 *
 * The assertions verify that at least three tracker tasks were created, that
 * every one of the three experts was invoked successfully, and that the
 * expected output files exist with the requested function name in the source.
 */
evalTest('USUALLY_PASSES', {
  name: 'should delegate sequential subtasks to relevant experts using the task tracker',
  params: {
    settings: {
      experimental: {
        enableAgents: true,
        taskTracker: true,
      },
    },
  },
  prompt:
    'Please read the requirements in requirements.txt using a researcher, then implement the requested logic in src/logic.ts using a developer, and finally document the implementation in docs/logic.md using a documentation expert.',
  files: {
    // Agent definitions: the template literal contents are written to disk
    // verbatim, so they must stay flush-left regardless of code indentation.
    '.gemini/agents/researcher.md': `---
name: researcher
description: Expert in reading files and extracting requirements.
tools:
- read_file
---
You are the researcher. Read the provided file and extract requirements.`,
    '.gemini/agents/developer.md': `---
name: developer
description: Expert in implementing logic in TypeScript.
tools:
- write_file
---
You are the developer. Implement the requested logic in the specified file.`,
    '.gemini/agents/doc-expert.md': `---
name: doc-expert
description: Expert in writing technical documentation.
tools:
- write_file
---
You are the doc expert. Document the provided implementation clearly.`,
    'requirements.txt':
      'Implement a function named "calculateSum" that adds two numbers.',
  },
  assert: async (rig, _result) => {
    // The tracker must have been used at all before its calls are counted.
    const wasCreateCalled = await rig.waitForToolCall(
      TRACKER_CREATE_TASK_TOOL_NAME,
    );
    expect(wasCreateCalled).toBe(true);

    // The prompt has three stages, so at least three tracker tasks are expected.
    const createCalls = rig
      .readToolLogs()
      .filter(
        (log) => log.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME,
      );
    expect(createCalls.length).toBeGreaterThanOrEqual(3);

    // Each expert is asserted on its own so that a run which delegates to only
    // a subset of them cannot satisfy the check.
    // NOTE(review): passing several names in one list is believed to match if
    // ANY of them succeeded — confirm against the TestRig helper. Checking one
    // name per call is correct under either interpretation.
    for (const agentName of ['researcher', 'developer', 'doc-expert']) {
      await rig.expectToolCallSuccess([agentName]);
    }

    // The delegated work must have produced both artifacts named in the prompt.
    const logicFile = rig.readFile('src/logic.ts');
    const docFile = rig.readFile('docs/logic.md');
    expect(logicFile).toContain('calculateSum');
    expect(docFile).toBeTruthy();
  },
});
/**
 * Checks that the main agent can delegate a batch of independent subtasks to
 * multiple specialist subagents, using the task tracker to manage state.
 *
 * The three subtasks in the prompt (i18n, security audit, CSS theme) have no
 * ordering dependency between them. The assertions verify that at least three
 * tracker tasks were created and that every specialist was invoked
 * successfully; they do not check how the delegations were scheduled.
 */
evalTest('USUALLY_PASSES', {
  name: 'should delegate independent subtasks to specialists using the task tracker',
  params: {
    settings: {
      experimental: {
        enableAgents: true,
        taskTracker: true,
      },
    },
  },
  prompt:
    'Please update the project for internationalization (i18n), audit the security of the current code, and update the CSS to use a blue theme. Use specialized experts for each task.',
  files: {
    ...TEST_AGENTS.I18N_AGENT.asFile(),
    ...TEST_AGENTS.SECURITY_AGENT.asFile(),
    ...TEST_AGENTS.CSS_AGENT.asFile(),
    'index.ts': 'console.log("Hello World");',
  },
  assert: async (rig, _result) => {
    // The tracker must have been used at all before its calls are counted.
    const wasCreateCalled = await rig.waitForToolCall(
      TRACKER_CREATE_TASK_TOOL_NAME,
    );
    expect(wasCreateCalled).toBe(true);

    // The prompt names three separate jobs, so at least three tracker tasks
    // are expected.
    const createCalls = rig
      .readToolLogs()
      .filter(
        (log) => log.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME,
      );
    expect(createCalls.length).toBeGreaterThanOrEqual(3);

    // Each specialist is asserted on its own so that a run which delegates to
    // only a subset of them cannot satisfy the check.
    // NOTE(review): passing several names in one list is believed to match if
    // ANY of them succeeded — confirm against the TestRig helper. Checking one
    // name per call is correct under either interpretation.
    const specialists = [
      TEST_AGENTS.I18N_AGENT,
      TEST_AGENTS.SECURITY_AGENT,
      TEST_AGENTS.CSS_AGENT,
    ];
    for (const specialist of specialists) {
      await rig.expectToolCallSuccess([specialist.name]);
    }
  },
});
});