mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-10 14:10:37 -07:00
166 lines
4.5 KiB
TypeScript
166 lines
4.5 KiB
TypeScript
|
|
/**
|
||
|
|
* @license
|
||
|
|
* Copyright 2026 Google LLC
|
||
|
|
* SPDX-License-Identifier: Apache-2.0
|
||
|
|
*/
|
||
|
|
|
||
|
|
import { describe, expect } from 'vitest';
|
||
|
|
import { appEvalTest } from './app-test-helper.js';
|
||
|
|
|
||
|
|
describe('generalist_delegation', () => {
|
||
|
|
// --- Positive Evals (Should Delegate) ---
|
||
|
|
|
||
|
|
// Positive eval: ten files with the same defect are presented in one request,
// and the run is expected to surface a pending 'generalist' tool call.
appEvalTest('USUALLY_PASSES', {
  name: 'should delegate batch error fixing to generalist agent',
  configOverrides: {
    // Generalist agent switched on; shell tool removed from the tool set.
    agents: { overrides: { generalist: { enabled: true } } },
    experimental: { enableAgents: true },
    excludeTools: ['run_shell_command'],
  },
  // Ten identical fixtures, each missing its trailing semicolon.
  files: {
    'file1.ts': 'console.log("no semi")',
    'file2.ts': 'console.log("no semi")',
    'file3.ts': 'console.log("no semi")',
    'file4.ts': 'console.log("no semi")',
    'file5.ts': 'console.log("no semi")',
    'file6.ts': 'console.log("no semi")',
    'file7.ts': 'console.log("no semi")',
    'file8.ts': 'console.log("no semi")',
    'file9.ts': 'console.log("no semi")',
    'file10.ts': 'console.log("no semi")',
  },
  prompt:
    'I have 10 files (file1.ts to file10.ts) that are missing semicolons. Can you fix them?',
  async setup(rig) {
    // Break only on the generalist tool so the delegation can be observed.
    rig.setBreakpoint(['generalist']);
  },
  async assert(rig) {
    // Same budget is used for both waits (presumably milliseconds — confirm
    // against the rig helper).
    const waitBudget = 60000;

    const pendingCall = await rig.waitForPendingConfirmation(
      'generalist',
      waitBudget,
    );
    expect(pendingCall, 'Expected a tool call for generalist agent').toBeTruthy();

    // Let the delegated call proceed, then wait for the run to go idle.
    await rig.resolveTool(pendingCall);
    await rig.waitForIdle(waitBudget);
  },
});
|
||
|
|
|
||
|
|
// Positive eval: the prompt asks for the same edit across every file in src/
// without naming an agent; a pending 'generalist' tool call is expected.
appEvalTest('USUALLY_PASSES', {
  name: 'should autonomously delegate complex batch task to generalist agent',
  configOverrides: {
    // Generalist agent switched on; shell tool removed from the tool set.
    agents: { overrides: { generalist: { enabled: true } } },
    experimental: { enableAgents: true },
    excludeTools: ['run_shell_command'],
  },
  // Five small modules under src/ that the prompt asks to be annotated.
  files: {
    'src/a.ts': 'export const a = 1;',
    'src/b.ts': 'export const b = 2;',
    'src/c.ts': 'export const c = 3;',
    'src/d.ts': 'export const d = 4;',
    'src/e.ts': 'export const e = 5;',
  },
  prompt:
    'Please update all files in the src directory. For each file, add a comment at the top that says "Processed by Gemini".',
  async setup(rig) {
    // Break only on the generalist tool so the delegation can be observed.
    rig.setBreakpoint(['generalist']);
  },
  async assert(rig) {
    // Same budget is used for both waits (presumably milliseconds — confirm
    // against the rig helper).
    const waitBudget = 60000;

    const pendingCall = await rig.waitForPendingConfirmation(
      'generalist',
      waitBudget,
    );
    expect(
      pendingCall,
      'Expected autonomously delegate to generalist for batch task',
    ).toBeTruthy();

    // Let the delegated call proceed, then wait for the run to go idle.
    await rig.resolveTool(pendingCall);
    await rig.waitForIdle(waitBudget);
  },
});
|
||
|
|
|
||
|
|
// --- Negative Evals (Should NOT Delegate - Assertive Handling) ---
|
||
|
|
|
||
|
|
// Negative eval: a single-word typo fix in one file. No observed tool call
// may be named 'generalist', and the static output must mention "project".
appEvalTest('USUALLY_PASSES', {
  name: 'should NOT delegate simple read and fix to generalist agent',
  configOverrides: {
    // Generalist agent is available, so avoiding it is a real choice.
    agents: { overrides: { generalist: { enabled: true } } },
    experimental: { enableAgents: true },
    excludeTools: ['run_shell_command'],
  },
  files: { 'README.md': 'This is a proyect.' },
  prompt:
    'There is a typo in README.md ("proyect"). Please fix it to "project".',
  async setup(rig) {
    // Wildcard breakpoint: stop on every tool call so each one can be checked.
    rig.setBreakpoint(['*']);
  },
  async assert(rig) {
    // Inspect every breakpoint hit until the run goes idle.
    await rig.drainBreakpointsUntilIdle((toolCall) => {
      expect(
        toolCall.toolName,
        'Agent should NOT have delegated to generalist.',
      ).not.toBe('generalist');
    });

    const transcript = rig.getStaticOutput();
    expect(transcript).toMatch(/project/i);
  },
});
|
||
|
|
|
||
|
|
// Negative eval: a direct lookup question about one file. No observed tool
// call may be named 'generalist', and the static output must contain "1.2.3".
appEvalTest('USUALLY_PASSES', {
  name: 'should NOT delegate simple direct question to generalist agent',
  configOverrides: {
    // Generalist agent is available, so avoiding it is a real choice.
    agents: { overrides: { generalist: { enabled: true } } },
    experimental: { enableAgents: true },
    excludeTools: ['run_shell_command'],
  },
  files: { 'src/VERSION': '1.2.3' },
  prompt: 'Can you tell me the version number in the src folder?',
  async setup(rig) {
    // Wildcard breakpoint: stop on every tool call so each one can be checked.
    rig.setBreakpoint(['*']);
  },
  async assert(rig) {
    // Inspect every breakpoint hit until the run goes idle.
    await rig.drainBreakpointsUntilIdle((toolCall) => {
      expect(
        toolCall.toolName,
        'Agent should NOT have delegated to generalist.',
      ).not.toBe('generalist');
    });

    const transcript = rig.getStaticOutput();
    expect(transcript).toMatch(/1\.2\.3/);
  },
});
|
||
|
|
});
|