gemini-cli/evals/generalist_delegation.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { appEvalTest } from './app-test-helper.js';

/**
 * Evals for delegation to the `generalist` agent tool.
 *
 * Two evals expect a `generalist` tool call for batch, multi-file work; two
 * expect that no `generalist` call is made for a single small task. Every eval
 * runs with the generalist agent override enabled, the experimental agents
 * flag on, and `run_shell_command` excluded.
 */
describe('generalist_delegation', () => {
  // Tool name watched for by every eval in this suite.
  const GENERALIST = 'generalist';

  // Limit passed to the rig's wait helpers (presumably milliseconds — confirm
  // against the rig API).
  const WAIT_LIMIT = 60000;

  // Assertion message shared by the two negative evals.
  const UNEXPECTED_DELEGATION =
    'Agent should NOT have delegated to generalist.';

  /**
   * Builds the config overrides shared by all evals. A new object is returned
   * on every call so that no two evals hold the same instance.
   */
  const generalistConfig = () => ({
    agents: {
      overrides: {
        generalist: { enabled: true },
      },
    },
    experimental: {
      enableAgents: true,
    },
    excludeTools: ['run_shell_command'],
  });

  /**
   * Builds the fixture for the batch-fix eval: file1.ts through file10.ts, in
   * that order, each holding one statement without a trailing semicolon.
   */
  const missingSemicolonFiles = (): Record<string, string> => {
    const fixture: Record<string, string> = {};
    for (let index = 1; index <= 10; index += 1) {
      fixture[`file${index}.ts`] = 'console.log("no semi")';
    }
    return fixture;
  };

  // --- Positive evals: a generalist tool call is expected ---

  // Ten files need the same mechanical fix; passes once a generalist call is
  // pending, has been resolved, and the rig has gone idle.
  appEvalTest('USUALLY_PASSES', {
    name: 'should delegate batch error fixing to generalist agent',
    configOverrides: generalistConfig(),
    files: missingSemicolonFiles(),
    prompt:
      'I have 10 files (file1.ts to file10.ts) that are missing semicolons. Can you fix them?',
    setup: async (rig) => {
      // Only the generalist tool is registered as a breakpoint here.
      rig.setBreakpoint([GENERALIST]);
    },
    assert: async (rig) => {
      const pendingCall = await rig.waitForPendingConfirmation(
        GENERALIST,
        WAIT_LIMIT,
      );
      expect(
        pendingCall,
        'Expected a tool call for generalist agent',
      ).toBeTruthy();
      await rig.resolveTool(pendingCall);
      await rig.waitForIdle(WAIT_LIMIT);
    },
  });

  // The prompt names a directory rather than individual files; the same
  // generalist confirmation is still expected.
  appEvalTest('USUALLY_PASSES', {
    name: 'should autonomously delegate complex batch task to generalist agent',
    configOverrides: generalistConfig(),
    files: {
      'src/a.ts': 'export const a = 1;',
      'src/b.ts': 'export const b = 2;',
      'src/c.ts': 'export const c = 3;',
      'src/d.ts': 'export const d = 4;',
      'src/e.ts': 'export const e = 5;',
    },
    prompt:
      'Please update all files in the src directory. For each file, add a comment at the top that says "Processed by Gemini".',
    setup: async (rig) => {
      rig.setBreakpoint([GENERALIST]);
    },
    assert: async (rig) => {
      const pendingCall = await rig.waitForPendingConfirmation(
        GENERALIST,
        WAIT_LIMIT,
      );
      expect(
        pendingCall,
        'Expected autonomously delegate to generalist for batch task',
      ).toBeTruthy();
      await rig.resolveTool(pendingCall);
      await rig.waitForIdle(WAIT_LIMIT);
    },
  });

  // --- Negative evals: no generalist tool call is expected ---

  // A one-word typo fix in a single file.
  appEvalTest('USUALLY_PASSES', {
    name: 'should NOT delegate simple read and fix to generalist agent',
    configOverrides: generalistConfig(),
    files: {
      'README.md': 'This is a proyect.',
    },
    prompt:
      'There is a typo in README.md ("proyect"). Please fix it to "project".',
    setup: async (rig) => {
      // Wildcard breakpoint so every tool call can be inspected (assumes '*'
      // matches all tools — confirm against the rig API).
      rig.setBreakpoint(['*']);
    },
    assert: async (rig) => {
      // Each confirmation handed to the callback must name some other tool.
      await rig.drainBreakpointsUntilIdle((pendingCall) => {
        expect(pendingCall.toolName, UNEXPECTED_DELEGATION).not.toBe(
          GENERALIST,
        );
      });

      // The static output has to mention the corrected word.
      expect(rig.getStaticOutput()).toMatch(/project/i);
    },
  });

  // A direct question answerable from one small file.
  appEvalTest('USUALLY_PASSES', {
    name: 'should NOT delegate simple direct question to generalist agent',
    configOverrides: generalistConfig(),
    files: {
      'src/VERSION': '1.2.3',
    },
    prompt: 'Can you tell me the version number in the src folder?',
    setup: async (rig) => {
      rig.setBreakpoint(['*']);
    },
    assert: async (rig) => {
      await rig.drainBreakpointsUntilIdle((pendingCall) => {
        expect(pendingCall.toolName, UNEXPECTED_DELEGATION).not.toBe(
          GENERALIST,
        );
      });

      // The static output has to contain the version string from src/VERSION.
      expect(rig.getStaticOutput()).toMatch(/1\.2\.3/);
    },
  });
});