evals/generalist_delegation.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { appEvalTest } from './app-test-helper.js';

/**
 * Evals for whether the main agent hands work to the `generalist` sub-agent.
 *
 * Positive cases set a breakpoint on `generalist`, wait for a pending
 * confirmation for it, resolve that confirmation and wait for the rig to go
 * idle. Negative cases break on every tool (`'*'`), fail if any drained
 * confirmation names `generalist`, and then check the rig's static output.
 */
describe('generalist_delegation', () => {
  /**
   * Timeout handed to every rig wait in the positive evals.
   * NOTE(review): presumably milliseconds — confirm against the rig API.
   */
  const RIG_WAIT_TIMEOUT = 60000;

  /** Failure message used by the negative evals when `generalist` is called. */
  const NO_DELEGATION_MESSAGE =
    'Agent should NOT have delegated to generalist.';

  /**
   * Builds the config overrides shared by every eval in this suite: the
   * `generalist` agent override is enabled, `experimental.enableAgents` is on,
   * and `run_shell_command` is listed under `excludeTools`.
   *
   * A new object is returned on each call so that no two evals share (or can
   * accidentally mutate) the same config instance.
   */
  const generalistEnabledConfig = () => ({
    agents: {
      overrides: {
        generalist: { enabled: true },
      },
    },
    experimental: {
      enableAgents: true,
    },
    excludeTools: ['run_shell_command'],
  });

  // --- Positive Evals (Should Delegate) ---

  appEvalTest('USUALLY_PASSES', {
    name: 'should delegate batch error fixing to generalist agent',
    configOverrides: generalistEnabledConfig(),
    files: {
      'file1.ts': 'console.log("no semi")',
      'file2.ts': 'console.log("no semi")',
      'file3.ts': 'console.log("no semi")',
      'file4.ts': 'console.log("no semi")',
      'file5.ts': 'console.log("no semi")',
      'file6.ts': 'console.log("no semi")',
      'file7.ts': 'console.log("no semi")',
      'file8.ts': 'console.log("no semi")',
      'file9.ts': 'console.log("no semi")',
      'file10.ts': 'console.log("no semi")',
    },
    prompt:
      'I have 10 files (file1.ts to file10.ts) that are missing semicolons. Can you fix them?',
    setup: async (rig) => {
      // Only pause on the generalist call; other tools run without a breakpoint.
      rig.setBreakpoint(['generalist']);
    },
    assert: async (rig) => {
      const confirmation = await rig.waitForPendingConfirmation(
        'generalist',
        RIG_WAIT_TIMEOUT,
      );
      expect(
        confirmation,
        'Expected a tool call for generalist agent',
      ).toBeTruthy();
      // Resolve the pending confirmation, then wait for the rig to go idle.
      await rig.resolveTool(confirmation);
      await rig.waitForIdle(RIG_WAIT_TIMEOUT);
    },
  });

  appEvalTest('USUALLY_PASSES', {
    name: 'should autonomously delegate complex batch task to generalist agent',
    configOverrides: generalistEnabledConfig(),
    files: {
      'src/a.ts': 'export const a = 1;',
      'src/b.ts': 'export const b = 2;',
      'src/c.ts': 'export const c = 3;',
      'src/d.ts': 'export const d = 4;',
      'src/e.ts': 'export const e = 5;',
    },
    prompt:
      'Please update all files in the src directory. For each file, add a comment at the top that says "Processed by Gemini".',
    setup: async (rig) => {
      rig.setBreakpoint(['generalist']);
    },
    assert: async (rig) => {
      const confirmation = await rig.waitForPendingConfirmation(
        'generalist',
        RIG_WAIT_TIMEOUT,
      );
      expect(
        confirmation,
        'Expected autonomously delegate to generalist for batch task',
      ).toBeTruthy();
      await rig.resolveTool(confirmation);
      await rig.waitForIdle(RIG_WAIT_TIMEOUT);
    },
  });

  // --- Negative Evals (Should NOT Delegate - Assertive Handling) ---

  appEvalTest('USUALLY_PASSES', {
    name: 'should NOT delegate simple read and fix to generalist agent',
    configOverrides: generalistEnabledConfig(),
    files: {
      'README.md': 'This is a proyect.',
    },
    prompt:
      'There is a typo in README.md ("proyect"). Please fix it to "project".',
    setup: async (rig) => {
      // Break on everything to see what it calls
      rig.setBreakpoint(['*']);
    },
    assert: async (rig) => {
      // Every drained confirmation is checked; one `generalist` call fails the eval.
      await rig.drainBreakpointsUntilIdle((confirmation) => {
        expect(confirmation.toolName, NO_DELEGATION_MESSAGE).not.toBe(
          'generalist',
        );
      });

      // NOTE(review): the prompt itself contains "project"; if the static
      // output echoes the prompt this match cannot fail — confirm.
      const output = rig.getStaticOutput();
      expect(output).toMatch(/project/i);
    },
  });

  appEvalTest('USUALLY_PASSES', {
    name: 'should NOT delegate simple direct question to generalist agent',
    configOverrides: generalistEnabledConfig(),
    files: {
      'src/VERSION': '1.2.3',
    },
    prompt: 'Can you tell me the version number in the src folder?',
    setup: async (rig) => {
      rig.setBreakpoint(['*']);
    },
    assert: async (rig) => {
      await rig.drainBreakpointsUntilIdle((confirmation) => {
        expect(confirmation.toolName, NO_DELEGATION_MESSAGE).not.toBe(
          'generalist',
        );
      });

      // The version string from src/VERSION must appear in the static output.
      const output = rig.getStaticOutput();
      expect(output).toMatch(/1\.2\.3/);
    },
  });
});
feat(core): Enable generalist agent (#19665) 2026-02-26 08:38:49 -08:00
			`/**`
			`* @license`
			`* Copyright 2026 Google LLC`
			`* SPDX-License-Identifier: Apache-2.0`
			`*/`

			`import { describe, expect } from 'vitest';`
			`import { appEvalTest } from './app-test-helper.js';`

			`describe('generalist_delegation', () => {`
			`// --- Positive Evals (Should Delegate) ---`

			`appEvalTest('USUALLY_PASSES', {`
			`name: 'should delegate batch error fixing to generalist agent',`
			`configOverrides: {`
			`agents: {`
			`overrides: {`
			`generalist: { enabled: true },`
			`},`
			`},`
			`experimental: {`
			`enableAgents: true,`
			`},`
			`excludeTools: ['run_shell_command'],`
			`},`
			`files: {`
			`'file1.ts': 'console.log("no semi")',`
			`'file2.ts': 'console.log("no semi")',`
			`'file3.ts': 'console.log("no semi")',`
			`'file4.ts': 'console.log("no semi")',`
			`'file5.ts': 'console.log("no semi")',`
			`'file6.ts': 'console.log("no semi")',`
			`'file7.ts': 'console.log("no semi")',`
			`'file8.ts': 'console.log("no semi")',`
			`'file9.ts': 'console.log("no semi")',`
			`'file10.ts': 'console.log("no semi")',`
			`},`
			`prompt:`
			`'I have 10 files (file1.ts to file10.ts) that are missing semicolons. Can you fix them?',`
			`setup: async (rig) => {`
			`rig.setBreakpoint(['generalist']);`
			`},`
			`assert: async (rig) => {`
			`const confirmation = await rig.waitForPendingConfirmation(`
			`'generalist',`
			`60000,`
			`);`
			`expect(`
			`confirmation,`
			`'Expected a tool call for generalist agent',`
			`).toBeTruthy();`
			`await rig.resolveTool(confirmation);`
			`await rig.waitForIdle(60000);`
			`},`
			`});`

			`appEvalTest('USUALLY_PASSES', {`
			`name: 'should autonomously delegate complex batch task to generalist agent',`
			`configOverrides: {`
			`agents: {`
			`overrides: {`
			`generalist: { enabled: true },`
			`},`
			`},`
			`experimental: {`
			`enableAgents: true,`
			`},`
			`excludeTools: ['run_shell_command'],`
			`},`
			`files: {`
			`'src/a.ts': 'export const a = 1;',`
			`'src/b.ts': 'export const b = 2;',`
			`'src/c.ts': 'export const c = 3;',`
			`'src/d.ts': 'export const d = 4;',`
			`'src/e.ts': 'export const e = 5;',`
			`},`
			`prompt:`
			`'Please update all files in the src directory. For each file, add a comment at the top that says "Processed by Gemini".',`
			`setup: async (rig) => {`
			`rig.setBreakpoint(['generalist']);`
			`},`
			`assert: async (rig) => {`
			`const confirmation = await rig.waitForPendingConfirmation(`
			`'generalist',`
			`60000,`
			`);`
			`expect(`
			`confirmation,`
			`'Expected autonomously delegate to generalist for batch task',`
			`).toBeTruthy();`
			`await rig.resolveTool(confirmation);`
			`await rig.waitForIdle(60000);`
			`},`
			`});`

			`// --- Negative Evals (Should NOT Delegate - Assertive Handling) ---`

			`appEvalTest('USUALLY_PASSES', {`
			`name: 'should NOT delegate simple read and fix to generalist agent',`
			`configOverrides: {`
			`agents: {`
			`overrides: {`
			`generalist: { enabled: true },`
			`},`
			`},`
			`experimental: {`
			`enableAgents: true,`
			`},`
			`excludeTools: ['run_shell_command'],`
			`},`
			`files: {`
			`'README.md': 'This is a proyect.',`
			`},`
			`prompt:`
			`'There is a typo in README.md ("proyect"). Please fix it to "project".',`
			`setup: async (rig) => {`
			`// Break on everything to see what it calls`
			`rig.setBreakpoint(['*']);`
			`},`
			`assert: async (rig) => {`
			`await rig.drainBreakpointsUntilIdle((confirmation) => {`
			`expect(`
			`confirmation.toolName,`
			`` `Agent should NOT have delegated to generalist.`, ``
			`).not.toBe('generalist');`
			`});`

			`const output = rig.getStaticOutput();`
			`expect(output).toMatch(/project/i);`
			`},`
			`});`

			`appEvalTest('USUALLY_PASSES', {`
			`name: 'should NOT delegate simple direct question to generalist agent',`
			`configOverrides: {`
			`agents: {`
			`overrides: {`
			`generalist: { enabled: true },`
			`},`
			`},`
			`experimental: {`
			`enableAgents: true,`
			`},`
			`excludeTools: ['run_shell_command'],`
			`},`
			`files: {`
			`'src/VERSION': '1.2.3',`
			`},`
			`prompt: 'Can you tell me the version number in the src folder?',`
			`setup: async (rig) => {`
			`rig.setBreakpoint(['*']);`
			`},`
			`assert: async (rig) => {`
			`await rig.drainBreakpointsUntilIdle((confirmation) => {`
			`expect(`
			`confirmation.toolName,`
			`` `Agent should NOT have delegated to generalist.`, ``
			`).not.toBe('generalist');`
			`});`

			`const output = rig.getStaticOutput();`
			`expect(output).toMatch(/1\.2\.3/);`
			`},`
			`});`
			`});`