// gemini-cli/evals/self_validation_workflow.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';

describe('Self-Validation Workflow', () => {
  /**
   * Verifies that the agent performs "Parallel Discovery" while exploring.
   *
   * Passes when the tool log contains both a `read_file` call whose arguments
   * mention `package.json` and at least one `grep_search`, `list_directory`
   * or `glob` call.
   *
   * NOTE(review): despite the test name, the assertions scan the whole run and
   * do not restrict either call to the first turn.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should perform parallel discovery in the first turn',
    files: {
      'package.json': JSON.stringify({
        name: 'test-project',
        scripts: { test: 'vitest' },
      }),
      'src/index.ts': 'export const main = () => console.log("hello");',
    },
    prompt: 'Explore the project and find where the main function is defined.',
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();

      // The turn-1 requirement is deliberately relaxed because the agent might
      // take a moment to bootstrap: both checks look at every logged tool call
      // instead of only those whose prompt_id ends with '#0'.
      const hasReadPackageJson = toolLogs.some(
        (log) =>
          log.toolRequest.name === 'read_file' &&
          log.toolRequest.args.includes('package.json'),
      );
      const hasSearch = toolLogs.some(
        (log) =>
          log.toolRequest.name === 'grep_search' ||
          log.toolRequest.name === 'list_directory' ||
          log.toolRequest.name === 'glob',
      );

      expect(
        hasReadPackageJson,
        'Should read package.json to discover scripts',
      ).toBe(true);
      expect(
        hasSearch,
        'Should perform search/listing to explore the project',
      ).toBe(true);
    },
  });

  /**
   * Verifies "Negative Verification": the agent must run the reproduction
   * before it applies the fix.
   *
   * The fix is identified as the first `replace` call whose arguments mention
   * `src/utils.ts`. The eval passes when that edit exists and at least one
   * `run_shell_command` call mentioning `test` was logged before it.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should confirm negative verification (repro fails) before fix',
    files: {
      'src/utils.ts':
        'export const square = (n: number) => n + n; // BUG: should be n * n',
      'package.json': JSON.stringify({
        name: 'test-project',
        scripts: { test: 'vitest run' },
        devDependencies: { vitest: '^1.0.0' },
      }),
    },
    prompt:
      'Fix the square function in src/utils.ts. Create a test to reproduce it first.',
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();
      const editIndex = toolLogs.findIndex(
        (log) =>
          log.toolRequest.name === 'replace' &&
          log.toolRequest.args.includes('src/utils.ts'),
      );

      // findIndex yields -1 when no such edit was logged. Without this guard,
      // slice(0, -1) below would silently mean "every call except the last"
      // and the ordering check would no longer be about the fix at all.
      expect(
        editIndex,
        'Should apply the fix to src/utils.ts with the replace tool',
      ).toBeGreaterThanOrEqual(0);

      const testRunsBeforeFix = toolLogs
        .slice(0, editIndex)
        .filter(
          (log) =>
            log.toolRequest.name === 'run_shell_command' &&
            log.toolRequest.args.includes('test'),
        );

      expect(
        testRunsBeforeFix.length,
        'Should run tests at least once before fix',
      ).toBeGreaterThanOrEqual(1);

      // Not asserted here: whether the agent acknowledged the failure in its
      // thoughts. The mandate is to "run this reproduction script and confirm
      // it fails"; only the "run before fix" half is checked.
    },
  });

  /**
   * Checks "Tail-First Navigation" on a large log file.
   *
   * The fixture log has 1000 filler lines followed by a single error line.
   * The eval passes when some `run_shell_command` call mentions `tail` or
   * `grep`, and no `read_file` call targets `large_log.log` without `limit`
   * appearing in its arguments.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should use tail-first navigation for large logs',
    files: {
      'src/bug.ts': 'console.log("error");',
      'large_log.log':
        'Line 1\n'.repeat(1000) +
        'CRITICAL ERROR: specific failure at the end\n',
    },
    prompt:
      'There is a failure at the end of large_log.log. Find it and explain the cause.',
    assert: async (rig) => {
      let sawTailOrGrep = false;
      let sawUnboundedRead = false;

      // Single pass over the log; each flag stops being re-evaluated once set.
      for (const { toolRequest: request } of rig.readToolLogs()) {
        if (
          !sawTailOrGrep &&
          request.name === 'run_shell_command' &&
          (request.args.includes('tail') || request.args.includes('grep'))
        ) {
          sawTailOrGrep = true;
        }

        if (
          !sawUnboundedRead &&
          request.name === 'read_file' &&
          request.args.includes('large_log.log') &&
          !request.args.includes('limit')
        ) {
          sawUnboundedRead = true;
        }
      }

      expect(sawTailOrGrep, 'Should use tail or grep for large logs').toBe(
        true,
      );
      expect(
        sawUnboundedRead,
        'Should not read the entire large log file',
      ).toBe(false);
    },
  });
});