evals/error_grounding_and_scope.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';

describe('Error Grounding and Scope Isolation', () => {
  /**
   * Grounding eval: the workspace holds a single type error in `src/app.ts`
   * and a `typecheck` script that redirects the compiler output into
   * `error.log`. The eval passes when the tool log shows both
   *   1. a `run_shell_command` call whose args mention `typecheck`, and
   *   2. a `read_file` call whose args mention `error.log` or `app.ts`.
   *
   * Note that the relative order of the two calls is not checked.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should read the full error message when validation fails',
    files: {
      'src/app.ts': 'export const x: number = "string"; // Error',
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: { typecheck: 'tsc --noEmit > error.log 2>&1' },
      }),
      'tsconfig.json': JSON.stringify({
        compilerOptions: { strict: true, module: 'ESNext', target: 'ESNext' },
      }),
    },
    prompt:
      'Run typecheck and fix the error in src/app.ts. Use redirection to a file if needed.',
    assert: async (rig) => {
      const requests = rig.readToolLogs().map((entry) => entry.toolRequest);

      // True when some call to `tool` carries at least one of `needles` in
      // its args.
      const invoked = (tool: string, needles: string[]): boolean =>
        requests.some(
          (request) =>
            request.name === tool &&
            needles.some((needle) => request.args.includes(needle)),
        );

      const typecheckWasRun = invoked('run_shell_command', ['typecheck']);
      const groundingFileWasRead = invoked('read_file', ['error.log', 'app.ts']);

      expect(
        typecheckWasRun,
        'Agent should have run the typecheck command',
      ).toBe(true);
      expect(
        groundingFileWasRead,
        'Agent should have read the error log or the file to understand the error grounding',
      ).toBe(true);
    },
  });

  /**
   * Verifies that the agent ignores pre-existing technical debt.
   *
   * The workspace contains `src/legacy.ts`, which uses an explicit `any`
   * that the ESLint config marks as an error, and `src/new.ts`, which is the
   * only file the prompt asks to change. The eval passes when:
   *   1. no file-modifying tool call targets `src/legacy.ts`, and
   *   2. some tool call mentions both `src/new.ts` and `updated`.
   *
   * Merely reading or listing `src/legacy.ts` is allowed; only edits to it
   * count as a scope violation.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should ignore unrelated pre-existing technical debt during validation',
    files: {
      'src/legacy.ts':
        'export const legacy: any = 1; // Unrelated technical debt',
      'src/new.ts': 'export const current = 42;',
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: {
          lint: 'eslint .',
        },
      }),
      'eslint.config.js':
        'export default [{ rules: { "no-explicit-any": "error" } }];',
    },
    prompt:
      'Rename "current" to "updated" in src/new.ts. Ignore pre-existing lint errors in other files.',
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();

      // Only tools that modify file contents count as an "edit". Without this
      // filter, a harmless read_file or shell `cat` of src/legacy.ts would
      // fail the eval even though nothing was changed.
      // NOTE(review): assumes 'replace' and 'write_file' are the file-editing
      // tool names — confirm against the tool registry. Edits performed via
      // run_shell_command (e.g. sed) are not detected by this check.
      const editToolNames = ['replace', 'write_file'];

      const editedLegacy = toolLogs.some(
        (log) =>
          editToolNames.includes(log.toolRequest.name) &&
          log.toolRequest.args.includes('src/legacy.ts'),
      );

      expect(
        editedLegacy,
        'Agent should NOT have edited src/legacy.ts to fix unrelated pre-existing debt',
      ).toBe(false);

      // Any tool call that mentions both the target file and the new name is
      // accepted as evidence of the rename (edit tools or a shell command).
      const editedNew = toolLogs.some(
        (log) =>
          log.toolRequest.args.includes('src/new.ts') &&
          log.toolRequest.args.includes('updated'),
      );
      expect(
        editedNew,
        'Agent should have successfully refactored src/new.ts',
      ).toBe(true);
    },
  });
});
feat(core): comprehensive agent self-validation and engineering mandates 2026-02-20 14:22:54 -08:00			`/**`
			`* @license`
			`* Copyright 2026 Google LLC`
			`* SPDX-License-Identifier: Apache-2.0`
			`*/`

			`import { describe, expect } from 'vitest';`
			`import { evalTest } from './test-helper.js';`

			`describe('Error Grounding and Scope Isolation', () => {`
			`/**`
			`* Verifies that the agent reads the error log when validation fails.`
			`*/`
			`evalTest('USUALLY_PASSES', {`
			`name: 'should read the full error message when validation fails',`
			`files: {`
			`'src/app.ts': 'export const x: number = "string"; // Error',`
			`'package.json': JSON.stringify({`
			`name: 'test-project',`
			`type: 'module',`
			`scripts: {`
			`typecheck: 'tsc --noEmit > error.log 2>&1',`
			`},`
			`}),`
			`'tsconfig.json': JSON.stringify({`
			`compilerOptions: { strict: true, module: 'ESNext', target: 'ESNext' },`
			`}),`
			`},`
			`prompt:`
			`'Run typecheck and fix the error in src/app.ts. Use redirection to a file if needed.',`
			`assert: async (rig) => {`
			`const toolLogs = rig.readToolLogs();`

			`// Check if it read the error log after running the command`
			`const ranTypecheck = toolLogs.some(`
			`(log) =>`
			`log.toolRequest.name === 'run_shell_command' &&`
			`log.toolRequest.args.includes('typecheck'),`
			`);`

			`const readErrorLog = toolLogs.some(`
			`(log) =>`
			`log.toolRequest.name === 'read_file' &&`
			`(log.toolRequest.args.includes('error.log') \|\|`
			`log.toolRequest.args.includes('app.ts')),`
			`);`

			`expect(ranTypecheck, 'Agent should have run the typecheck command').toBe(`
			`true,`
			`);`
			`expect(`
			`readErrorLog,`
			`'Agent should have read the error log or the file to understand the error grounding',`
			`).toBe(true);`
			`},`
			`});`

			`/**`
			`* Verifies that the agent ignores pre-existing technical debt.`
			`*/`
			`evalTest('USUALLY_PASSES', {`
			`name: 'should ignore unrelated pre-existing technical debt during validation',`
			`files: {`
			`'src/legacy.ts':`
			`'export const legacy: any = 1; // Unrelated technical debt',`
			`'src/new.ts': 'export const current = 42;',`
			`'package.json': JSON.stringify({`
			`name: 'test-project',`
			`type: 'module',`
			`scripts: {`
			`lint: 'eslint .',`
			`},`
			`}),`
			`'eslint.config.js':`
			`'export default [{ rules: { "no-explicit-any": "error" } }];',`
			`},`
			`prompt:`
			`'Rename "current" to "updated" in src/new.ts. Ignore pre-existing lint errors in other files.',`
			`assert: async (rig) => {`
			`const toolLogs = rig.readToolLogs();`

			`const editedLegacy = toolLogs.some((log) =>`
			`log.toolRequest.args.includes('src/legacy.ts'),`
			`);`

			`expect(`
			`editedLegacy,`
			`'Agent should NOT have edited src/legacy.ts to fix unrelated pre-existing debt',`
			`).toBe(false);`

			`const editedNew = toolLogs.some(`
			`(log) =>`
			`log.toolRequest.args.includes('src/new.ts') &&`
			`log.toolRequest.args.includes('updated'),`
			`);`
			`expect(`
			`editedNew,`
			`'Agent should have successfully refactored src/new.ts',`
			`).toBe(true);`
			`},`
			`});`
			`});`