evals/incremental_validation.eval.ts

/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';

describe('Incremental Validation', () => {
  /**
   * This evaluation verifies that the agent adheres to the "Incremental Validation" mandate
   * by performing build or test checks between distinct, significant file changes.
   *
   * The workspace contains two source files and a package.json whose `test`
   * and `build` scripts only echo a message. The agent is asked to change
   * src/a.ts first and src/b.ts second, and the assertions then inspect the
   * recorded tool calls for:
   *   1. an edit of src/a.ts,
   *   2. a later edit of src/b.ts,
   *   3. at least one validation-like shell command between the two edits,
   *   4. at least one validation-like shell command after the src/b.ts edit.
   */
  evalTest('USUALLY_PASSES', {
    name: 'should perform incremental validation between distinct file changes',
    files: {
      'src/a.ts': 'export const valA = 1 - 2; // BUG: should be 1 + 2',
      'src/b.ts': 'export const valB = 0;',
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: {
          test: 'echo "running tests..."',
          build: 'echo "building..."',
        },
      }),
    },
    prompt:
      '1. Fix the bug in src/a.ts (change - to +). 2. After that is done, update src/b.ts to export valB = 42. Ensure the project is buildable and tested at each step.',
    assert: async (rig) => {
      const toolLogs = rig.readToolLogs();

      // Derive the entry type from the logs themselves so the helpers below
      // stay type-checked without resorting to `any`.
      type ToolLog = (typeof toolLogs)[number];

      // Tool names that count as a file edit.
      const editToolNames = ['replace', 'write_file'];

      // Substrings that mark a shell invocation as a build/test style check.
      const validationMarkers = ['build', 'test', 'npm run', 'tsc'];

      /**
       * Builds a predicate matching an edit tool call whose args mention
       * `path`. This is a substring match on `args`, not a parsed path
       * comparison.
       */
      const isEditOf =
        (path: string) =>
        (log: ToolLog): boolean =>
          editToolNames.includes(log.toolRequest.name) &&
          log.toolRequest.args.includes(path);

      /**
       * Returns true for a `run_shell_command` call whose args contain one of
       * the validation markers (case-insensitive). This is a heuristic: any
       * shell call whose args mention one of these words is counted.
       */
      const isValidationCommand = (log: ToolLog): boolean => {
        if (log.toolRequest.name !== 'run_shell_command') return false;
        const cmd = log.toolRequest.args.toLowerCase();
        return validationMarkers.some((marker) => cmd.includes(marker));
      };

      // Index of the FIRST edit to each file (-1 when the file was never edited).
      const editAIndex = toolLogs.findIndex(isEditOf('src/a.ts'));
      const editBIndex = toolLogs.findIndex(isEditOf('src/b.ts'));

      expect(editAIndex, 'Agent should have edited src/a.ts').toBeGreaterThan(
        -1,
      );
      // Existence and ordering are asserted separately so that a failure
      // message names the actual problem.
      expect(editBIndex, 'Agent should have edited src/b.ts').toBeGreaterThan(
        -1,
      );
      expect(
        editBIndex,
        'Agent should have edited src/a.ts before src/b.ts',
      ).toBeGreaterThan(editAIndex);

      // Check for validation strictly between the first edit of a.ts and the
      // first edit of b.ts.
      const validationBetween = toolLogs
        .slice(editAIndex + 1, editBIndex)
        .some(isValidationCommand);

      expect(
        validationBetween,
        'Expected a build/test command between two distinct file edits to ensure incremental stability',
      ).toBe(true);

      // Also check for validation after editB to confirm final state
      const validationAfter = toolLogs
        .slice(editBIndex + 1)
        .some(isValidationCommand);

      expect(
        validationAfter,
        'Expected a build/test command after the final file edit',
      ).toBe(true);
    },
  });
});
feat(core): comprehensive agent self-validation and engineering mandates Major upgrade to the agent's self-validation, safety, and project integrity capabilities through five iterations of system prompt enhancements: Workflow & Quality Mandates: 1. Incremental Validation: Mandates building, linting, and testing after every significant file change to maintain a "green" state. 2. Mandatory Reproduction: Requires creating a failing test case to confirm a bug before fixing, and explicitly verifying the failure (Negative Verification). 3. Test Persistence & Locality: Requires integrating repro cases into the permanent test suite, preferably by amending existing related test files. 4. Script Discovery: Mandates identifying project-specific validation commands from configuration files (package.json, Makefile, etc.). 5. Self-Review: Mandates running `git diff` after every edit, using `--name-only` for large changes to preserve context window tokens. 6. Fast-Path Validation: Prioritizes lightweight checks (e.g., `tsc --noEmit`) for frequent feedback, reserving heavy builds for final verification. 7. Output Verification: Requires checking command output (not just exit codes) to prevent false-positives from empty test runs or hidden warnings. Semantic Integrity & Dependency Safety: 8. Global Usage Discovery: Mandates searching the entire workspace for all usages (via `grep_search`) before modifying exported symbols or APIs. 9. Dependency Integrity: Requires verifying that new imports are explicitly declared in the project's dependency manifest (e.g., package.json). 10. Configuration Sync: Mandates updating build/environment configs (tsconfig, Dockerfile, etc.) to support new file types or entry points. 11. Documentation Sync: Requires searching for and updating documentation references when public APIs or CLI interfaces change. 12. Anti-Silencing Mandate: Prohibits using `any`, `@ts-ignore`, or lint suppressions to resolve validation errors. 
Diagnostics, Safety & Runtime Verification: 13. Error Grounding: Mandates reading full error logs and stack traces upon failure. Includes Smart Log Navigation to prioritize the tail of large files. 14. Scope Isolation: Instructs the agent to focus only on errors introduced by its changes and ignore unrelated legacy technical debt. 15. Destructive Safety: Mandates a `git status` check before deleting files or modifying critical project configurations. 16. Non-Blocking Smoke Tests: Requires briefly running applications to verify boot stability, using background/timeout strategies for servers. Includes 15 new behavioral evaluations verifying these mandates and updated snapshots in packages/core/src/core/prompts.test.ts. 2026-02-20 14:22:54 -08:00			`/**`
			`* @license`
			`* Copyright 2026 Google LLC`
			`* SPDX-License-Identifier: Apache-2.0`
			`*/`

			`import { describe, expect } from 'vitest';`
			`import { evalTest } from './test-helper.js';`

			`describe('Incremental Validation', () => {`
			`/**`
			`* This evaluation verifies that the agent adheres to the "Incremental Validation" mandate`
			`* by performing build or test checks between distinct, significant file changes.`
			`*/`
			`evalTest('USUALLY_PASSES', {`
			`name: 'should perform incremental validation between distinct file changes',`
			`files: {`
			`'src/a.ts': 'export const valA = 1 - 2; // BUG: should be 1 + 2',`
			`'src/b.ts': 'export const valB = 0;',`
			`'package.json': JSON.stringify({`
			`name: 'test-project',`
			`type: 'module',`
			`scripts: {`
			`test: 'echo "running tests..."',`
			`build: 'echo "building..."',`
			`},`
			`}),`
			`},`
			`prompt:`
			`'1. Fix the bug in src/a.ts (change - to +). 2. After that is done, update src/b.ts to export valB = 42. Ensure the project is buildable and tested at each step.',`
			`assert: async (rig) => {`
			`const toolLogs = rig.readToolLogs();`

			`// Find indices of edits to a.ts and b.ts`
			`const editAIndex = toolLogs.findIndex(`
			`(log) =>`
			`(log.toolRequest.name === 'replace' ||`
			`log.toolRequest.name === 'write_file') &&`
			`log.toolRequest.args.includes('src/a.ts'),`
			`);`

			`const editBIndex = toolLogs.findIndex(`
			`(log) =>`
			`(log.toolRequest.name === 'replace' ||`
			`log.toolRequest.name === 'write_file') &&`
			`log.toolRequest.args.includes('src/b.ts'),`
			`);`

			`expect(editAIndex, 'Agent should have edited src/a.ts').toBeGreaterThan(`
			`-1,`
			`);`
			`expect(editBIndex, 'Agent should have edited src/b.ts').toBeGreaterThan(`
			`editAIndex,`
			`);`

			`const isValidationCommand = (log: any) => {`
			`if (log.toolRequest.name !== 'run_shell_command') return false;`
			`const cmd = log.toolRequest.args.toLowerCase();`
			`return (`
			`cmd.includes('build') ||`
			`cmd.includes('test') ||`
			`cmd.includes('npm run') ||`
			`cmd.includes('tsc')`
			`);`
			`};`

			`// Check for validation between editA and editB`
			`const validationBetween = toolLogs`
			`.slice(editAIndex + 1, editBIndex)`
			`.some(isValidationCommand);`

			`expect(`
			`validationBetween,`
			`'Expected a build/test command between two distinct file edits to ensure incremental stability',`
			`).toBe(true);`

			`// Also check for validation after editB to confirm final state`
			`const validationAfter = toolLogs`
			`.slice(editBIndex + 1)`
			`.some(isValidationCommand);`

			`expect(`
			`validationAfter,`
			`'Expected a build/test command after the final file edit',`
			`).toBe(true);`
			`},`
			`});`
			`});`