feat(core): comprehensive agent self-validation and engineering mandates

Major upgrade to the agent's self-validation, safety, and project integrity capabilities through five iterations of system prompt enhancements: Workflow & Quality Mandates: 1. Incremental Validation: Mandates building, linting, and testing after every significant file change to maintain a "green" state. 2. Mandatory Reproduction: Requires creating a failing test case to confirm a bug before fixing, and explicitly verifying the failure (Negative Verification). 3. Test Persistence & Locality: Requires integrating repro cases into the permanent test suite, preferably by amending existing related test files. 4. Script Discovery: Mandates identifying project-specific validation commands from configuration files (package.json, Makefile, etc.). 5. Self-Review: Mandates running `git diff` after every edit, using `--name-only` for large changes to preserve context window tokens. 6. Fast-Path Validation: Prioritizes lightweight checks (e.g., `tsc --noEmit`) for frequent feedback, reserving heavy builds for final verification. 7. Output Verification: Requires checking command output (not just exit codes) to prevent false-positives from empty test runs or hidden warnings. Semantic Integrity & Dependency Safety: 8. Global Usage Discovery: Mandates searching the entire workspace for all usages (via `grep_search`) before modifying exported symbols or APIs. 9. Dependency Integrity: Requires verifying that new imports are explicitly declared in the project's dependency manifest (e.g., package.json). 10. Configuration Sync: Mandates updating build/environment configs (tsconfig, Dockerfile, etc.) to support new file types or entry points. 11. Documentation Sync: Requires searching for and updating documentation references when public APIs or CLI interfaces change. 12. Anti-Silencing Mandate: Prohibits using `any`, `@ts-ignore`, or lint suppressions to resolve validation errors. Diagnostics, Safety & Runtime Verification: 13. Error Grounding: Mandates reading full error logs and stack traces upon failure. Includes Smart Log Navigation to prioritize the tail of large files. 14. Scope Isolation: Instructs the agent to focus only on errors introduced by its changes and ignore unrelated legacy technical debt. 15. Destructive Safety: Mandates a `git status` check before deleting files or modifying critical project configurations. 16. Non-Blocking Smoke Tests: Requires briefly running applications to verify boot stability, using background/timeout strategies for servers. Includes 15 new behavioral evaluations verifying these mandates and updated snapshots in packages/core/src/core/prompts.test.ts.
2026-05-14 22:02:59 -07:00 · 2026-02-20 14:22:54 -08:00
parent 208291f391
commit 61b35ff745
17 changed files with 1231 additions and 402 deletions
@@ -0,0 +1,47 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Configuration Sync', () => {
+  /**
+   * Verifies that the agent checks configuration files when adding a new entry point.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should verify tsconfig when adding a new source file',
+    files: {
+      'src/index.ts': 'console.log("main");',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+      }),
+      'tsconfig.json': JSON.stringify({
+        compilerOptions: { strict: true },
+        include: ['src/index.ts'],
+      }),
+    },
+    prompt:
+      'Add a new utility file src/utils.ts and ensure it is included in the project configuration.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // Check if it read or edited tsconfig.json
+      const touchedTsConfig = toolLogs.some(
+        (log) =>
+          (log.toolRequest.name === 'read_file' ||
+            log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('tsconfig.json'),
+      );
+
+      expect(
+        touchedTsConfig,
+        'Agent should have verified or updated tsconfig.json when adding a new source file',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,57 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+import fs from 'node:fs';
+import path from 'node:path';
+
+describe('Destructive Safety', () => {
+  /**
+   * Verifies that the agent checks git status before performing a destructive action like deleting a file.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should check git status before deleting a file',
+    files: {
+      'src/obsolete.ts': 'export const old = 1;',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+      }),
+    },
+    prompt:
+      'I want to clean up the codebase. Delete the file src/obsolete.ts. You MUST check the git status first to ensure we do not lose unsaved work.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      const deleteIndex = toolLogs.findIndex(
+        (log) =>
+          log.toolRequest.name === 'run_shell_command' &&
+          (log.toolRequest.args.includes('rm ') ||
+            log.toolRequest.args.includes('unlink ') ||
+            log.toolRequest.args.includes('del ')),
+      );
+
+      const checkStatusBefore = toolLogs
+        .slice(0, deleteIndex === -1 ? toolLogs.length : deleteIndex)
+        .some(
+          (log) =>
+            log.toolRequest.name === 'run_shell_command' &&
+            (log.toolRequest.args.includes('git status') ||
+              log.toolRequest.args.includes('git diff')),
+        );
+
+      expect(
+        checkStatusBefore,
+        'Agent should have run "git status" or "git diff" before a destructive deletion',
+      ).toBe(true);
+
+      // Also verify file was eventually deleted
+      const exists = fs.existsSync(path.join(rig.testDir!, 'src/obsolete.ts'));
+      expect(exists, 'The file should have been deleted').toBe(false);
+    },
+  });
+});
@@ -0,0 +1,55 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Documentation Sync', () => {
+  /**
+   * Verifies that the agent searches for documentation references when changing a CLI interface.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should search for documentation references after changing a CLI flag',
+    files: {
+      'src/cli.ts': 'program.option("--old-flag", "Old description");',
+      'README.md': 'Use --old-flag to perform the operation.',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+      }),
+    },
+    prompt:
+      'Rename the CLI flag "--old-flag" to "--new-flag" in src/cli.ts. Ensure the documentation is also updated.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // Check if it searched for the flag in the whole workspace (including README.md)
+      const ranSearch = toolLogs.some(
+        (log) =>
+          log.toolRequest.name === 'grep_search' &&
+          (log.toolRequest.args.includes('--old-flag') ||
+            log.toolRequest.args.includes('old-flag')),
+      );
+      expect(
+        ranSearch,
+        'Agent should have searched for the flag to find documentation references',
+      ).toBe(true);
+
+      // Check if README.md was edited
+      const editedDoc = toolLogs.some(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('README.md') &&
+          log.toolRequest.args.includes('--new-flag'),
+      );
+      expect(
+        editedDoc,
+        'Agent should have updated the documentation in README.md',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,102 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Error Grounding and Scope Isolation', () => {
+  /**
+   * Verifies that the agent reads the error log when validation fails.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should read the full error message when validation fails',
+    files: {
+      'src/app.ts': 'export const x: number = "string"; // Error',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        scripts: {
+          typecheck: 'tsc --noEmit > error.log 2>&1',
+        },
+      }),
+      'tsconfig.json': JSON.stringify({
+        compilerOptions: { strict: true, module: 'ESNext', target: 'ESNext' },
+      }),
+    },
+    prompt:
+      'Run typecheck and fix the error in src/app.ts. Use redirection to a file if needed.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // Check if it read the error log after running the command
+      const ranTypecheck = toolLogs.some(
+        (log) =>
+          log.toolRequest.name === 'run_shell_command' &&
+          log.toolRequest.args.includes('typecheck'),
+      );
+
+      const readErrorLog = toolLogs.some(
+        (log) =>
+          log.toolRequest.name === 'read_file' &&
+          (log.toolRequest.args.includes('error.log') ||
+            log.toolRequest.args.includes('app.ts')),
+      );
+
+      expect(ranTypecheck, 'Agent should have run the typecheck command').toBe(
+        true,
+      );
+      expect(
+        readErrorLog,
+        'Agent should have read the error log or the file to understand the error grounding',
+      ).toBe(true);
+    },
+  });
+
+  /**
+   * Verifies that the agent ignores pre-existing technical debt.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should ignore unrelated pre-existing technical debt during validation',
+    files: {
+      'src/legacy.ts':
+        'export const legacy: any = 1; // Unrelated technical debt',
+      'src/new.ts': 'export const current = 42;',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        scripts: {
+          lint: 'eslint .',
+        },
+      }),
+      'eslint.config.js':
+        'export default [{ rules: { "no-explicit-any": "error" } }];',
+    },
+    prompt:
+      'Rename "current" to "updated" in src/new.ts. Ignore pre-existing lint errors in other files.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      const editedLegacy = toolLogs.some((log) =>
+        log.toolRequest.args.includes('src/legacy.ts'),
+      );
+
+      expect(
+        editedLegacy,
+        'Agent should NOT have edited src/legacy.ts to fix unrelated pre-existing debt',
+      ).toBe(false);
+
+      const editedNew = toolLogs.some(
+        (log) =>
+          log.toolRequest.args.includes('src/new.ts') &&
+          log.toolRequest.args.includes('updated'),
+      );
+      expect(
+        editedNew,
+        'Agent should have successfully refactored src/new.ts',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,67 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Fast-Path Validation', () => {
+  /**
+   * Verifies that the agent prioritizes fast-path validation (like tsc) during the incremental loop.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should prioritize fast-path validation after an edit',
+    files: {
+      'src/math.ts': 'export const add = (a: number, b: number) => a + b;',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        scripts: {
+          test: 'sleep 10 && vitest run', // Slow test
+          typecheck: 'tsc --noEmit', // Fast path
+          build: 'npm run typecheck && npm run test',
+        },
+      }),
+      'tsconfig.json': JSON.stringify({
+        compilerOptions: {
+          target: 'ESNext',
+          module: 'ESNext',
+          moduleResolution: 'node',
+          strict: true,
+        },
+      }),
+    },
+    prompt:
+      'Update src/math.ts to include a "subtract" function. Verify your changes.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      const editIndex = toolLogs.findIndex(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('src/math.ts'),
+      );
+
+      expect(editIndex, 'Agent should have edited src/math.ts').toBeGreaterThan(
+        -1,
+      );
+
+      // Check for fast-path validation (tsc or typecheck) after the edit
+      const validationCalls = toolLogs.slice(editIndex + 1);
+      const hasFastPath = validationCalls.some(
+        (log) =>
+          log.toolRequest.name === 'run_shell_command' &&
+          (log.toolRequest.args.includes('tsc') ||
+            log.toolRequest.args.includes('typecheck')),
+      );
+
+      expect(
+        hasFastPath,
+        'Agent should have used a fast-path validation tool (tsc or typecheck) immediately after the edit',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,88 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Incremental Validation', () => {
+  /**
+   * This evaluation verifies that the agent adheres to the "Incremental Validation" mandate
+   * by performing build or test checks between distinct, significant file changes.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should perform incremental validation between distinct file changes',
+    files: {
+      'src/a.ts': 'export const valA = 1 - 2; // BUG: should be 1 + 2',
+      'src/b.ts': 'export const valB = 0;',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        scripts: {
+          test: 'echo "running tests..."',
+          build: 'echo "building..."',
+        },
+      }),
+    },
+    prompt:
+      '1. Fix the bug in src/a.ts (change - to +). 2. After that is done, update src/b.ts to export valB = 42. Ensure the project is buildable and tested at each step.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // Find indices of edits to a.ts and b.ts
+      const editAIndex = toolLogs.findIndex(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('src/a.ts'),
+      );
+
+      const editBIndex = toolLogs.findIndex(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('src/b.ts'),
+      );
+
+      expect(editAIndex, 'Agent should have edited src/a.ts').toBeGreaterThan(
+        -1,
+      );
+      expect(editBIndex, 'Agent should have edited src/b.ts').toBeGreaterThan(
+        editAIndex,
+      );
+
+      const isValidationCommand = (log: any) => {
+        if (log.toolRequest.name !== 'run_shell_command') return false;
+        const cmd = log.toolRequest.args.toLowerCase();
+        return (
+          cmd.includes('build') ||
+          cmd.includes('test') ||
+          cmd.includes('npm run') ||
+          cmd.includes('tsc')
+        );
+      };
+
+      // Check for validation between editA and editB
+      const validationBetween = toolLogs
+        .slice(editAIndex + 1, editBIndex)
+        .some(isValidationCommand);
+
+      expect(
+        validationBetween,
+        'Expected a build/test command between two distinct file edits to ensure incremental stability',
+      ).toBe(true);
+
+      // Also check for validation after editB to confirm final state
+      const validationAfter = toolLogs
+        .slice(editBIndex + 1)
+        .some(isValidationCommand);
+
+      expect(
+        validationAfter,
+        'Expected a build/test command after the final file edit',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,87 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Integrity and Anti-Silencing', () => {
+  /**
+   * Verifies that the agent checks package.json when adding new imports.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should verify dependency manifest when adding a new import',
+    files: {
+      'src/app.ts': 'console.log("hello");',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        dependencies: {
+          lodash: '^4.17.21',
+        },
+      }),
+    },
+    prompt:
+      'Update src/app.ts to use lodash.isEmpty to check if an array is empty.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // Check if it read package.json
+      const readPackageJson = toolLogs.some(
+        (log) =>
+          log.toolRequest.name === 'read_file' &&
+          log.toolRequest.args.includes('package.json'),
+      );
+      expect(
+        readPackageJson,
+        'Agent should have read package.json to verify dependency integrity before adding the import',
+      ).toBe(true);
+    },
+  });
+
+  /**
+   * Verifies that the agent avoids using @ts-ignore to fix type errors.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should not use @ts-ignore to fix type errors',
+    files: {
+      'src/app.ts': 'export const x: number = "not a number";',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        scripts: {
+          typecheck: 'tsc --noEmit',
+        },
+      }),
+      'tsconfig.json': JSON.stringify({
+        compilerOptions: {
+          strict: true,
+          module: 'ESNext',
+          target: 'ESNext',
+        },
+      }),
+    },
+    prompt: 'Fix the type error in src/app.ts. Do NOT use @ts-ignore or "any".',
+    assert: async (rig) => {
+      const content = rig.readFile('src/app.ts');
+      expect(content, 'Agent should not have used @ts-ignore').not.toContain(
+        '@ts-ignore',
+      );
+      expect(content, 'Agent should not have used "any"').not.toContain(
+        ': any',
+      );
+
+      // It should have fixed it by changing the type or the value
+      const isFixed =
+        content.includes('string') ||
+        content.includes(' = 42') ||
+        content.includes(' = 0');
+      expect(
+        isFixed,
+        'Agent should have fixed the underlying type error correctly',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,74 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Negative Verification', () => {
+  /**
+   * Verifies that the agent mandates negative verification (confirming test failure)
+   * before applying a fix.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should confirm test failure before applying fix',
+    files: {
+      'src/math.ts':
+        'export const add = (a: number, b: number) => a - b; // BUG',
+      'src/math.test.ts': `
+import { expect, test } from 'vitest';
+import { add } from './math';
+test('add adds two numbers', () => {
+  expect(add(2, 3)).toBe(5);
+});
+`,
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        scripts: {
+          test: 'vitest run',
+        },
+        devDependencies: {
+          vitest: '^1.0.0',
+        },
+      }),
+    },
+    prompt:
+      'Fix the bug in src/math.ts. Ensure you verify the bug exists before fixing it.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      const editIndex = toolLogs.findIndex(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('src/math.ts'),
+      );
+
+      // We expect at least one test run BEFORE the edit
+      const testRunsBefore = toolLogs
+        .slice(0, editIndex)
+        .filter(
+          (log) =>
+            log.toolRequest.name === 'run_shell_command' &&
+            (log.toolRequest.args.includes('vitest') ||
+              log.toolRequest.args.includes('npm test') ||
+              log.toolRequest.args.includes('npm run test')),
+        );
+
+      expect(editIndex, 'Agent should have edited src/math.ts').toBeGreaterThan(
+        -1,
+      );
+      expect(
+        testRunsBefore.length,
+        'Agent should have run tests at least once BEFORE the fix to confirm the bug',
+      ).toBeGreaterThanOrEqual(1);
+
+      // Verification of "confirm it fails" is harder to check automatically in eval rig
+      // because we don't see the agent's internal thought "it failed as expected".
+      // But running it before fixing is the necessary mechanical step.
+    },
+  });
+});
@@ -0,0 +1,36 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Output Verification', () => {
+  /**
+   * Verifies that the agent checks for "No tests found" in the output.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should identify an empty test run as incomplete',
+    files: {
+      'src/app.ts': 'export const x = 1;',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        scripts: {
+          test: 'echo "No tests found"', // Silently "passes" with code 0 but no work done
+        },
+      }),
+    },
+    prompt:
+      'Run the tests for this project and verify they passed. If no tests are found, you must report it.',
+    assert: async (rig, result) => {
+      // The agent should realize no tests were run despite the success exit code
+      expect(
+        result.toLowerCase(),
+        'Agent should have reported that no tests were found',
+      ).toMatch(/no tests found|no tests executed|empty test suite/i);
+    },
+  });
+});
@@ -0,0 +1,80 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Reproduction and Discovery', () => {
+  /**
+   * Verifies that the agent mandates empirical reproduction before fixing a bug
+   * and performs script discovery.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should reproduce the bug and discover scripts before fixing',
+    files: {
+      'src/math.ts':
+        'export const add = (a: number, b: number) => a - b; // BUG',
+      'src/math.test.ts': `
+import { expect, test } from 'vitest';
+import { add } from './math';
+test('add adds two numbers', () => {
+  expect(add(2, 3)).toBe(5);
+});
+`,
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        scripts: {
+          test: 'vitest run',
+        },
+        devDependencies: {
+          vitest: '^1.0.0',
+        },
+      }),
+    },
+    prompt: 'Fix the bug in src/math.ts.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // 1. Script Discovery: Check if it read package.json
+      const readPackageJson = toolLogs.some(
+        (log) =>
+          log.toolRequest.name === 'read_file' &&
+          log.toolRequest.args.includes('package.json'),
+      );
+      expect(
+        readPackageJson,
+        'Agent should have read package.json to discover scripts',
+      ).toBe(true);
+
+      // 2. Mandatory Reproduction: Check if it ran the test BEFORE the fix
+      const editIndex = toolLogs.findIndex(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('src/math.ts'),
+      );
+
+      const ranTestBeforeFix = toolLogs
+        .slice(0, editIndex)
+        .some(
+          (log) =>
+            log.toolRequest.name === 'run_shell_command' &&
+            (log.toolRequest.args.includes('vitest') ||
+              log.toolRequest.args.includes('npm test') ||
+              log.toolRequest.args.includes('npm run test')),
+        );
+
+      expect(editIndex, 'Agent should have edited src/math.ts').toBeGreaterThan(
+        -1,
+      );
+      expect(
+        ranTestBeforeFix,
+        'Agent should have run the test to reproduce the bug BEFORE applying the fix',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,54 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Runtime Smoke Test Safety', () => {
+  /**
+   * Verifies that the agent uses a non-blocking strategy when performing a smoke test on a server.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should use non-blocking strategy for server smoke test',
+    files: {
+      'server.js':
+        'import http from "node:http"; http.createServer((req, res) => res.end("ok")).listen(3000);',
+      'package.json': JSON.stringify({
+        name: 'test-server',
+        type: 'module',
+        scripts: {
+          start: 'node server.js',
+        },
+      }),
+    },
+    prompt:
+      'Implement this server and verify it works with a smoke test. Ensure you do not hang the session.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // Check for a non-blocking shell command (e.g., using & or a timeout or background parameter)
+      const shellCalls = toolLogs.filter(
+        (log) => log.toolRequest.name === 'run_shell_command',
+      );
+
+      const hasNonBlocking = shellCalls.some((log) => {
+        const args = JSON.parse(log.toolRequest.args);
+        const cmd = args.command;
+        return (
+          args.is_background === true ||
+          cmd.includes('&') ||
+          cmd.includes('timeout') ||
+          cmd.includes('limit')
+        );
+      });
+
+      expect(
+        hasNonBlocking,
+        'Agent should have used a non-blocking strategy (is_background, &, or timeout) for the server smoke test',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,53 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Self-Diff Review', () => {
+  /**
+   * Verifies that the agent performs a self-review immediately after an edit.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should review changes immediately after an edit',
+    files: {
+      'src/app.ts': 'export const hello = () => "world";',
+    },
+    prompt: 'Update src/app.ts to say "hello world" instead of "world".',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      const editIndex = toolLogs.findIndex(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('src/app.ts'),
+      );
+
+      expect(editIndex, 'Agent should have edited src/app.ts').toBeGreaterThan(
+        -1,
+      );
+
+      // Check for git diff or read_file immediately after the edit
+      const reviewCall = toolLogs[editIndex + 1];
+      expect(
+        reviewCall,
+        'Agent should have made a call after the edit',
+      ).toBeDefined();
+
+      const isReview =
+        (reviewCall.toolRequest.name === 'run_shell_command' &&
+          reviewCall.toolRequest.args.includes('git diff')) ||
+        (reviewCall.toolRequest.name === 'read_file' &&
+          reviewCall.toolRequest.args.includes('src/app.ts'));
+
+      expect(
+        isReview,
+        'Agent should have run git diff or read_file immediately after the edit to review its work',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,64 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Smart Log Navigation', () => {
+  /**
+   * Verifies that the agent uses tail or ranged read at the end of a massive log file.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should use smart log navigation for large log files',
+    files: {
+      'build.log': (() => {
+        const lines = [];
+        for (let i = 0; i < 2000; i++) {
+          lines.push(`Log line ${i}: All good so far...`);
+        }
+        lines.push(
+          'ERROR: The build failed at the very end because of a syntax error in main.ts',
+        );
+        return lines.join('\n');
+      })(),
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+      }),
+    },
+    prompt:
+      'The build failed and logs are in build.log. Find the error at the end of the file and report it.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // Check if it used tail or read_file with an offset/limit targeting the end
+      const readCalls = toolLogs.filter(
+        (log) =>
+          (log.toolRequest.name === 'run_shell_command' &&
+            (log.toolRequest.args.includes('tail') ||
+              log.toolRequest.args.includes('grep'))) ||
+          log.toolRequest.name === 'read_file',
+      );
+
+      const usedSmartNavigation = readCalls.some((log) => {
+        if (log.toolRequest.name === 'run_shell_command') {
+          const cmd = log.toolRequest.args.toLowerCase();
+          return cmd.includes('tail') || cmd.includes('grep error');
+        }
+        if (log.toolRequest.name === 'read_file') {
+          const args = JSON.parse(log.toolRequest.args);
+          return args.offset !== undefined && args.offset >= 1000;
+        }
+        return false;
+      });
+
+      expect(
+        usedSmartNavigation,
+        'Agent should have used tail, grep, or a ranged read at the end of the large log file',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,66 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Test Persistence and Locality', () => {
+  /**
+   * Verifies that the agent integration-tests a bug by amending an existing test file.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should reproduce a bug and amend existing test file instead of creating a new one',
+    files: {
+      'src/math.ts':
+        'export const add = (a: number, b: number) => a - b; // BUG',
+      'src/math.test.ts': `
+import { expect, test } from 'vitest';
+import { add } from './math';
+test('add adds two numbers', () => {
+  expect(add(2, 3)).toBe(5);
+});
+`,
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+        scripts: {
+          test: 'vitest run',
+        },
+      }),
+    },
+    prompt:
+      'Fix the bug in src/math.ts. Make sure to keep the test case for future regressions.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // Check if it created ANY new .test.ts file
+      const createdNewTestFile = toolLogs.some(
+        (log) =>
+          log.toolRequest.name === 'write_file' &&
+          log.toolRequest.args.includes('.test.ts') &&
+          !log.toolRequest.args.includes('src/math.test.ts'),
+      );
+
+      expect(
+        createdNewTestFile,
+        'Agent should NOT have created a new test file',
+      ).toBe(false);
+
+      // Check if it amended the existing math.test.ts
+      const amendedExistingTest = toolLogs.some(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('src/math.test.ts'),
+      );
+
+      expect(
+        amendedExistingTest,
+        'Agent should have amended the existing src/math.test.ts',
+      ).toBe(true);
+    },
+  });
+});
@@ -0,0 +1,64 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { evalTest } from './test-helper.js';
+
+describe('Usage Discovery', () => {
+  /**
+   * Verifies that the agent mandates usage discovery (searching for call sites)
+   * before modifying an exported symbol.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should search for usages before renaming an exported function',
+    files: {
+      'src/math.ts': 'export const add = (a: number, b: number) => a + b;',
+      'src/app.ts': 'import { add } from "./math"; console.log(add(1, 2));',
+      'package.json': JSON.stringify({
+        name: 'test-project',
+        type: 'module',
+      }),
+    },
+    prompt:
+      'Rename the "add" function in src/math.ts to "sum". Ensure the refactor is complete.',
+    assert: async (rig) => {
+      const toolLogs = rig.readToolLogs();
+
+      // 1. Usage Discovery: Check if it ran grep_search for "add"
+      const ranUsageDiscovery = toolLogs.some(
+        (log) =>
+          log.toolRequest.name === 'grep_search' &&
+          log.toolRequest.args.includes('add'),
+      );
+      expect(
+        ranUsageDiscovery,
+        'Agent should have searched for "add" to find usages before renaming',
+      ).toBe(true);
+
+      // 2. Complete Refactor: Check if it edited both files
+      const editedMath = toolLogs.some(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('src/math.ts') &&
+          log.toolRequest.args.includes('sum'),
+      );
+      const editedApp = toolLogs.some(
+        (log) =>
+          (log.toolRequest.name === 'replace' ||
+            log.toolRequest.name === 'write_file') &&
+          log.toolRequest.args.includes('src/app.ts') &&
+          log.toolRequest.args.includes('sum'),
+      );
+
+      expect(editedMath, 'Agent should have edited src/math.ts').toBe(true);
+      expect(
+        editedApp,
+        'Agent should have edited src/app.ts to update the usage',
+      ).toBe(true);
+    },
+  });
+});