From db8910c39be479488814ecbf31581dbe8553f971 Mon Sep 17 00:00:00 2001
From: Alisa Novikova <62909685+alisa-alisa@users.noreply.github.com>
Date: Tue, 7 Apr 2026 19:35:12 -0700
Subject: [PATCH] feat(evals): implement related evaluation system for targeted
 testing

---
 .github/workflows/eval-pr.yml    |   7 +-
 evals/README.md                  |  44 ++++++-
 evals/suites.json                | 171 +++++++++++++++++++++++++++
 package.json                     |   1 +
 scripts/changed_prompt.js        | 193 ++++++++++++++++++++++++++-----
 scripts/get_trustworthy_evals.js |  81 +++++++++++--
 scripts/lint.js                  |  13 +++
 scripts/run_eval_regression.js   |  45 ++++++-
 scripts/validate_eval_suites.js  |  98 ++++++++++++++++
 9 files changed, 610 insertions(+), 43 deletions(-)
 create mode 100644 evals/suites.json
 create mode 100644 scripts/validate_eval_suites.js

diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml
index 3e6784960c..6ade8c7542 100644
--- a/.github/workflows/eval-pr.yml
+++ b/.github/workflows/eval-pr.yml
@@ -1,7 +1,7 @@
 name: 'Evals: PR Evaluation & Regression'
 
 on:
-  pull_request_target:
+  pull_request:
     types: ['opened', 'synchronize', 'reopened', 'ready_for_review']
     paths:
       - 'packages/core/src/prompts/**'
@@ -153,9 +153,10 @@ jobs:
           GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
           GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
           MODEL_LIST: '${{ env.MODEL_LIST }}'
+          GITHUB_BASE_REF: '${{ github.base_ref }}'
         run: |
-          # Run the regression check loop. The script saves the report to a file.
-          node scripts/run_eval_regression.js
+          # Run the related regression check loop.
+          node scripts/run_eval_regression.js --related
 
           # Use the generated report file if it exists
           if [[ -f eval_regression_report.md ]]; then
diff --git a/evals/README.md b/evals/README.md
index aebfe38ebc..36e543fd88 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -227,10 +227,18 @@ in Pull Requests. These can also be run locally for debugging.
 ### Running Regression Checks Locally
 
 You can simulate the PR regression check locally to verify your changes before
-pushing:
+pushing. To optimize your workflow and reduce LLM costs, use the **`--related`**
+flag to run only the tests relevant to your specific changes:
 
 ```bash
-# Run the full regression loop for a specific model
+# Run the targeted regression loop for your changes
+MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js --related
+```
+
+To run the full regression loop for a specific model (all stable tests):
+
+```bash
+# Run everything
 MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js
 ```
 
@@ -244,6 +252,38 @@ OUTPUT=$(node scripts/get_trustworthy_evals.js "gemini-3-flash-preview")
 node scripts/run_regression_check.js "gemini-3-flash-preview" "$OUTPUT"
 ```
 
+### Related Testing with `--related`
+
+The project uses a "Smart Eval" system to identify which behavioral evaluations
+are affected by your code changes. This is controlled by the `--related` flag
+available in `scripts/run_eval_regression.js`.
+
+#### How it Works
+
+1.  **Change Detection**: The system uses Git to identify modified files in your
+    branch compared to the PR base branch (`main` by default).
+2.  **Suite Mapping**: Modified files are matched against patterns in
+    `evals/suites.json`. This file maps core files (e.g., `grep.ts`) to their
+    corresponding evaluations.
+3.  **Targeted Execution**: Only the evaluations belonging to the affected
+    suites are executed.
+4.  **Global Fallback**: If changes touch core system prompts, or are flagged as
+    steering changes without matching any suite, the full stable evaluation
+    suite runs for safety. If nothing relevant is detected, evals are skipped.
+
+#### Updating Detection Logic
+
+If you add a new tool or functional area, you should update `evals/suites.json`
+to ensure your new evaluations are triggered correctly.
+
+```json
+"my_tool": {
+  "description": "Description of the tool",
+  "patterns": ["packages/core/src/tools/my-tool.ts", "evals/my_tool.eval.ts"],
+  "evals": ["evals/my_tool.eval.ts"]
+}
+```
+
 ### The Regression Quality Bar
 
 Because LLMs are non-deterministic, the PR regression check uses a high-signal
diff --git a/evals/suites.json b/evals/suites.json
new file mode 100644
index 0000000000..feedfce4b3
--- /dev/null
+++ b/evals/suites.json
@@ -0,0 +1,171 @@
+{
+  "allowedOverlaps": [],
+  "grep": {
+    "description": "Grep search functionality",
+    "patterns": [
+      "packages/core/src/tools/grep.ts",
+      "packages/core/src/tools/ripGrep.ts",
+      "packages/core/src/tools/grep-utils.ts",
+      "evals/grep_search_functionality.eval.ts"
+    ],
+    "evals": ["evals/grep_search_functionality.eval.ts"]
+  },
+  "memory": {
+    "description": "Memory tool and fact persistence",
+    "patterns": [
+      "packages/core/src/tools/memoryTool.ts",
+      "packages/core/src/persistence/storage.ts",
+      "evals/save_memory.eval.ts",
+      "evals/hierarchical_memory.eval.ts"
+    ],
+    "evals": ["evals/save_memory.eval.ts", "evals/hierarchical_memory.eval.ts"]
+  },
+  "read_file": {
+    "description": "File reading and content extraction",
+    "patterns": [
+      "packages/core/src/tools/read-file.ts",
+      "packages/core/src/tools/read-many-files.ts",
+      "evals/frugalReads.eval.ts"
+    ],
+    "evals": ["evals/frugalReads.eval.ts"]
+  },
+  "glob": {
+    "description": "File discovery and globbing",
+    "patterns": [
+      "packages/core/src/tools/glob.ts",
+      "evals/frugalSearch.eval.ts"
+    ],
+    "evals": ["evals/frugalSearch.eval.ts"]
+  },
+  "tracker": {
+    "description": "Task and progress tracking",
+    "patterns": [
+      "packages/core/src/tools/trackerTools.ts",
+      "evals/tracker.eval.ts"
+    ],
+    "evals": ["evals/tracker.eval.ts"]
+  },
+  "ask_user": {
+    "description": "Interactive user confirmation and input",
+    "patterns": [
+      "packages/core/src/tools/ask-user.ts",
+      "evals/ask_user.eval.ts"
+    ],
+    "evals": ["evals/ask_user.eval.ts"]
+  },
+  "plan_mode": {
+    "description": "Plan Mode orchestration",
+    "patterns": [
+      "packages/core/src/tools/enter-plan-mode.ts",
+      "packages/core/src/tools/exit-plan-mode.ts",
+      "packages/core/src/agents/planAgent.ts",
+      "evals/plan_mode.eval.ts"
+    ],
+    "evals": ["evals/plan_mode.eval.ts"]
+  },
+  "git": {
+    "description": "Git repository operations",
+    "patterns": [
+      "packages/core/src/utils/git.ts",
+      "packages/core/src/tools/shell.ts",
+      "evals/gitRepo.eval.ts",
+      "evals/unsafe-cloning.eval.ts"
+    ],
+    "evals": ["evals/gitRepo.eval.ts", "evals/unsafe-cloning.eval.ts"]
+  },
+  "agents": {
+    "description": "Agent delegation and help",
+    "patterns": [
+      "packages/core/src/agents/**",
+      "packages/core/src/routing/**",
+      "evals/subagents.eval.ts",
+      "evals/generalist_agent.eval.ts",
+      "evals/generalist_delegation.eval.ts",
+      "evals/cli_help_delegation.eval.ts"
+    ],
+    "evals": [
+      "evals/subagents.eval.ts",
+      "evals/generalist_agent.eval.ts",
+      "evals/generalist_delegation.eval.ts",
+      "evals/cli_help_delegation.eval.ts"
+    ]
+  },
+  "background": {
+    "description": "Background process management",
+    "patterns": [
+      "packages/core/src/tools/shellBackgroundTools.ts",
+      "evals/background_processes.eval.ts"
+    ],
+    "evals": ["evals/background_processes.eval.ts"]
+  },
+  "core_steering": {
+    "description": "System prompts and core model steering",
+    "patterns": [
+      "packages/core/src/prompts/**",
+      "evals/answer-vs-act.eval.ts",
+      "evals/model_steering.eval.ts",
+      "evals/redundant_casts.eval.ts"
+    ],
+    "evals": [
+      "ALL_ALWAYS_PASSING",
+      "evals/answer-vs-act.eval.ts",
+      "evals/model_steering.eval.ts",
+      "evals/redundant_casts.eval.ts"
+    ]
+  },
+  "edit_fidelity": {
+    "description": "Code edit accuracy and location checks",
+    "patterns": [
+      "packages/core/src/tools/edit.ts",
+      "packages/core/src/tools/write-file.ts",
+      "evals/edit-locations-eval.eval.ts",
+      "evals/validation_fidelity.eval.ts",
+      "evals/validation_fidelity_pre_existing_errors.eval.ts"
+    ],
+    "evals": [
+      "evals/edit-locations-eval.eval.ts",
+      "evals/validation_fidelity.eval.ts",
+      "evals/validation_fidelity_pre_existing_errors.eval.ts"
+    ]
+  },
+  "reliability": {
+    "description": "Core safety and sandbox reliability",
+    "patterns": [
+      "packages/core/src/safety/**",
+      "packages/core/src/fallback/**",
+      "evals/concurrency-safety.eval.ts",
+      "evals/sandbox_recovery.eval.ts",
+      "evals/interactive-hang.eval.ts",
+      "evals/tool_output_masking.eval.ts"
+    ],
+    "evals": [
+      "evals/concurrency-safety.eval.ts",
+      "evals/sandbox_recovery.eval.ts",
+      "evals/interactive-hang.eval.ts",
+      "evals/tool_output_masking.eval.ts"
+    ]
+  },
+  "orchestration": {
+    "description": "Agent task lifecycle and topic management",
+    "patterns": [
+      "packages/core/src/tools/update-topic.ts",
+      "packages/core/src/tools/complete-task.ts",
+      "evals/update_topic.eval.ts"
+    ],
+    "evals": ["evals/update_topic.eval.ts"]
+  },
+  "general_tools": {
+    "description": "General tool usage and efficiency",
+    "patterns": [
+      "packages/core/src/tools/tools.ts",
+      "packages/core/src/tools/tool-registry.ts",
+      "packages/core/src/tools/shell.ts",
+      "evals/automated-tool-use.eval.ts",
+      "evals/shell-efficiency.eval.ts"
+    ],
+    "evals": [
+      "evals/automated-tool-use.eval.ts",
+      "evals/shell-efficiency.eval.ts"
+    ]
+  }
+}
diff --git a/package.json b/package.json
index e24f6a20b5..cde3743e16 100644
--- a/package.json
+++ b/package.json
@@ -47,6 +47,7 @@
     "posttest": "npm run build",
     "test:always_passing_evals": "vitest run --config evals/vitest.config.ts",
     "test:all_evals": "cross-env RUN_EVALS=1 vitest run --config evals/vitest.config.ts",
+    "test:evals:related": "node scripts/run_eval_regression.js --related",
     "test:e2e": "cross-env VERBOSE=true KEEP_OUTPUT=true npm run test:integration:sandbox:none",
     "test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman",
     "test:integration:flaky": "cross-env RUN_FLAKY_INTEGRATION=1 npm run test:integration:sandbox:none",
diff --git a/scripts/changed_prompt.js b/scripts/changed_prompt.js
index 3fe33443a0..4c76e90b38 100644
--- a/scripts/changed_prompt.js
+++ b/scripts/changed_prompt.js
@@ -3,7 +3,23 @@
  * Copyright 2026 Google LLC
  * SPDX-License-Identifier: Apache-2.0
  */
+
+/**
+ * @fileoverview Intelligence layer for detecting steering and behavior changes.
+ *
+ * This script identifies if code changes affect model steering (system prompts,
+ * tool definitions, agent instructions) and maps them to relevant evaluation
+ * suites. It supports both CI (GitHub Actions) and local development workflows.
+ *
+ * Detection Methods:
+ * 1. Path-based: Monitors critical steering and tool directories.
+ * 2. Signature-based: Scans diff content for core steering primitives
+ *    (e.g., ToolDefinition, inputSchema).
+ * 3. Suite-aware: Uses evals/suites.json to identify related tests for surgical runs.
+ */
+
 import { execSync } from 'node:child_process';
+import fs from 'node:fs';
 
 const CORE_STEERING_PATHS = [
   'packages/core/src/prompts/',
@@ -20,46 +36,132 @@ const STEERING_SIGNATURES = [
   "kind: 'local'",
 ];
 
+/**
+ * Minimal glob matcher for the patterns in evals/suites.json.
+ * `*` stays within one path segment; `**` may span several.
+ * @param {string} file Path of the file being tested.
+ * @param {string} pattern Literal path or glob to match against.
+ * @returns {boolean} True when `file` matches `pattern`.
+ */
+function minimatch(file, pattern) {
+  // Keep the trailing slash so `agents/**` does not match `agentsFoo/x.ts`.
+  if (pattern.endsWith('/**')) return file.startsWith(pattern.slice(0, -2));
+  if (!pattern.includes('*')) return file === pattern;
+  // Escape regex metacharacters, then expand globs in a single pass; chained
+  // replaces would re-expand the `*` inside the `.*` emitted for `**`.
+  const source = pattern
+    .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
+    .replace(/\*\*|\*/g, (glob) => (glob === '**' ? '.*' : '[^/]*'));
+  return new RegExp(`^${source}$`).test(file);
+}
+
 function main() {
   const targetBranch = process.env.GITHUB_BASE_REF || 'main';
   const verbose = process.argv.includes('--verbose');
   const steeringOnly = process.argv.includes('--steering-only');
+  const isRelatedMode = process.argv.includes('--related');
+  const isJsonMode = process.argv.includes('--json');
 
   try {
     const remoteUrl = process.env.GITHUB_REPOSITORY
       ? `https://github.com/${process.env.GITHUB_REPOSITORY}.git`
       : 'origin';
 
-    // Fetch target branch from the remote.
-    execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
-      stdio: 'ignore',
-    });
+    let changedFiles = [];
+    const isCi = !!process.env.GITHUB_ACTIONS;
 
-    // Get changed files using the triple-dot syntax which correctly handles merge commits
-    const head = process.env.PR_HEAD_SHA || 'HEAD';
-    const changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
-      encoding: 'utf-8',
-    })
-      .split('\n')
-      .filter(Boolean);
+    if (isCi) {
+      try {
+        // 1. Try fetching from remote (CI environment)
+        execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
+          stdio: 'ignore',
+        });
+
+        // Get changed files using the triple-dot syntax which correctly handles merge commits
+        const head = process.env.PR_HEAD_SHA || 'HEAD';
+        changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
+          encoding: 'utf-8',
+        })
+          .split('\n')
+          .filter(Boolean);
+      } catch (e) {
+        if (verbose)
+          process.stderr.write(
+            `Warning: git fetch failed in CI: ${e.message}\n`,
+          );
+      }
+    }
+
+    // 2. Local fallback or if CI fetch failed: Try diffing against target branch
+    if (changedFiles.length === 0) {
+      try {
+        changedFiles = execSync(`git diff --name-only ${targetBranch}`, {
+          encoding: 'utf-8',
+        })
+          .split('\n')
+          .filter(Boolean);
+      } catch {
+        // 3. Last resort: Just diff against HEAD (uncommitted changes only)
+        changedFiles = execSync('git diff --name-only HEAD', {
+          encoding: 'utf-8',
+        })
+          .split('\n')
+          .filter(Boolean);
+      }
+
+      // Also include untracked files in local mode
+      const untracked = execSync('git ls-files --others --exclude-standard', {
+        encoding: 'utf-8',
+      })
+        .split('\n')
+        .filter(Boolean);
+      changedFiles = [...new Set([...changedFiles, ...untracked])];
+    }
 
     let detected = false;
     const reasons = [];
+    const affectedSuites = new Set();
+    const rationales = [];
+
+    // Load suites for --related mode
+    let suitesConfig = null;
+    if (isRelatedMode) {
+      try {
+        suitesConfig = JSON.parse(
+          fs.readFileSync('evals/suites.json', 'utf-8'),
+        );
+      } catch {
+        process.stderr.write(`Warning: Could not load evals/suites.json\n`);
+      }
+    }
 
     // 1. Path-based detection
     for (const file of changedFiles) {
       if (CORE_STEERING_PATHS.some((prefix) => file.startsWith(prefix))) {
         detected = true;
         reasons.push(`Matched core steering path: ${file}`);
-        if (!verbose) break;
       }
       if (
         !steeringOnly &&
-        TEST_PATHS.some((prefix) => file.startsWith(prefix))
+        TEST_PATHS.some((prefix) => file.startsWith(prefix)) &&
+        file.endsWith('.eval.ts')
       ) {
         detected = true;
-        reasons.push(`Matched test path: ${file}`);
-        if (!verbose) break;
+        reasons.push(`Matched test file: ${file}`);
+      }
+
+      // Related suite detection
+      if (suitesConfig) {
+        for (const [suiteName, suite] of Object.entries(suitesConfig)) {
+          if (suiteName === 'allowedOverlaps' || !suite.patterns) continue;
+
+          if (suite.patterns.some((pattern) => minimatch(file, pattern))) {
+            affectedSuites.add(suiteName);
+            rationales.push(
+              `Testing **${suiteName}** because **${file}** was modified.`,
+            );
+          }
+        }
       }
     }
 
@@ -70,15 +172,30 @@ function main() {
       );
       if (coreChanges.length > 0) {
         // Get the actual diff content for core files
-        const diff = execSync(
-          `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`,
-          { encoding: 'utf-8' },
-        );
+        // We need to be careful with the diff command depending on if we have FETCH_HEAD
+        let diffCmd = '';
+        try {
+          const head = process.env.PR_HEAD_SHA || 'HEAD';
+          diffCmd = `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`;
+          execSync('git rev-parse FETCH_HEAD', { stdio: 'ignore' });
+        } catch {
+          diffCmd = `git diff -U0 ${targetBranch} -- packages/core/src/`;
+        }
+
+        const diff = execSync(diffCmd, { encoding: 'utf-8' });
         for (const sig of STEERING_SIGNATURES) {
           if (diff.includes(sig)) {
             detected = true;
             reasons.push(`Matched steering signature in core: ${sig}`);
-            if (!verbose) break;
+
+            // If we detected a steering signature, mark core_steering suite
+            if (isRelatedMode) {
+              affectedSuites.add('core_steering');
+              rationales.push(
+                `Testing **core_steering** because matched signature '${sig}' in core files.`,
+              );
+            }
+            if (!verbose && !isRelatedMode) break;
           }
         }
       }
@@ -89,14 +206,38 @@ function main() {
       reasons.forEach((r) => process.stderr.write(` - ${r}\n`));
     }
 
-    process.stdout.write(detected ? 'true' : 'false');
+    if (isJsonMode) {
+      process.stdout.write(
+        JSON.stringify(
+          {
+            detected,
+            reasons,
+            affectedSuites: Array.from(affectedSuites),
+            rationales,
+          },
+          null,
+          2,
+        ),
+      );
+    } else {
+      process.stdout.write(detected ? 'true' : 'false');
+    }
   } catch (error) {
-    // If anything fails (e.g., no git history), run evals/guidance to be safe
-    process.stderr.write(
-      'Warning: Failed to determine if changes occurred. Defaulting to true.\n',
-    );
+    if (isJsonMode) {
+      process.stdout.write(
+        JSON.stringify({
+          detected: true,
+          reasons: [`Error during detection: ${error.message}`],
+          affectedSuites: ['core_steering'],
+          rationales: [
+            'Error during detection: running all stable evals for safety.',
+          ],
+        }),
+      );
+    } else {
+      process.stdout.write('true');
+    }
     process.stderr.write(String(error) + '\n');
-    process.stdout.write('true');
   }
 }
 
diff --git a/scripts/get_trustworthy_evals.js b/scripts/get_trustworthy_evals.js
index c87d148e7a..ae25550134 100644
--- a/scripts/get_trustworthy_evals.js
+++ b/scripts/get_trustworthy_evals.js
@@ -13,6 +13,7 @@
  * to ensure high-signal validation and minimize noise.
  */
 
+import fs from 'node:fs';
 import { fetchNightlyHistory, escapeRegex } from './eval_utils.js';
 
 const LOOKBACK_COUNT = 6;
@@ -25,11 +26,24 @@ const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18)
  */
 function main() {
   const targetModel = process.argv[2];
-  if (!targetModel) {
+  if (!targetModel || targetModel.startsWith('--')) {
     console.error('❌ Error: No target model specified.');
     process.exit(1);
   }
+
+  // Parse --suites argument
+  const suitesArgIndex = process.argv.indexOf('--suites');
+  let requestedSuites = null;
+  if (suitesArgIndex !== -1 && process.argv[suitesArgIndex + 1]) {
+    requestedSuites = process.argv[suitesArgIndex + 1]
+      .split(',')
+      .map((s) => s.trim());
+  }
+
   console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
+  if (requestedSuites) {
+    console.error(`📂 Filtering by suites: ${requestedSuites.join(', ')}`);
+  }
 
   const history = fetchNightlyHistory(LOOKBACK_COUNT);
   if (history.length === 0) {
@@ -37,6 +51,32 @@ function main() {
     process.exit(1);
   }
 
+  // Load suites configuration
+  let allowedFiles = null;
+  let runAllStable = false;
+  if (requestedSuites) {
+    try {
+      const suitesConfig = JSON.parse(
+        fs.readFileSync('evals/suites.json', 'utf-8'),
+      );
+      allowedFiles = new Set();
+      for (const suiteName of requestedSuites) {
+        const suiteEvals = (suitesConfig[suiteName] || {}).evals || [];
+        if (suiteEvals.includes('ALL_ALWAYS_PASSING')) {
+          runAllStable = true;
+        } else {
+          suiteEvals.forEach((file) => allowedFiles.add(file));
+        }
+      }
+    } catch (e) {
+      // allowedFiles may still be null; run all stable evals rather than crash.
+      runAllStable = true;
+      console.error(
+        `⚠️ Warning: Could not load evals/suites.json or match suites: ${e.message}`,
+      );
+    }
+  }
+
   // Aggregate results for the target model across all history
   const testHistories = {}; // { [testName]: { totalPassed: 0, totalRuns: 0, dailyRates: [], file: string } }
 
@@ -83,11 +123,28 @@ function main() {
     const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
 
     if (isDailyStable && isAggregateHighSignal) {
-      trustworthyTests.push(testName);
-      if (info.file) {
-        const match = info.file.match(/evals\/.*\.eval\.ts/);
-        if (match) {
-          trustworthyFiles.add(match[0]);
+      // Suite filtering logic
+      let isFileAllowed = true;
+      if (requestedSuites && !runAllStable) {
+        if (info.file) {
+          const match = info.file.match(/evals\/.*\.eval\.ts/);
+          if (match && !allowedFiles.has(match[0])) {
+            isFileAllowed = false;
+          } else if (!match) {
+            isFileAllowed = false;
+          }
+        } else {
+          isFileAllowed = false;
+        }
+      }
+
+      if (isFileAllowed) {
+        trustworthyTests.push(testName);
+        if (info.file) {
+          const match = info.file.match(/evals\/.*\.eval\.ts/);
+          if (match) {
+            trustworthyFiles.add(match[0]);
+          }
         }
       }
     } else {
@@ -99,10 +156,14 @@ function main() {
     `✅ Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`,
   );
   trustworthyTests.sort().forEach((name) => console.error(`   - ${name}`));
-  console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
-  console.error(
-    `🆕 Ignored ${newTests.length} tests with insufficient history.`,
-  );
+  if (volatileTests.length > 0) {
+    console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
+  }
+  if (newTests.length > 0) {
+    console.error(
+      `🆕 Ignored ${newTests.length} tests with insufficient history.`,
+    );
+  }
 
   // Output the list of names as a regex-friendly pattern for vitest -t
   const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|');
diff --git a/scripts/lint.js b/scripts/lint.js
index 0cf51cb8ba..f5782ce467 100644
--- a/scripts/lint.js
+++ b/scripts/lint.js
@@ -500,6 +500,9 @@ function main() {
   if (args.includes('--check-github-actions-pinning')) {
     runGithubActionsPinningLinter();
   }
+  if (args.includes('--eval-suites')) {
+    runEvalSuiteLinter();
+  }
 
   if (args.length === 0) {
     setupLinters();
@@ -511,8 +514,23 @@ function main() {
     runSensitiveKeywordLinter();
     runTSConfigLinter();
     runGithubActionsPinningLinter();
+    runEvalSuiteLinter();
     console.log('\nAll linting checks passed!');
   }
 }
 
+/**
+ * Runs scripts/validate_eval_suites.js as a child process with inherited
+ * stdio, so the validator's own output is shown directly. If the child
+ * process fails (execSync throws), the lint run exits with status 1.
+ */
+export function runEvalSuiteLinter() {
+  console.log('\nRunning eval suite linter...');
+  try {
+    execSync('node scripts/validate_eval_suites.js', { stdio: 'inherit' });
+  } catch {
+    process.exit(1);
+  }
+}
+
 main();
diff --git a/scripts/run_eval_regression.js b/scripts/run_eval_regression.js
index 7a64a6a2f9..03cdde90c1 100644
--- a/scripts/run_eval_regression.js
+++ b/scripts/run_eval_regression.js
@@ -22,22 +22,62 @@ import fs from 'node:fs';
 async function main() {
   const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview';
   const models = modelList.split(',').map((m) => m.trim());
+  const isRelatedMode = process.argv.includes('--related');
 
   let combinedReport = '';
   let hasRegression = false;
+  let detectionRationale = '';
+  let affectedSuitesStr = '';
 
   console.log(
     `🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
   );
 
+  if (isRelatedMode) {
+    console.log('🔍 Identifying related evaluations based on changes...');
+    try {
+      const detectionOutput = execSync(
+        `node scripts/changed_prompt.js --related --json`,
+        { encoding: 'utf-8', stdio: ['inherit', 'pipe', 'inherit'] },
+      ).trim();
+      const detection = JSON.parse(detectionOutput);
+
+      if (detection.affectedSuites && detection.affectedSuites.length > 0) {
+        affectedSuitesStr = detection.affectedSuites.join(',');
+        detectionRationale = '### 🧪 Related Evaluation Rationale\n\n';
+        detection.rationales.forEach((r) => {
+          detectionRationale += `- ${r}\n`;
+        });
+        detectionRationale +=
+          '\n_Something missing? [Update evals/suites.json](evals/README.md#related-testing-with-related) to adjust detection logic._\n\n---\n\n';
+      } else if (!detection.detected) {
+        console.log('✅ No related changes detected. Skipping evaluations.');
+        process.exit(0);
+      } else {
+        console.log(
+          '⚠️ Changes detected but no specific suites matched. Running full stable suite for safety.',
+        );
+        detectionRationale =
+          '### 🧪 Related Evaluation Rationale\n\n- No specific suites matched. Running full stable suite for safety.\n\n---\n\n';
+      }
+    } catch (e) {
+      console.error(`❌ Error during suite detection: ${e.message}`);
+      detectionRationale =
+        '### 🧪 Related Evaluation Rationale\n\n- Error during suite detection. Running full stable suite for safety.\n\n---\n\n';
+    }
+  }
+
   for (const model of models) {
     console.log(`\n--- Processing Model: ${model} ---`);
 
     try {
       // 1. Identify Trustworthy Evals
       console.log(`🔍 Identifying trustworthy tests for ${model}...`);
+      const suitesFlag = affectedSuitesStr
+        ? `--suites ${affectedSuitesStr}`
+        : '';
       const output = execSync(
-        `node scripts/get_trustworthy_evals.js "${model}"`,
+        `node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`,
         {
           encoding: 'utf-8',
           stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr
@@ -83,7 +123,8 @@ async function main() {
 
   // Always save the combined report to a file so the workflow can capture it cleanly
   if (combinedReport) {
-    fs.writeFileSync('eval_regression_report.md', combinedReport);
+    const finalReport = detectionRationale + combinedReport;
+    fs.writeFileSync('eval_regression_report.md', finalReport);
     console.log(
       '\n📊 Final Markdown report saved to eval_regression_report.md',
     );
diff --git a/scripts/validate_eval_suites.js b/scripts/validate_eval_suites.js
new file mode 100644
index 0000000000..9854856021
--- /dev/null
+++ b/scripts/validate_eval_suites.js
@@ -0,0 +1,98 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+
+const SUITES_PATH = 'evals/suites.json';
+const EVALS_DIR = 'evals';
+
+/**
+ * Validates evals/suites.json: every suite eval must exist on disk and appear
+ * in its suite's patterns, no eval may sit in two suites unless listed in
+ * allowedOverlaps, and every .eval.ts on disk must be mapped. Exits 1 on error.
+ */
+function main() {
+  if (!fs.existsSync(SUITES_PATH)) {
+    console.error(`❌ Error: ${SUITES_PATH} not found.`);
+    process.exit(1);
+  }
+
+  const suitesConfig = JSON.parse(fs.readFileSync(SUITES_PATH, 'utf-8'));
+  const allowedOverlaps = new Set(suitesConfig.allowedOverlaps || []);
+  // suites.json uses forward slashes; path.join would produce backslashes on
+  // Windows and make every eval look orphaned, so join with POSIX separators.
+  const evalFilesOnDisk = fs
+    .readdirSync(EVALS_DIR)
+    .filter((f) => f.endsWith('.eval.ts'))
+    .map((f) => path.posix.join(EVALS_DIR, f));
+
+  const evalToSuiteMap = new Map(); // eval file -> names of suites listing it
+  const errors = [];
+
+  // 1. Map evals to suites and check for overlaps/trigger-coverage
+  for (const [suiteName, suite] of Object.entries(suitesConfig)) {
+    if (suiteName === 'allowedOverlaps' || !suite.evals) continue;
+
+    for (const evalFile of suite.evals) {
+      // Sentinel entry, not a file path; nothing to validate on disk.
+      if (evalFile === 'ALL_ALWAYS_PASSING') continue;
+
+      if (!fs.existsSync(evalFile)) {
+        errors.push(
+          `Suite **${suiteName}** references non-existent file: **${evalFile}**`,
+        );
+        continue;
+      }
+
+      // Check if the eval file itself is in the suite's trigger patterns
+      if (!suite.patterns || !suite.patterns.includes(evalFile)) {
+        errors.push(
+          `Trigger coverage missing: **${evalFile}** is in the **${suiteName}** suite but is missing from its **patterns** array. (Changes to the test won't trigger itself correctly).`,
+        );
+      }
+
+      const owningSuites = evalToSuiteMap.get(evalFile) || [];
+      if (owningSuites.length > 0 && !allowedOverlaps.has(evalFile)) {
+        errors.push(
+          `Overlap detected: **${evalFile}** is present in both **${owningSuites.join(', ')}** and **${suiteName}** suites.`,
+        );
+      } else {
+        evalToSuiteMap.set(evalFile, [...owningSuites, suiteName]);
+      }
+    }
+  }
+
+  // 2. Check for orphaned evals (on disk but not in suites.json)
+  for (const diskFile of evalFilesOnDisk) {
+    if (!evalToSuiteMap.has(diskFile)) {
+      errors.push(
+        `Orphaned eval detected: **${diskFile}** is not mapped to any suite in ${SUITES_PATH}.`,
+      );
+    }
+  }
+
+  if (errors.length > 0) {
+    console.error('\n❌ Eval Suite Validation Failed:');
+    errors.forEach((err) => console.error(`  - ${err}`));
+
+    const hasOverlap = errors.some((err) => err.includes('Overlap detected'));
+    if (hasOverlap) {
+      console.error(
+        `\n💡 Tip: If this overlap is intentional, add the file path to the 'allowedOverlaps' list in ${SUITES_PATH}.`,
+      );
+    } else {
+      console.error(`\n💡 Tip: Update ${SUITES_PATH} to resolve these issues.`);
+    }
+    process.exit(1);
+  }
+
+  console.log(
+    '✅ Eval Suite Validation Passed: All files mapped and no overlaps found.',
+  );
+}
+
+main();