feat(evals): implement related evaluation system for targeted testing

2026-07-19 06:20:44 -07:00 · 2026-04-07 19:35:12 -07:00
parent 06fcdc231c
commit db8910c39b
9 changed files with 610 additions and 43 deletions
@@ -3,7 +3,23 @@
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
+
+/**
+ * @fileoverview Intelligence layer for detecting steering and behavior changes.
+ *
+ * This script identifies if code changes affect model steering (system prompts,
+ * tool definitions, agent instructions) and maps them to relevant evaluation
+ * suites. It supports both CI (GitHub Actions) and local development workflows.
+ *
+ * Detection Methods:
+ * 1. Path-based: Monitors critical steering and tool directories.
+ * 2. Signature-based: Scans diff content for core steering primitives
+ *    (e.g., ToolDefinition, inputSchema).
+ * 3. Suite-aware: Uses evals/suites.json to identify related tests for surgical runs.
+ */
+
 import { execSync } from 'node:child_process';
+import fs from 'node:fs';

 const CORE_STEERING_PATHS = [
  'packages/core/src/prompts/',
@@ -20,46 +36,132 @@ const STEERING_SIGNATURES = [
  "kind: 'local'",
 ];

+/**
+ * Minimal glob matcher for the path patterns listed in evals/suites.json.
+ *
+ * Supported syntax (every other character is matched literally):
+ *  - a single `*` matches any run of characters inside one path segment;
+ *  - a double `**` matches any run of characters, `/` included;
+ *  - a double-star segment followed by a slash additionally matches zero
+ *    directories, so "a / double-star / b.ts" matches both `a/b.ts` and
+ *    `a/x/y/b.ts`.
+ *
+ * A pattern without any `*` must equal the file path exactly.
+ *
+ * @param {string} file Repository-relative path using forward slashes.
+ * @param {string} pattern Glob pattern to test the path against.
+ * @returns {boolean} True when the whole path matches the pattern.
+ */
+function minimatch(file, pattern) {
+  if (!pattern.includes('*')) {
+    return file === pattern;
+  }
+
+  // Translate the glob in a single pass. Chaining several `.replace()` calls
+  // would let a later replacement rewrite the `*` inside an already emitted
+  // `.*`, which silently breaks every double-star pattern.
+  const globToken = /(?<=^|\/)\*\*\/|\*\*|\*|[.+?^${}()|[\]\\]/g;
+  const source = pattern.replace(globToken, (token) => {
+    if (token === '**/') return '(?:.*/)?';
+    if (token === '**') return '.*';
+    if (token === '*') return '[^/]*';
+    // Any other token is a regex metacharacter that must be taken literally.
+    return `\\${token}`;
+  });
+
+  return new RegExp(`^${source}$`).test(file);
+}
+
 function main() {
  const targetBranch = process.env.GITHUB_BASE_REF || 'main';
  const verbose = process.argv.includes('--verbose');
  const steeringOnly = process.argv.includes('--steering-only');
+  const isRelatedMode = process.argv.includes('--related');
+  const isJsonMode = process.argv.includes('--json');

  try {
    const remoteUrl = process.env.GITHUB_REPOSITORY
      ? `https://github.com/${process.env.GITHUB_REPOSITORY}.git`
      : 'origin';

-    // Fetch target branch from the remote.
-    execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
-      stdio: 'ignore',
-    });
+    let changedFiles = [];
+    const isCi = !!process.env.GITHUB_ACTIONS;

-    // Get changed files using the triple-dot syntax which correctly handles merge commits
-    const head = process.env.PR_HEAD_SHA || 'HEAD';
-    const changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
-      encoding: 'utf-8',
-    })
-      .split('\n')
-      .filter(Boolean);
+    if (isCi) {
+      try {
+        // 1. Try fetching from remote (CI environment)
+        execSync(`git fetch ${remoteUrl} ${targetBranch}`, {
+          stdio: 'ignore',
+        });
+
+        // Get changed files using the triple-dot syntax which correctly handles merge commits
+        const head = process.env.PR_HEAD_SHA || 'HEAD';
+        changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
+          encoding: 'utf-8',
+        })
+          .split('\n')
+          .filter(Boolean);
+      } catch (e) {
+        if (verbose)
+          process.stderr.write(
+            `Warning: git fetch failed in CI: ${e.message}\n`,
+          );
+      }
+    }
+
+    // 2. Local fallback or if CI fetch failed: Try diffing against target branch
+    if (changedFiles.length === 0) {
+      try {
+        changedFiles = execSync(`git diff --name-only ${targetBranch}`, {
+          encoding: 'utf-8',
+        })
+          .split('\n')
+          .filter(Boolean);
+      } catch {
+        // 3. Last resort: Just diff against HEAD (uncommitted changes only)
+        changedFiles = execSync('git diff --name-only HEAD', {
+          encoding: 'utf-8',
+        })
+          .split('\n')
+          .filter(Boolean);
+      }
+
+      // Also include untracked files in local mode
+      const untracked = execSync('git ls-files --others --exclude-standard', {
+        encoding: 'utf-8',
+      })
+        .split('\n')
+        .filter(Boolean);
+      changedFiles = [...new Set([...changedFiles, ...untracked])];
+    }

    let detected = false;
    const reasons = [];
+    const affectedSuites = new Set();
+    const rationales = [];
+
+    // Load suites for --related mode
+    let suitesConfig = null;
+    if (isRelatedMode) {
+      try {
+        suitesConfig = JSON.parse(
+          fs.readFileSync('evals/suites.json', 'utf-8'),
+        );
+      } catch {
+        process.stderr.write(`Warning: Could not load evals/suites.json\n`);
+      }
+    }

    // 1. Path-based detection
    for (const file of changedFiles) {
      if (CORE_STEERING_PATHS.some((prefix) => file.startsWith(prefix))) {
        detected = true;
        reasons.push(`Matched core steering path: ${file}`);
-        if (!verbose) break;
      }
      if (
        !steeringOnly &&
-        TEST_PATHS.some((prefix) => file.startsWith(prefix))
+        TEST_PATHS.some((prefix) => file.startsWith(prefix)) &&
+        file.endsWith('.eval.ts')
      ) {
        detected = true;
-        reasons.push(`Matched test path: ${file}`);
-        if (!verbose) break;
+        reasons.push(`Matched test file: ${file}`);
+      }
+
+      // Related suite detection
+      if (suitesConfig) {
+        for (const [suiteName, suite] of Object.entries(suitesConfig)) {
+          if (suiteName === 'allowedOverlaps' || !suite.patterns) continue;
+
+          if (suite.patterns.some((pattern) => minimatch(file, pattern))) {
+            affectedSuites.add(suiteName);
+            rationales.push(
+              `Testing **${suiteName}** because **${file}** was modified.`,
+            );
+          }
+        }
      }
    }

@@ -70,15 +172,30 @@ function main() {
      );
      if (coreChanges.length > 0) {
        // Get the actual diff content for core files
-        const diff = execSync(
-          `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`,
-          { encoding: 'utf-8' },
-        );
+        // We need to be careful with the diff command depending on if we have FETCH_HEAD
+        let diffCmd = '';
+        try {
+          const head = process.env.PR_HEAD_SHA || 'HEAD';
+          diffCmd = `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`;
+          execSync('git rev-parse FETCH_HEAD', { stdio: 'ignore' });
+        } catch {
+          diffCmd = `git diff -U0 ${targetBranch} -- packages/core/src/`;
+        }
+
+        const diff = execSync(diffCmd, { encoding: 'utf-8' });
        for (const sig of STEERING_SIGNATURES) {
          if (diff.includes(sig)) {
            detected = true;
            reasons.push(`Matched steering signature in core: ${sig}`);
-            if (!verbose) break;
+
+            // If we detected a steering signature, mark core_steering suite
+            if (isRelatedMode) {
+              affectedSuites.add('core_steering');
+              rationales.push(
+                `Testing **core_steering** because matched signature '${sig}' in core files.`,
+              );
+            }
+            if (!verbose && !isRelatedMode) break;
          }
        }
      }
@@ -89,14 +206,38 @@ function main() {
      reasons.forEach((r) => process.stderr.write(` - ${r}\n`));
    }

-    process.stdout.write(detected ? 'true' : 'false');
+    if (isJsonMode) {
+      process.stdout.write(
+        JSON.stringify(
+          {
+            detected,
+            reasons,
+            affectedSuites: Array.from(affectedSuites),
+            rationales,
+          },
+          null,
+          2,
+        ),
+      );
+    } else {
+      process.stdout.write(detected ? 'true' : 'false');
+    }
  } catch (error) {
-    // If anything fails (e.g., no git history), run evals/guidance to be safe
-    process.stderr.write(
-      'Warning: Failed to determine if changes occurred. Defaulting to true.\n',
-    );
+    if (isJsonMode) {
+      process.stdout.write(
+        JSON.stringify({
+          detected: true,
+          reasons: [`Error during detection: ${error.message}`],
+          affectedSuites: ['core_steering'],
+          rationales: [
+            'Error during detection: running all stable evals for safety.',
+          ],
+        }),
+      );
+    } else {
+      process.stdout.write('true');
+    }
    process.stderr.write(String(error) + '\n');
-    process.stdout.write('true');
  }
 }

@@ -13,6 +13,7 @@
 * to ensure high-signal validation and minimize noise.
 */

+import fs from 'node:fs';
 import { fetchNightlyHistory, escapeRegex } from './eval_utils.js';

 const LOOKBACK_COUNT = 6;
@@ -25,11 +26,24 @@ const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18)
 */
 function main() {
  const targetModel = process.argv[2];
-  if (!targetModel) {
+  if (!targetModel || targetModel.startsWith('--')) {
    console.error('❌ Error: No target model specified.');
    process.exit(1);
  }
+
+  // Parse --suites argument
+  const suitesArgIndex = process.argv.indexOf('--suites');
+  let requestedSuites = null;
+  if (suitesArgIndex !== -1 && process.argv[suitesArgIndex + 1]) {
+    requestedSuites = process.argv[suitesArgIndex + 1]
+      .split(',')
+      .map((s) => s.trim());
+  }
+
  console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
+  if (requestedSuites) {
+    console.error(`📂 Filtering by suites: ${requestedSuites.join(', ')}`);
+  }

  const history = fetchNightlyHistory(LOOKBACK_COUNT);
  if (history.length === 0) {
@@ -37,6 +51,32 @@ function main() {
    process.exit(1);
  }

+  // Load suites configuration
+  let allowedFiles = null;
+  let runAllStable = false;
+  if (requestedSuites) {
+    try {
+      const suitesConfig = JSON.parse(
+        fs.readFileSync('evals/suites.json', 'utf-8'),
+      );
+      allowedFiles = new Set();
+      for (const suiteName of requestedSuites) {
+        const suite = suitesConfig[suiteName];
+        if (suite) {
+          if (suite.evals.includes('ALL_ALWAYS_PASSING')) {
+            runAllStable = true;
+          } else {
+            suite.evals.forEach((file) => allowedFiles.add(file));
+          }
+        }
+      }
+    } catch (e) {
+      console.error(
+        `⚠️ Warning: Could not load evals/suites.json or match suites: ${e.message}`,
+      );
+    }
+  }
+
  // Aggregate results for the target model across all history
  const testHistories = {}; // { [testName]: { totalPassed: 0, totalRuns: 0, dailyRates: [], file: string } }

@@ -83,11 +123,28 @@ function main() {
    const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;

    if (isDailyStable && isAggregateHighSignal) {
-      trustworthyTests.push(testName);
-      if (info.file) {
-        const match = info.file.match(/evals\/.*\.eval\.ts/);
-        if (match) {
-          trustworthyFiles.add(match[0]);
+      // Suite filtering logic
+      let isFileAllowed = true;
+      if (requestedSuites && !runAllStable) {
+        if (info.file) {
+          const match = info.file.match(/evals\/.*\.eval\.ts/);
+          if (match && !allowedFiles.has(match[0])) {
+            isFileAllowed = false;
+          } else if (!match) {
+            isFileAllowed = false;
+          }
+        } else {
+          isFileAllowed = false;
+        }
+      }
+
+      if (isFileAllowed) {
+        trustworthyTests.push(testName);
+        if (info.file) {
+          const match = info.file.match(/evals\/.*\.eval\.ts/);
+          if (match) {
+            trustworthyFiles.add(match[0]);
+          }
        }
      }
    } else {
@@ -99,10 +156,14 @@ function main() {
    `✅ Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`,
  );
  trustworthyTests.sort().forEach((name) => console.error(`   - ${name}`));
-  console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
-  console.error(
-    `🆕 Ignored ${newTests.length} tests with insufficient history.`,
-  );
+  if (volatileTests.length > 0) {
+    console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
+  }
+  if (newTests.length > 0) {
+    console.error(
+      `🆕 Ignored ${newTests.length} tests with insufficient history.`,
+    );
+  }

  // Output the list of names as a regex-friendly pattern for vitest -t
  const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|');
@@ -500,6 +500,9 @@ function main() {
  if (args.includes('--check-github-actions-pinning')) {
    runGithubActionsPinningLinter();
  }
+  if (args.includes('--eval-suites')) {
+    runEvalSuiteLinter();
+  }

  if (args.length === 0) {
    setupLinters();
@@ -511,8 +514,18 @@ function main() {
    runSensitiveKeywordLinter();
    runTSConfigLinter();
    runGithubActionsPinningLinter();
+    runEvalSuiteLinter();
    console.log('\nAll linting checks passed!');
  }
 }

+/**
+ * Runs scripts/validate_eval_suites.js as a child process, sharing this
+ * process's stdio with it.
+ *
+ * Exits the current process with code 1 when the command throws, which
+ * execSync does on a non-zero exit status or when the command cannot be run.
+ */
+export function runEvalSuiteLinter() {
+  console.log('\nRunning eval suite linter...');
+
+  const validatorCommand = 'node scripts/validate_eval_suites.js';
+  const childOptions = { stdio: 'inherit' };
+  try {
+    execSync(validatorCommand, childOptions);
+  } catch {
+    // Child output already went through the inherited stdio, so there is
+    // nothing further to report here.
+    process.exit(1);
+  }
+}
+
 main();
@@ -22,22 +22,62 @@ import fs from 'node:fs';
 async function main() {
  const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview';
  const models = modelList.split(',').map((m) => m.trim());
+  const isRelatedMode = process.argv.includes('--related');

  let combinedReport = '';
  let hasRegression = false;
+  let detectionRationale = '';
+  let affectedSuitesStr = '';

  console.log(
    `🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
  );

+  if (isRelatedMode) {
+    console.log('🔍 Identifying related evaluations based on changes...');
+    try {
+      const detectionOutput = execSync(
+        `node scripts/changed_prompt.js --related --json`,
+        { encoding: 'utf-8', stdio: ['inherit', 'pipe', 'inherit'] },
+      ).trim();
+      const detection = JSON.parse(detectionOutput);
+
+      if (detection.affectedSuites && detection.affectedSuites.length > 0) {
+        affectedSuitesStr = detection.affectedSuites.join(',');
+        detectionRationale = '### 🧪 Related Evaluation Rationale\n\n';
+        detection.rationales.forEach((r) => {
+          detectionRationale += `- ${r}\n`;
+        });
+        detectionRationale +=
+          '\n_Something missing? [Update evals/suites.json](evals/README.md#related-testing-with-related) to adjust detection logic._\n\n---\n\n';
+      } else if (!detection.detected) {
+        console.log('✅ No related changes detected. Skipping evaluations.');
+        process.exit(0);
+      } else {
+        console.log(
+          '⚠️ Changes detected but no specific suites matched. Running full stable suite for safety.',
+        );
+        detectionRationale =
+          '### 🧪 Related Evaluation Rationale\n\n- No specific suites matched. Running full stable suite for safety.\n\n---\n\n';
+      }
+    } catch (e) {
+      console.error(`❌ Error during suite detection: ${e.message}`);
+      detectionRationale =
+        '### 🧪 Related Evaluation Rationale\n\n- Error during suite detection. Running full stable suite for safety.\n\n---\n\n';
+    }
+  }
+
  for (const model of models) {
    console.log(`\n--- Processing Model: ${model} ---`);

    try {
      // 1. Identify Trustworthy Evals
      console.log(`🔍 Identifying trustworthy tests for ${model}...`);
+      const suitesFlag = affectedSuitesStr
+        ? `--suites ${affectedSuitesStr}`
+        : '';
      const output = execSync(
-        `node scripts/get_trustworthy_evals.js "${model}"`,
+        `node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`,
        {
          encoding: 'utf-8',
          stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr
@@ -83,7 +123,8 @@ async function main() {

  // Always save the combined report to a file so the workflow can capture it cleanly
  if (combinedReport) {
-    fs.writeFileSync('eval_regression_report.md', combinedReport);
+    const finalReport = detectionRationale + combinedReport;
+    fs.writeFileSync('eval_regression_report.md', finalReport);
    console.log(
      '\n📊 Final Markdown report saved to eval_regression_report.md',
    );
@@ -0,0 +1,98 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+
+const SUITES_PATH = 'evals/suites.json';
+const EVALS_DIR = 'evals';
+
+/**
+ * Validates evals/suites.json against the eval files on disk.
+ *
+ * Checks performed:
+ *  - every file listed in a suite's `evals` array exists;
+ *  - every listed eval file also appears verbatim in that suite's `patterns`
+ *    array, so that editing the test triggers its own suite;
+ *  - no eval file is listed in more than one suite unless it is named in the
+ *    top-level `allowedOverlaps` array;
+ *  - every `*.eval.ts` file directly inside the evals directory is mapped to
+ *    at least one suite.
+ *
+ * All problems are collected and printed to stderr; the process then exits
+ * with code 1. A missing or unparsable suites file also exits with code 1.
+ */
+function main() {
+  if (!fs.existsSync(SUITES_PATH)) {
+    console.error(`❌ Error: ${SUITES_PATH} not found.`);
+    process.exit(1);
+  }
+
+  // Report malformed configuration as a lint failure rather than letting an
+  // uncaught exception print a stack trace.
+  let suitesConfig;
+  try {
+    suitesConfig = JSON.parse(fs.readFileSync(SUITES_PATH, 'utf-8'));
+    if (
+      suitesConfig === null ||
+      typeof suitesConfig !== 'object' ||
+      Array.isArray(suitesConfig)
+    ) {
+      throw new Error('expected a top-level JSON object');
+    }
+  } catch (e) {
+    console.error(`❌ Error: Could not parse ${SUITES_PATH}: ${e.message}`);
+    process.exit(1);
+  }
+
+  const allowedOverlaps = new Set(suitesConfig.allowedOverlaps || []);
+
+  // suites.json uses forward slashes, so build the on-disk paths with the
+  // POSIX joiner; the platform joiner would emit backslashes on Windows and
+  // every eval would be reported as orphaned.
+  const evalFilesOnDisk = fs
+    .readdirSync(EVALS_DIR)
+    .filter((f) => f.endsWith('.eval.ts'))
+    .map((f) => path.posix.join(EVALS_DIR, f));
+
+  // eval file path -> names of the suites that list it
+  const evalToSuiteMap = new Map();
+  const errors = [];
+
+  // 1. Map evals to suites and check for overlaps/trigger-coverage
+  for (const [suiteName, suite] of Object.entries(suitesConfig)) {
+    if (suiteName === 'allowedOverlaps' || !suite || !suite.evals) continue;
+
+    for (const evalFile of suite.evals) {
+      // Sentinel entry rather than a file path; nothing to validate.
+      if (evalFile === 'ALL_ALWAYS_PASSING') continue;
+
+      if (!fs.existsSync(evalFile)) {
+        errors.push(
+          `Suite **${suiteName}** references non-existent file: **${evalFile}**`,
+        );
+        continue;
+      }
+
+      // Check if the eval file itself is in the suite's trigger patterns
+      if (!suite.patterns || !suite.patterns.includes(evalFile)) {
+        errors.push(
+          `Trigger coverage missing: **${evalFile}** is in the **${suiteName}** suite but is missing from its **patterns** array. (Changes to the test won't trigger itself correctly).`,
+        );
+      }
+
+      const mappedSuites = evalToSuiteMap.get(evalFile);
+      if (mappedSuites && !allowedOverlaps.has(evalFile)) {
+        errors.push(
+          `Overlap detected: **${evalFile}** is present in both **${mappedSuites}** and **${suiteName}** suites.`,
+        );
+      } else {
+        evalToSuiteMap.set(evalFile, [...(mappedSuites || []), suiteName]);
+      }
+    }
+  }
+
+  // 2. Check for orphaned evals (on disk but not in suites.json)
+  for (const diskFile of evalFilesOnDisk) {
+    if (!evalToSuiteMap.has(diskFile)) {
+      errors.push(
+        `Orphaned eval detected: **${diskFile}** is not mapped to any suite in ${SUITES_PATH}.`,
+      );
+    }
+  }
+
+  if (errors.length > 0) {
+    console.error('\n❌ Eval Suite Validation Failed:');
+    errors.forEach((err) => console.error(`  - ${err}`));
+
+    const hasOverlap = errors.some((err) => err.includes('Overlap detected'));
+    if (hasOverlap) {
+      console.error(
+        `\n💡 Tip: If this overlap is intentional, add the file path to the 'allowedOverlaps' list in ${SUITES_PATH}.`,
+      );
+    } else {
+      console.error(`\n💡 Tip: Update ${SUITES_PATH} to resolve these issues.`);
+    }
+    process.exit(1);
+  }
+
+  console.log(
+    '✅ Eval Suite Validation Passed: All files mapped and no overlaps found.',
+  );
+}
+
+main();