feat: implement high-signal PR regression check for evaluations (#23937)

2026-06-15 22:07:29 -07:00 · 2026-04-01 22:14:43 -07:00
parent e446733b53
commit 973092df50
8 changed files with 1002 additions and 69 deletions
@@ -0,0 +1,305 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Executes a high-signal regression check for behavioral evaluations.
+ *
+ * This script runs a targeted set of stable tests in an optimistic first pass.
+ * If failures occur, it employs a "Best-of-4" retry logic to handle natural flakiness.
+ * For confirmed failures (0/3), it performs Dynamic Baseline Verification by
+ * checking the failure against the 'main' branch to distinguish between
+ * model drift and PR-introduced regressions.
+ */
+
+import { execSync } from 'node:child_process';
+import fs from 'node:fs';
+import path from 'node:path';
+import { quote } from 'shell-quote';
+import { escapeRegex } from './eval_utils.js';
+
+/**
+ * Runs a set of tests using Vitest and returns the results.
+ */
+function runTests(files, pattern, model) {
+  const outputDir = path.resolve(
+    process.cwd(),
+    `evals/logs/pr-run-${Date.now()}`,
+  );
+  fs.mkdirSync(outputDir, { recursive: true });
+
+  const filesToRun = files || 'evals/';
+  console.log(
+    `🚀 Running tests in ${filesToRun} with pattern: ${pattern?.slice(0, 100)}...`,
+  );
+
+  try {
+    const cmd = `npx vitest run --config evals/vitest.config.ts ${filesToRun} -t "${pattern}" --reporter=json --reporter=default --outputFile="${path.join(outputDir, 'report.json')}"`;
+    execSync(cmd, {
+      stdio: 'inherit',
+      env: { ...process.env, RUN_EVALS: '1', GEMINI_MODEL: model },
+    });
+  } catch {
+    // Vitest returns a non-zero exit code when tests fail. This is expected.
+    // We continue execution and handle the failures by parsing the JSON report.
+  }
+
+  const reportPath = path.join(outputDir, 'report.json');
+  return fs.existsSync(reportPath)
+    ? JSON.parse(fs.readFileSync(reportPath, 'utf-8'))
+    : null;
+}
+
+/**
+ * Helper to find a specific assertion by name across all test files.
+ */
+function findAssertion(report, testName) {
+  if (!report?.testResults) return null;
+  for (const fileResult of report.testResults) {
+    const assertion = fileResult.assertionResults.find(
+      (a) => a.title === testName,
+    );
+    if (assertion) return assertion;
+  }
+  return null;
+}
+
+/**
+ * Parses command line arguments to identify model, files, and test pattern.
+ */
+function parseArgs() {
+  const modelArg = process.argv[2];
+  const remainingArgs = process.argv.slice(3);
+  const fullArgsString = remainingArgs.join(' ');
+  const testPatternIndex = remainingArgs.indexOf('--test-pattern');
+
+  if (testPatternIndex !== -1) {
+    return {
+      model: modelArg,
+      files: remainingArgs.slice(0, testPatternIndex).join(' '),
+      pattern: remainingArgs.slice(testPatternIndex + 1).join(' '),
+    };
+  }
+
+  if (fullArgsString.includes('--test-pattern')) {
+    const parts = fullArgsString.split('--test-pattern');
+    return {
+      model: modelArg,
+      files: parts[0].trim(),
+      pattern: parts[1].trim(),
+    };
+  }
+
+  // Fallback for manual mode: Pattern Model
+  const manualPattern = process.argv[2];
+  const manualModel = process.argv[3];
+  if (!manualModel) {
+    console.error('❌ Error: No target model specified.');
+    process.exit(1);
+  }
+
+  let manualFiles = 'evals/';
+  try {
+    const grepResult = execSync(
+      `grep -l ${quote([manualPattern])} evals/*.eval.ts`,
+      { encoding: 'utf-8' },
+    );
+    manualFiles = grepResult.split('\n').filter(Boolean).join(' ');
+  } catch {
+    // Grep returns exit code 1 if no files match the pattern.
+    // In this case, we fall back to scanning all files in the evals/ directory.
+  }
+
+  return {
+    model: manualModel,
+    files: manualFiles,
+    pattern: manualPattern,
+    isManual: true,
+  };
+}
+
+/**
+ * Runs the targeted retry logic (Best-of-4) for a failing test.
+ */
+async function runRetries(testName, results, files, model) {
+  console.log(`\nRe-evaluating: ${testName}`);
+
+  while (
+    results[testName].passed < 2 &&
+    results[testName].total - results[testName].passed < 3 &&
+    results[testName].total < 4
+  ) {
+    const attemptNum = results[testName].total + 1;
+    console.log(`  Running attempt ${attemptNum}...`);
+
+    const retry = runTests(files, escapeRegex(testName), model);
+    const retryAssertion = findAssertion(retry, testName);
+
+    results[testName].total++;
+    if (retryAssertion?.status === 'passed') {
+      results[testName].passed++;
+      console.log(
+        `  ✅ Attempt ${attemptNum} passed. Score: ${results[testName].passed}/${results[testName].total}`,
+      );
+    } else {
+      console.log(
+        `  ❌ Attempt ${attemptNum} failed (${retryAssertion?.status || 'unknown'}). Score: ${results[testName].passed}/${results[testName].total}`,
+      );
+    }
+
+    if (results[testName].passed >= 2) {
+      console.log(
+        `  ✅ Test cleared as Noisy Pass (${results[testName].passed}/${results[testName].total})`,
+      );
+    } else if (results[testName].total - results[testName].passed >= 3) {
+      await verifyBaseline(testName, results, files, model);
+    }
+  }
+}
+
+/**
+ * Verifies a potential regression against the 'main' branch.
+ */
+async function verifyBaseline(testName, results, files, model) {
+  console.log('\n--- Step 3: Dynamic Baseline Verification ---');
+  console.log(
+    `⚠️ Potential regression detected. Verifying baseline on 'main'...`,
+  );
+
+  try {
+    execSync('git stash push -m "eval-regression-check-stash"', {
+      stdio: 'inherit',
+    });
+    const hasStash = execSync('git stash list')
+      .toString()
+      .includes('eval-regression-check-stash');
+    execSync('git checkout main', { stdio: 'inherit' });
+
+    console.log(
+      `\n--- Running Baseline Verification on 'main' (Best-of-3) ---`,
+    );
+    let baselinePasses = 0;
+    let baselineTotal = 0;
+
+    while (baselinePasses === 0 && baselineTotal < 3) {
+      baselineTotal++;
+      console.log(`  Baseline Attempt ${baselineTotal}...`);
+      const baselineRun = runTests(files, escapeRegex(testName), model);
+      if (findAssertion(baselineRun, testName)?.status === 'passed') {
+        baselinePasses++;
+        console.log(`  ✅ Baseline Attempt ${baselineTotal} passed.`);
+      } else {
+        console.log(`  ❌ Baseline Attempt ${baselineTotal} failed.`);
+      }
+    }
+
+    execSync('git checkout -', { stdio: 'inherit' });
+    if (hasStash) execSync('git stash pop', { stdio: 'inherit' });
+
+    if (baselinePasses === 0) {
+      console.log(
+        `  ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`,
+      );
+      results[testName].status = 'pre-existing';
+      results[testName].passed = results[testName].total; // Clear for report
+    } else {
+      console.log(
+        `  ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`,
+      );
+      results[testName].status = 'regression';
+    }
+  } catch (error) {
+    console.error(`  ❌ Failed to verify baseline: ${error.message}`);
+
+    // Best-effort cleanup: try to return to the original branch.
+    try {
+      execSync('git checkout -', { stdio: 'ignore' });
+    } catch {
+      // Ignore checkout errors during cleanup to avoid hiding the original error.
+    }
+  }
+}
+
+/**
+ * Processes initial results and orchestrates retries/baseline checks.
+ */
+async function processResults(firstPass, pattern, model, files) {
+  if (!firstPass) return false;
+
+  const results = {};
+  const failingTests = [];
+  let totalProcessed = 0;
+
+  for (const fileResult of firstPass.testResults) {
+    for (const assertion of fileResult.assertionResults) {
+      if (assertion.status !== 'passed' && assertion.status !== 'failed') {
+        continue;
+      }
+
+      const name = assertion.title;
+      results[name] = {
+        passed: assertion.status === 'passed' ? 1 : 0,
+        total: 1,
+        file: fileResult.name,
+      };
+      if (assertion.status === 'failed') failingTests.push(name);
+      totalProcessed++;
+    }
+  }
+
+  if (totalProcessed === 0) {
+    console.error('❌ Error: No matching tests were found or executed.');
+    return false;
+  }
+
+  if (failingTests.length === 0) {
+    console.log('✅ All trustworthy tests passed on the first try!');
+  } else {
+    console.log('\n--- Step 2: Best-of-4 Retries ---');
+    console.log(
+      `⚠️ ${failingTests.length} tests failed the optimistic run. Starting retries...`,
+    );
+    for (const testName of failingTests) {
+      await runRetries(testName, results, files, model);
+    }
+  }
+
+  saveResults(results);
+  return true;
+}
+
+function saveResults(results) {
+  const finalReport = { timestamp: new Date().toISOString(), results };
+  fs.writeFileSync(
+    'evals/logs/pr_final_report.json',
+    JSON.stringify(finalReport, null, 2),
+  );
+  console.log('\nFinal report saved to evals/logs/pr_final_report.json');
+}
+
+async function main() {
+  const { model, files, pattern, isManual } = parseArgs();
+
+  if (isManual) {
+    const firstPass = runTests(files, pattern, model);
+    const success = await processResults(firstPass, pattern, model, files);
+    process.exit(success ? 0 : 1);
+  }
+
+  if (!pattern) {
+    console.log('No trustworthy tests to run.');
+    process.exit(0);
+  }
+
+  console.log('\n--- Step 1: Optimistic Run (N=1) ---');
+  const firstPass = runTests(files, pattern, model);
+  const success = await processResults(firstPass, pattern, model, files);
+  process.exit(success ? 0 : 1);
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});