/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * @fileoverview Executes a high-signal regression check for behavioral evaluations.
 *
 * This script runs a targeted set of stable tests in an optimistic first pass.
 * If failures occur, it employs a "Best-of-4" retry logic to handle natural flakiness.
 * For confirmed failures (0/3), it performs Dynamic Baseline Verification by
 * checking the failure against the 'main' branch to distinguish between
 * model drift and PR-introduced regressions.
 */

import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { quote } from 'shell-quote';
import { escapeRegex } from './eval_utils.js';

/** Message used to label the temporary stash created during baseline checks. */
const STASH_LABEL = 'eval-regression-check-stash';

/** Location of the aggregated report written by saveResults(). */
const FINAL_REPORT_PATH = 'evals/logs/pr_final_report.json';

/**
 * Runs a set of tests using Vitest and returns the parsed JSON report.
 *
 * @param {string} [files] Space-separated test paths; defaults to 'evals/'.
 * @param {string} [pattern] Test-name pattern forwarded to Vitest's `-t` flag.
 *     When empty or missing, no `-t` flag is passed.
 * @param {string} model Value exported to the child process as GEMINI_MODEL.
 * @returns {object|null} The parsed report, or null when no report file was
 *     produced.
 */
function runTests(files, pattern, model) {
  const outputDir = path.resolve(
    process.cwd(),
    `evals/logs/pr-run-${Date.now()}`,
  );
  fs.mkdirSync(outputDir, { recursive: true });
  const reportPath = path.join(outputDir, 'report.json');

  const filesToRun = files || 'evals/';
  console.log(
    `🚀 Running tests in ${filesToRun} with pattern: ${pattern?.slice(0, 100)}...`,
  );

  // The pattern is shell-quoted rather than wrapped in bare double quotes:
  // inside double quotes a POSIX shell still rewrites `\$`, `\\`, backticks
  // and `$(...)`, which corrupts regex-escaped test names and breaks on names
  // that contain a double quote. `filesToRun` is intentionally left unquoted
  // so the shell keeps splitting it into separate path arguments.
  const patternFlag = pattern ? ` -t ${quote([pattern])}` : '';
  const outputFlag = quote([`--outputFile=${reportPath}`]);
  const cmd = `npx vitest run --config evals/vitest.config.ts ${filesToRun}${patternFlag} --reporter=json --reporter=default ${outputFlag}`;

  try {
    execSync(cmd, {
      stdio: 'inherit',
      env: { ...process.env, RUN_EVALS: '1', GEMINI_MODEL: model },
    });
  } catch {
    // Vitest returns a non-zero exit code when tests fail. This is expected.
    // We continue execution and handle the failures by parsing the JSON report.
  }

  return fs.existsSync(reportPath)
    ? JSON.parse(fs.readFileSync(reportPath, 'utf-8'))
    : null;
}

/**
 * Helper to find a specific assertion by name across all test files.
 *
 * @param {object|null} report Report as returned by runTests().
 * @param {string} testName Exact assertion title to look for.
 * @returns {object|null} The first assertion whose title matches, or null.
 */
function findAssertion(report, testName) {
  for (const fileResult of report?.testResults ?? []) {
    const assertion = fileResult.assertionResults?.find(
      (a) => a.title === testName,
    );
    if (assertion) return assertion;
  }
  return null;
}

/**
 * Parses command line arguments to identify model, files, and test pattern.
 *
 * Two invocation shapes are recognised:
 *   - `<model> [files...] --test-pattern <pattern...>`
 *   - manual mode: `<pattern> <model>`; the files are discovered by grepping
 *     `evals/*.eval.ts` for the pattern.
 *
 * Exits the process with code 1 when manual mode is selected without a model.
 *
 * @returns {{model: string, files: string, pattern: string, isManual?: boolean}}
 */
function parseArgs() {
  const [, , modelArg, ...remainingArgs] = process.argv;
  const fullArgsString = remainingArgs.join(' ');

  const testPatternIndex = remainingArgs.indexOf('--test-pattern');
  if (testPatternIndex !== -1) {
    return {
      model: modelArg,
      files: remainingArgs.slice(0, testPatternIndex).join(' '),
      pattern: remainingArgs.slice(testPatternIndex + 1).join(' '),
    };
  }

  // The flag may also arrive embedded in a single argument (e.g. when the
  // caller passes the files and pattern as one quoted string).
  if (fullArgsString.includes('--test-pattern')) {
    const parts = fullArgsString.split('--test-pattern');
    return {
      model: modelArg,
      files: parts[0].trim(),
      pattern: parts[1].trim(),
    };
  }

  // Fallback for manual mode: Pattern Model
  const manualPattern = process.argv[2];
  const manualModel = process.argv[3];

  if (!manualModel) {
    console.error('❌ Error: No target model specified.');
    process.exit(1);
  }

  let manualFiles = 'evals/';
  try {
    // `-e` and `--` keep a pattern that starts with '-' from being parsed as
    // a grep option.
    const grepResult = execSync(
      `grep -l -e ${quote([manualPattern])} -- evals/*.eval.ts`,
      { encoding: 'utf-8' },
    );
    manualFiles = grepResult.split('\n').filter(Boolean).join(' ');
  } catch {
    // Grep returns exit code 1 if no files match the pattern.
    // In this case, we fall back to scanning all files in the evals/ directory.
  }

  return {
    model: manualModel,
    files: manualFiles,
    pattern: manualPattern,
    isManual: true,
  };
}

/**
 * Runs the targeted retry logic (Best-of-4) for a failing test.
 *
 * Re-runs the test until it has either 2 passes (cleared as a noisy pass) or
 * 3 failures (handed to verifyBaseline), with at most 4 attempts in total.
 * The counters in `results[testName]` are updated in place.
 *
 * @param {string} testName Title of the failing assertion.
 * @param {object} results Map of test title to `{passed, total, ...}`.
 * @param {string} files Space-separated test paths.
 * @param {string} model Model identifier.
 */
async function runRetries(testName, results, files, model) {
  console.log(`\nRe-evaluating: ${testName}`);

  const entry = results[testName];
  while (
    entry.passed < 2 &&
    entry.total - entry.passed < 3 &&
    entry.total < 4
  ) {
    const attemptNum = entry.total + 1;
    console.log(` Running attempt ${attemptNum}...`);

    const retry = runTests(files, escapeRegex(testName), model);
    const retryAssertion = findAssertion(retry, testName);

    entry.total++;
    if (retryAssertion?.status === 'passed') {
      entry.passed++;
      console.log(
        ` ✅ Attempt ${attemptNum} passed. Score: ${entry.passed}/${entry.total}`,
      );
    } else {
      console.log(
        ` ❌ Attempt ${attemptNum} failed (${retryAssertion?.status || 'unknown'}). Score: ${entry.passed}/${entry.total}`,
      );
    }

    if (entry.passed >= 2) {
      console.log(
        ` ✅ Test cleared as Noisy Pass (${entry.passed}/${entry.total})`,
      );
    } else if (entry.total - entry.passed >= 3) {
      await verifyBaseline(testName, results, files, model);
    }
  }
}

/**
 * Returns the commit id that refs/stash currently points at, or '' when there
 * is no stash. Comparing the value before and after `git stash push` tells
 * whether that push really created a new entry.
 */
function readStashHead() {
  try {
    return execSync('git rev-parse --quiet --verify refs/stash', {
      encoding: 'utf-8',
      stdio: ['ignore', 'pipe', 'ignore'],
    }).trim();
  } catch {
    // rev-parse exits non-zero when refs/stash does not exist.
    return '';
  }
}

/**
 * Undoes the git state changes made for a baseline check.
 *
 * @param {boolean} switchedToMain Whether `git checkout main` succeeded.
 * @param {boolean} stashCreated Whether a stash entry was created.
 * @returns {boolean} True when the original branch and changes are back.
 */
function restoreWorkingTree(switchedToMain, stashCreated) {
  if (switchedToMain) {
    try {
      execSync('git checkout -', { stdio: 'inherit' });
    } catch (error) {
      console.error(
        ` ❌ Failed to return to the original branch: ${error.message}`,
      );
      if (stashCreated) {
        // Popping here would apply the changes on top of the wrong branch.
        console.error(
          ` ❌ Local changes remain stashed as "${STASH_LABEL}".`,
        );
      }
      return false;
    }
  }

  if (stashCreated) {
    try {
      execSync('git stash pop', { stdio: 'inherit' });
    } catch (error) {
      console.error(
        ` ❌ Failed to restore stashed changes ("${STASH_LABEL}"): ${error.message}`,
      );
      return false;
    }
  }
  return true;
}

/**
 * Verifies a potential regression against the 'main' branch.
 *
 * Stashes local changes, checks out 'main', and runs the test up to 3 times
 * (stopping at the first pass). The original branch and any stashed changes
 * are restored whether or not the check succeeds. On success the entry in
 * `results` is marked 'pre-existing' (and counted as cleared) when the test
 * never passes on 'main', or 'regression' otherwise. If verification or the
 * restore step fails, the entry is left unclassified.
 *
 * @param {string} testName Title of the failing assertion.
 * @param {object} results Map of test title to result entry; updated in place.
 * @param {string} files Space-separated test paths.
 * @param {string} model Model identifier.
 */
async function verifyBaseline(testName, results, files, model) {
  console.log('\n--- Step 3: Dynamic Baseline Verification ---');
  console.log(
    `⚠️ Potential regression detected. Verifying baseline on 'main'...`,
  );

  let stashCreated = false;
  let switchedToMain = false;
  let verified = false;
  let baselinePasses = 0;

  try {
    // `git stash push` is a no-op on a clean tree, so detect a new entry by
    // watching refs/stash instead of searching the list for the label (a
    // stale stash with the same label would otherwise be popped later).
    const stashBefore = readStashHead();
    execSync(`git stash push -m "${STASH_LABEL}"`, { stdio: 'inherit' });
    stashCreated = readStashHead() !== stashBefore;

    execSync('git checkout main', { stdio: 'inherit' });
    switchedToMain = true;

    console.log(
      `\n--- Running Baseline Verification on 'main' (Best-of-3) ---`,
    );
    let baselineTotal = 0;
    while (baselinePasses === 0 && baselineTotal < 3) {
      baselineTotal++;
      console.log(` Baseline Attempt ${baselineTotal}...`);
      const baselineRun = runTests(files, escapeRegex(testName), model);
      if (findAssertion(baselineRun, testName)?.status === 'passed') {
        baselinePasses++;
        console.log(` ✅ Baseline Attempt ${baselineTotal} passed.`);
      } else {
        console.log(` ❌ Baseline Attempt ${baselineTotal} failed.`);
      }
    }
    verified = true;
  } catch (error) {
    console.error(` ❌ Failed to verify baseline: ${error.message}`);
  } finally {
    // Only undo the steps that actually happened: `git checkout -` after a
    // failed checkout would jump to an unrelated previous branch.
    if (!restoreWorkingTree(switchedToMain, stashCreated)) {
      verified = false;
    }
  }

  if (!verified) return;

  if (baselinePasses === 0) {
    console.log(
      ` ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`,
    );
    results[testName].status = 'pre-existing';
    results[testName].passed = results[testName].total; // Clear for report
  } else {
    console.log(
      ` ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`,
    );
    results[testName].status = 'regression';
  }
}

/**
 * Processes initial results and orchestrates retries/baseline checks.
 *
 * Only assertions whose status is 'passed' or 'failed' are counted; every
 * failed one goes through runRetries(). The collected results are written
 * with saveResults().
 *
 * @param {object|null} firstPass Report from the first run.
 * @param {string} pattern Unused; kept so existing call sites stay valid.
 * @param {string} model Model identifier.
 * @param {string} files Space-separated test paths.
 * @returns {Promise<boolean>} False when there is no report or no test was
 *     executed, true otherwise (including when tests failed).
 */
async function processResults(firstPass, pattern, model, files) {
  if (!firstPass) return false;

  const results = {};
  const failingTests = [];
  let totalProcessed = 0;

  for (const fileResult of firstPass.testResults ?? []) {
    for (const assertion of fileResult.assertionResults ?? []) {
      if (assertion.status !== 'passed' && assertion.status !== 'failed') {
        continue;
      }

      const name = assertion.title;
      results[name] = {
        passed: assertion.status === 'passed' ? 1 : 0,
        total: 1,
        file: fileResult.name,
      };
      if (assertion.status === 'failed') failingTests.push(name);
      totalProcessed++;
    }
  }

  if (totalProcessed === 0) {
    console.error('❌ Error: No matching tests were found or executed.');
    return false;
  }

  if (failingTests.length === 0) {
    console.log('✅ All trustworthy tests passed on the first try!');
  } else {
    console.log('\n--- Step 2: Best-of-4 Retries ---');
    console.log(
      `⚠️ ${failingTests.length} tests failed the optimistic run. Starting retries...`,
    );
    for (const testName of failingTests) {
      await runRetries(testName, results, files, model);
    }
  }

  saveResults(results);
  return true;
}

/**
 * Writes the final per-test results, with a timestamp, to the report file.
 *
 * @param {object} results Map of test title to result entry.
 */
function saveResults(results) {
  const finalReport = { timestamp: new Date().toISOString(), results };
  fs.mkdirSync(path.dirname(FINAL_REPORT_PATH), { recursive: true });
  fs.writeFileSync(FINAL_REPORT_PATH, JSON.stringify(finalReport, null, 2));
  console.log('\nFinal report saved to evals/logs/pr_final_report.json');
}

/**
 * Entry point: parses the arguments, performs the first run and processes the
 * results. Exits with 0 on success (or when there is nothing to run) and 1
 * otherwise.
 */
async function main() {
  const { model, files, pattern, isManual } = parseArgs();

  if (!isManual) {
    if (!pattern) {
      console.log('No trustworthy tests to run.');
      process.exit(0);
    }
    console.log('\n--- Step 1: Optimistic Run (N=1) ---');
  }

  const firstPass = runTests(files, pattern, model);
  const success = await processResults(firstPass, pattern, model, files);
  process.exit(success ? 0 : 1);
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});