mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-03 09:50:40 -07:00
feat: implement high-signal PR regression check for evaluations (#23937)
This commit is contained in:
305
scripts/run_regression_check.js
Normal file
305
scripts/run_regression_check.js
Normal file
@@ -0,0 +1,305 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
/**
|
||||
* @fileoverview Executes a high-signal regression check for behavioral evaluations.
|
||||
*
|
||||
* This script runs a targeted set of stable tests in an optimistic first pass.
|
||||
* If failures occur, it employs a "Best-of-4" retry logic to handle natural flakiness.
|
||||
* For confirmed failures (0/3), it performs Dynamic Baseline Verification by
|
||||
* checking the failure against the 'main' branch to distinguish between
|
||||
* model drift and PR-introduced regressions.
|
||||
*/
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { quote } from 'shell-quote';
|
||||
import { escapeRegex } from './eval_utils.js';
|
||||
|
||||
/**
|
||||
* Runs a set of tests using Vitest and returns the results.
|
||||
*/
|
||||
function runTests(files, pattern, model) {
|
||||
const outputDir = path.resolve(
|
||||
process.cwd(),
|
||||
`evals/logs/pr-run-${Date.now()}`,
|
||||
);
|
||||
fs.mkdirSync(outputDir, { recursive: true });
|
||||
|
||||
const filesToRun = files || 'evals/';
|
||||
console.log(
|
||||
`🚀 Running tests in ${filesToRun} with pattern: ${pattern?.slice(0, 100)}...`,
|
||||
);
|
||||
|
||||
try {
|
||||
const cmd = `npx vitest run --config evals/vitest.config.ts ${filesToRun} -t "${pattern}" --reporter=json --reporter=default --outputFile="${path.join(outputDir, 'report.json')}"`;
|
||||
execSync(cmd, {
|
||||
stdio: 'inherit',
|
||||
env: { ...process.env, RUN_EVALS: '1', GEMINI_MODEL: model },
|
||||
});
|
||||
} catch {
|
||||
// Vitest returns a non-zero exit code when tests fail. This is expected.
|
||||
// We continue execution and handle the failures by parsing the JSON report.
|
||||
}
|
||||
|
||||
const reportPath = path.join(outputDir, 'report.json');
|
||||
return fs.existsSync(reportPath)
|
||||
? JSON.parse(fs.readFileSync(reportPath, 'utf-8'))
|
||||
: null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to find a specific assertion by name across all test files.
|
||||
*/
|
||||
function findAssertion(report, testName) {
|
||||
if (!report?.testResults) return null;
|
||||
for (const fileResult of report.testResults) {
|
||||
const assertion = fileResult.assertionResults.find(
|
||||
(a) => a.title === testName,
|
||||
);
|
||||
if (assertion) return assertion;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses command line arguments to identify model, files, and test pattern.
|
||||
*/
|
||||
function parseArgs() {
|
||||
const modelArg = process.argv[2];
|
||||
const remainingArgs = process.argv.slice(3);
|
||||
const fullArgsString = remainingArgs.join(' ');
|
||||
const testPatternIndex = remainingArgs.indexOf('--test-pattern');
|
||||
|
||||
if (testPatternIndex !== -1) {
|
||||
return {
|
||||
model: modelArg,
|
||||
files: remainingArgs.slice(0, testPatternIndex).join(' '),
|
||||
pattern: remainingArgs.slice(testPatternIndex + 1).join(' '),
|
||||
};
|
||||
}
|
||||
|
||||
if (fullArgsString.includes('--test-pattern')) {
|
||||
const parts = fullArgsString.split('--test-pattern');
|
||||
return {
|
||||
model: modelArg,
|
||||
files: parts[0].trim(),
|
||||
pattern: parts[1].trim(),
|
||||
};
|
||||
}
|
||||
|
||||
// Fallback for manual mode: Pattern Model
|
||||
const manualPattern = process.argv[2];
|
||||
const manualModel = process.argv[3];
|
||||
if (!manualModel) {
|
||||
console.error('❌ Error: No target model specified.');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
let manualFiles = 'evals/';
|
||||
try {
|
||||
const grepResult = execSync(
|
||||
`grep -l ${quote([manualPattern])} evals/*.eval.ts`,
|
||||
{ encoding: 'utf-8' },
|
||||
);
|
||||
manualFiles = grepResult.split('\n').filter(Boolean).join(' ');
|
||||
} catch {
|
||||
// Grep returns exit code 1 if no files match the pattern.
|
||||
// In this case, we fall back to scanning all files in the evals/ directory.
|
||||
}
|
||||
|
||||
return {
|
||||
model: manualModel,
|
||||
files: manualFiles,
|
||||
pattern: manualPattern,
|
||||
isManual: true,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Runs the targeted retry logic (Best-of-4) for a failing test.
|
||||
*/
|
||||
async function runRetries(testName, results, files, model) {
|
||||
console.log(`\nRe-evaluating: ${testName}`);
|
||||
|
||||
while (
|
||||
results[testName].passed < 2 &&
|
||||
results[testName].total - results[testName].passed < 3 &&
|
||||
results[testName].total < 4
|
||||
) {
|
||||
const attemptNum = results[testName].total + 1;
|
||||
console.log(` Running attempt ${attemptNum}...`);
|
||||
|
||||
const retry = runTests(files, escapeRegex(testName), model);
|
||||
const retryAssertion = findAssertion(retry, testName);
|
||||
|
||||
results[testName].total++;
|
||||
if (retryAssertion?.status === 'passed') {
|
||||
results[testName].passed++;
|
||||
console.log(
|
||||
` ✅ Attempt ${attemptNum} passed. Score: ${results[testName].passed}/${results[testName].total}`,
|
||||
);
|
||||
} else {
|
||||
console.log(
|
||||
` ❌ Attempt ${attemptNum} failed (${retryAssertion?.status || 'unknown'}). Score: ${results[testName].passed}/${results[testName].total}`,
|
||||
);
|
||||
}
|
||||
|
||||
if (results[testName].passed >= 2) {
|
||||
console.log(
|
||||
` ✅ Test cleared as Noisy Pass (${results[testName].passed}/${results[testName].total})`,
|
||||
);
|
||||
} else if (results[testName].total - results[testName].passed >= 3) {
|
||||
await verifyBaseline(testName, results, files, model);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies a potential regression against the 'main' branch.
|
||||
*/
|
||||
async function verifyBaseline(testName, results, files, model) {
|
||||
console.log('\n--- Step 3: Dynamic Baseline Verification ---');
|
||||
console.log(
|
||||
`⚠️ Potential regression detected. Verifying baseline on 'main'...`,
|
||||
);
|
||||
|
||||
try {
|
||||
execSync('git stash push -m "eval-regression-check-stash"', {
|
||||
stdio: 'inherit',
|
||||
});
|
||||
const hasStash = execSync('git stash list')
|
||||
.toString()
|
||||
.includes('eval-regression-check-stash');
|
||||
execSync('git checkout main', { stdio: 'inherit' });
|
||||
|
||||
console.log(
|
||||
`\n--- Running Baseline Verification on 'main' (Best-of-3) ---`,
|
||||
);
|
||||
let baselinePasses = 0;
|
||||
let baselineTotal = 0;
|
||||
|
||||
while (baselinePasses === 0 && baselineTotal < 3) {
|
||||
baselineTotal++;
|
||||
console.log(` Baseline Attempt ${baselineTotal}...`);
|
||||
const baselineRun = runTests(files, escapeRegex(testName), model);
|
||||
if (findAssertion(baselineRun, testName)?.status === 'passed') {
|
||||
baselinePasses++;
|
||||
console.log(` ✅ Baseline Attempt ${baselineTotal} passed.`);
|
||||
} else {
|
||||
console.log(` ❌ Baseline Attempt ${baselineTotal} failed.`);
|
||||
}
|
||||
}
|
||||
|
||||
execSync('git checkout -', { stdio: 'inherit' });
|
||||
if (hasStash) execSync('git stash pop', { stdio: 'inherit' });
|
||||
|
||||
if (baselinePasses === 0) {
|
||||
console.log(
|
||||
` ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`,
|
||||
);
|
||||
results[testName].status = 'pre-existing';
|
||||
results[testName].passed = results[testName].total; // Clear for report
|
||||
} else {
|
||||
console.log(
|
||||
` ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`,
|
||||
);
|
||||
results[testName].status = 'regression';
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(` ❌ Failed to verify baseline: ${error.message}`);
|
||||
|
||||
// Best-effort cleanup: try to return to the original branch.
|
||||
try {
|
||||
execSync('git checkout -', { stdio: 'ignore' });
|
||||
} catch {
|
||||
// Ignore checkout errors during cleanup to avoid hiding the original error.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes initial results and orchestrates retries/baseline checks.
|
||||
*/
|
||||
async function processResults(firstPass, pattern, model, files) {
|
||||
if (!firstPass) return false;
|
||||
|
||||
const results = {};
|
||||
const failingTests = [];
|
||||
let totalProcessed = 0;
|
||||
|
||||
for (const fileResult of firstPass.testResults) {
|
||||
for (const assertion of fileResult.assertionResults) {
|
||||
if (assertion.status !== 'passed' && assertion.status !== 'failed') {
|
||||
continue;
|
||||
}
|
||||
|
||||
const name = assertion.title;
|
||||
results[name] = {
|
||||
passed: assertion.status === 'passed' ? 1 : 0,
|
||||
total: 1,
|
||||
file: fileResult.name,
|
||||
};
|
||||
if (assertion.status === 'failed') failingTests.push(name);
|
||||
totalProcessed++;
|
||||
}
|
||||
}
|
||||
|
||||
if (totalProcessed === 0) {
|
||||
console.error('❌ Error: No matching tests were found or executed.');
|
||||
return false;
|
||||
}
|
||||
|
||||
if (failingTests.length === 0) {
|
||||
console.log('✅ All trustworthy tests passed on the first try!');
|
||||
} else {
|
||||
console.log('\n--- Step 2: Best-of-4 Retries ---');
|
||||
console.log(
|
||||
`⚠️ ${failingTests.length} tests failed the optimistic run. Starting retries...`,
|
||||
);
|
||||
for (const testName of failingTests) {
|
||||
await runRetries(testName, results, files, model);
|
||||
}
|
||||
}
|
||||
|
||||
saveResults(results);
|
||||
return true;
|
||||
}
|
||||
|
||||
function saveResults(results) {
|
||||
const finalReport = { timestamp: new Date().toISOString(), results };
|
||||
fs.writeFileSync(
|
||||
'evals/logs/pr_final_report.json',
|
||||
JSON.stringify(finalReport, null, 2),
|
||||
);
|
||||
console.log('\nFinal report saved to evals/logs/pr_final_report.json');
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const { model, files, pattern, isManual } = parseArgs();
|
||||
|
||||
if (isManual) {
|
||||
const firstPass = runTests(files, pattern, model);
|
||||
const success = await processResults(firstPass, pattern, model, files);
|
||||
process.exit(success ? 0 : 1);
|
||||
}
|
||||
|
||||
if (!pattern) {
|
||||
console.log('No trustworthy tests to run.');
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
console.log('\n--- Step 1: Optimistic Run (N=1) ---');
|
||||
const firstPass = runTests(files, pattern, model);
|
||||
const success = await processResults(firstPass, pattern, model, files);
|
||||
process.exit(success ? 0 : 1);
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error(err);
|
||||
process.exit(1);
|
||||
});
|
||||
Reference in New Issue
Block a user