mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-18 07:43:00 -07:00
feat(evals): implement force-run for modified test files
This commit is contained in:
@@ -20,6 +20,7 @@
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import fs from 'node:fs';
|
||||
import { minimatch } from 'minimatch';
|
||||
|
||||
const CORE_STEERING_PATHS = [
|
||||
'packages/core/src/prompts/',
|
||||
@@ -36,25 +37,6 @@ const STEERING_SIGNATURES = [
|
||||
"kind: 'local'",
|
||||
];
|
||||
|
||||
/**
 * Minimal glob matcher for '/'-separated paths.
 *
 * Supported syntax (everything else in the pattern is matched literally):
 *   - `*`        any run of characters within a single path segment (no '/')
 *   - `**`       any run of characters, including '/'
 *   - `dir/**`   `dir` itself and everything beneath it
 *   - `**` followed by '/' at the start of a segment: zero or more
 *                whole directories
 *
 * A pattern without any `*` is compared for exact equality.
 *
 * @param {string} file - Path to test.
 * @param {string} pattern - Glob pattern to test against.
 * @returns {boolean} True when the whole of `file` matches `pattern`.
 */
function minimatch(file, pattern) {
  // No wildcard at all: plain string comparison, no regex needed.
  if (!pattern.includes('*')) {
    return file === pattern;
  }

  // Translate the glob in a single pass. Doing it with chained replace()
  // calls is unsafe: a later `*` replacement would rewrite the `*` that an
  // earlier `**` replacement just emitted.
  const globToken = /\/\*\*$|(?<=^|\/)\*\*\/|\*\*|\*|[.+?^${}()|[\]\\]/g;
  const source = pattern.replace(globToken, (token) => {
    switch (token) {
      case '/**':
        // Trailing globstar: the directory itself or anything below it.
        // Anchoring on '/' keeps 'core/**' from matching 'core-extra/x'.
        return '(?:/.*)?';
      case '**/':
        // Leading/middle globstar: zero or more complete directories.
        return '(?:.*/)?';
      case '**':
        return '.*';
      case '*':
        return '[^/]*';
      default:
        // Regex metacharacter appearing literally in the path.
        return `\\${token}`;
    }
  });

  return new RegExp(`^${source}$`).test(file);
}
|
||||
|
||||
function main() {
|
||||
const targetBranch = process.env.GITHUB_BASE_REF || 'main';
|
||||
const verbose = process.argv.includes('--verbose');
|
||||
@@ -122,6 +104,7 @@ function main() {
|
||||
const reasons = [];
|
||||
const affectedSuites = new Set();
|
||||
const rationales = [];
|
||||
const modifiedTestFiles = [];
|
||||
|
||||
// Load suites for --related mode
|
||||
let suitesConfig = null;
|
||||
@@ -148,6 +131,7 @@ function main() {
|
||||
) {
|
||||
detected = true;
|
||||
reasons.push(`Matched test file: ${file}`);
|
||||
modifiedTestFiles.push(file);
|
||||
}
|
||||
|
||||
// Related suite detection
|
||||
@@ -157,9 +141,11 @@ function main() {
|
||||
|
||||
if (suite.patterns.some((pattern) => minimatch(file, pattern))) {
|
||||
affectedSuites.add(suiteName);
|
||||
rationales.push(
|
||||
`Testing **${suiteName}** because **${file}** was modified.`,
|
||||
);
|
||||
const isTestFile = file.endsWith('.eval.ts');
|
||||
const rationale = isTestFile
|
||||
? `Force-testing all tests in **${file}** (part of **${suiteName}** suite) because the file was modified.`
|
||||
: `Testing **${suiteName}** because **${file}** was modified.`;
|
||||
rationales.push(rationale);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -214,6 +200,7 @@ function main() {
|
||||
reasons,
|
||||
affectedSuites: Array.from(affectedSuites),
|
||||
rationales,
|
||||
modifiedTestFiles,
|
||||
},
|
||||
null,
|
||||
2,
|
||||
|
||||
@@ -40,10 +40,22 @@ function main() {
|
||||
.map((s) => s.trim());
|
||||
}
|
||||
|
||||
// Parse --force-run-files argument
|
||||
const forceRunArgIndex = process.argv.indexOf('--force-run-files');
|
||||
let forceRunFiles = [];
|
||||
if (forceRunArgIndex !== -1 && process.argv[forceRunArgIndex + 1]) {
|
||||
forceRunFiles = process.argv[forceRunArgIndex + 1]
|
||||
.split(',')
|
||||
.map((f) => f.trim());
|
||||
}
|
||||
|
||||
console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
|
||||
if (requestedSuites) {
|
||||
console.error(`📂 Filtering by suites: ${requestedSuites.join(', ')}`);
|
||||
}
|
||||
if (forceRunFiles.length > 0) {
|
||||
console.error(`🚀 Force-running tests in: ${forceRunFiles.join(', ')}`);
|
||||
}
|
||||
|
||||
const history = fetchNightlyHistory(LOOKBACK_COUNT);
|
||||
if (history.length === 0) {
|
||||
@@ -104,17 +116,37 @@ function main() {
|
||||
const volatileTests = [];
|
||||
const newTests = [];
|
||||
|
||||
// Add tests from force-run files that might not have history yet
|
||||
for (const file of forceRunFiles) {
|
||||
if (fs.existsSync(file)) {
|
||||
const content = fs.readFileSync(file, 'utf-8');
|
||||
const testNameRegex = /name:\s*['"](.*)['"]/g;
|
||||
let match;
|
||||
while ((match = testNameRegex.exec(content)) !== null) {
|
||||
const testName = match[1];
|
||||
if (!trustworthyTests.includes(testName)) {
|
||||
trustworthyTests.push(testName);
|
||||
trustworthyFiles.add(file);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const [testName, info] of Object.entries(testHistories)) {
|
||||
const dailyRates = info.dailyRates;
|
||||
const aggregateRate = info.totalPassed / info.totalRuns;
|
||||
const isForceRunFile =
|
||||
info.file && forceRunFiles.some((f) => info.file.includes(f));
|
||||
|
||||
// 1. Minimum data points required
|
||||
if (dailyRates.length < MIN_VALID_RUNS) {
|
||||
newTests.push(testName);
|
||||
// 1. Minimum data points required (unless force run)
|
||||
if (dailyRates.length < MIN_VALID_RUNS && !isForceRunFile) {
|
||||
if (!trustworthyTests.includes(testName)) {
|
||||
newTests.push(testName);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// 2. Trustworthy Criterion:
|
||||
// 2. Trustworthy Criterion (unless force run):
|
||||
// - Every single day must be above the floor (e.g. > 60%)
|
||||
// - The overall aggregate must be high-signal (e.g. > 80%)
|
||||
const isDailyStable = dailyRates.every(
|
||||
@@ -122,10 +154,10 @@ function main() {
|
||||
);
|
||||
const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
|
||||
|
||||
if (isDailyStable && isAggregateHighSignal) {
|
||||
if ((isDailyStable && isAggregateHighSignal) || isForceRunFile) {
|
||||
// Suite filtering logic
|
||||
let isFileAllowed = true;
|
||||
if (requestedSuites && !runAllStable) {
|
||||
if (requestedSuites && !runAllStable && !isForceRunFile) {
|
||||
if (info.file) {
|
||||
const match = info.file.match(/evals\/.*\.eval\.ts/);
|
||||
if (match && !allowedFiles.has(match[0])) {
|
||||
@@ -139,7 +171,9 @@ function main() {
|
||||
}
|
||||
|
||||
if (isFileAllowed) {
|
||||
trustworthyTests.push(testName);
|
||||
if (!trustworthyTests.includes(testName)) {
|
||||
trustworthyTests.push(testName);
|
||||
}
|
||||
if (info.file) {
|
||||
const match = info.file.match(/evals\/.*\.eval\.ts/);
|
||||
if (match) {
|
||||
|
||||
@@ -28,6 +28,7 @@ async function main() {
|
||||
let hasRegression = false;
|
||||
let detectionRationale = '';
|
||||
let affectedSuitesStr = '';
|
||||
let forceRunFilesStr = '';
|
||||
|
||||
console.log(
|
||||
`🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
|
||||
@@ -44,6 +45,12 @@ async function main() {
|
||||
|
||||
if (detection.affectedSuites && detection.affectedSuites.length > 0) {
|
||||
affectedSuitesStr = detection.affectedSuites.join(',');
|
||||
if (
|
||||
detection.modifiedTestFiles &&
|
||||
detection.modifiedTestFiles.length > 0
|
||||
) {
|
||||
forceRunFilesStr = detection.modifiedTestFiles.join(',');
|
||||
}
|
||||
detectionRationale = '### 🧪 Related Evaluation Rationale\n\n';
|
||||
detection.rationales.forEach((r) => {
|
||||
detectionRationale += `- ${r}\n`;
|
||||
@@ -76,8 +83,11 @@ async function main() {
|
||||
const suitesFlag = affectedSuitesStr
|
||||
? `--suites ${affectedSuitesStr}`
|
||||
: '';
|
||||
const forceRunFlag = forceRunFilesStr
|
||||
? `--force-run-files ${forceRunFilesStr}`
|
||||
: '';
|
||||
const output = execSync(
|
||||
`node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`,
|
||||
`node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag} ${forceRunFlag}`,
|
||||
{
|
||||
encoding: 'utf-8',
|
||||
stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr
|
||||
|
||||
Reference in New Issue
Block a user