feat(evals): implement force-run for modified test files

This commit is contained in:
Alisa Novikova
2026-04-08 21:58:23 -07:00
parent a591cd9f8f
commit 573199c58f
3 changed files with 61 additions and 30 deletions
+9 -22
View File
@@ -20,6 +20,7 @@
import { execSync } from 'node:child_process';
import fs from 'node:fs';
import { minimatch } from 'minimatch';
const CORE_STEERING_PATHS = [
'packages/core/src/prompts/',
@@ -36,25 +37,6 @@ const STEERING_SIGNATURES = [
"kind: 'local'",
];
/**
 * Minimal glob matcher for repository-relative file paths.
 *
 * Supported syntax:
 *   - `dir/**`  matches `dir` itself and anything beneath `dir/`
 *   - `**`      matches any run of characters, including `/`
 *   - `*`       matches any run of characters within one path segment
 *   - anything else is compared literally
 *
 * @param {string} file - Path to test (forward-slash separated).
 * @param {string} pattern - Glob pattern to test against.
 * @returns {boolean} True when `file` matches `pattern`.
 */
function minimatch(file, pattern) {
  if (pattern.endsWith('/**')) {
    const prefix = pattern.slice(0, -3);
    // Fast path for a literal directory prefix. Require a path-segment
    // boundary so `packages/core/**` does not match `packages/core-extra/x`.
    // Prefixes that themselves contain wildcards fall through to the regex.
    if (!prefix.includes('*')) {
      return file === prefix || file.startsWith(`${prefix}/`);
    }
  }

  if (!pattern.includes('*')) {
    return file === pattern;
  }

  // Translate in a single pass: chained replaces would re-process the `*`
  // emitted for `**` and turn `.*` into `.[^/]*`. Every other regex
  // metacharacter is escaped so it is matched literally.
  const source = pattern.replace(/\*\*|\*|[.+?^${}()|[\]\\]/g, (token) => {
    if (token === '**') {
      return '.*';
    }
    if (token === '*') {
      return '[^/]*';
    }
    return `\\${token}`;
  });

  return new RegExp(`^${source}$`).test(file);
}
function main() {
const targetBranch = process.env.GITHUB_BASE_REF || 'main';
const verbose = process.argv.includes('--verbose');
@@ -122,6 +104,7 @@ function main() {
const reasons = [];
const affectedSuites = new Set();
const rationales = [];
const modifiedTestFiles = [];
// Load suites for --related mode
let suitesConfig = null;
@@ -148,6 +131,7 @@ function main() {
) {
detected = true;
reasons.push(`Matched test file: ${file}`);
modifiedTestFiles.push(file);
}
// Related suite detection
@@ -157,9 +141,11 @@ function main() {
if (suite.patterns.some((pattern) => minimatch(file, pattern))) {
affectedSuites.add(suiteName);
rationales.push(
`Testing **${suiteName}** because **${file}** was modified.`,
);
const isTestFile = file.endsWith('.eval.ts');
const rationale = isTestFile
? `Force-testing all tests in **${file}** (part of **${suiteName}** suite) because the file was modified.`
: `Testing **${suiteName}** because **${file}** was modified.`;
rationales.push(rationale);
}
}
}
@@ -214,6 +200,7 @@ function main() {
reasons,
affectedSuites: Array.from(affectedSuites),
rationales,
modifiedTestFiles,
},
null,
2,
+41 -7
View File
@@ -40,10 +40,22 @@ function main() {
.map((s) => s.trim());
}
// Parse --force-run-files argument
const forceRunArgIndex = process.argv.indexOf('--force-run-files');
let forceRunFiles = [];
if (forceRunArgIndex !== -1 && process.argv[forceRunArgIndex + 1]) {
forceRunFiles = process.argv[forceRunArgIndex + 1]
.split(',')
.map((f) => f.trim());
}
console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
if (requestedSuites) {
console.error(`📂 Filtering by suites: ${requestedSuites.join(', ')}`);
}
if (forceRunFiles.length > 0) {
console.error(`🚀 Force-running tests in: ${forceRunFiles.join(', ')}`);
}
const history = fetchNightlyHistory(LOOKBACK_COUNT);
if (history.length === 0) {
@@ -104,17 +116,37 @@ function main() {
const volatileTests = [];
const newTests = [];
// Add tests from force-run files that might not have history yet
for (const file of forceRunFiles) {
if (fs.existsSync(file)) {
const content = fs.readFileSync(file, 'utf-8');
const testNameRegex = /name:\s*['"](.*)['"]/g;
let match;
while ((match = testNameRegex.exec(content)) !== null) {
const testName = match[1];
if (!trustworthyTests.includes(testName)) {
trustworthyTests.push(testName);
trustworthyFiles.add(file);
}
}
}
}
for (const [testName, info] of Object.entries(testHistories)) {
const dailyRates = info.dailyRates;
const aggregateRate = info.totalPassed / info.totalRuns;
const isForceRunFile =
info.file && forceRunFiles.some((f) => info.file.includes(f));
// 1. Minimum data points required
if (dailyRates.length < MIN_VALID_RUNS) {
newTests.push(testName);
// 1. Minimum data points required (unless force run)
if (dailyRates.length < MIN_VALID_RUNS && !isForceRunFile) {
if (!trustworthyTests.includes(testName)) {
newTests.push(testName);
}
continue;
}
// 2. Trustworthy Criterion:
// 2. Trustworthy Criterion (unless force run):
// - Every single day must be above the floor (e.g. > 60%)
// - The overall aggregate must be high-signal (e.g. > 80%)
const isDailyStable = dailyRates.every(
@@ -122,10 +154,10 @@ function main() {
);
const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
if (isDailyStable && isAggregateHighSignal) {
if ((isDailyStable && isAggregateHighSignal) || isForceRunFile) {
// Suite filtering logic
let isFileAllowed = true;
if (requestedSuites && !runAllStable) {
if (requestedSuites && !runAllStable && !isForceRunFile) {
if (info.file) {
const match = info.file.match(/evals\/.*\.eval\.ts/);
if (match && !allowedFiles.has(match[0])) {
@@ -139,7 +171,9 @@ function main() {
}
if (isFileAllowed) {
trustworthyTests.push(testName);
if (!trustworthyTests.includes(testName)) {
trustworthyTests.push(testName);
}
if (info.file) {
const match = info.file.match(/evals\/.*\.eval\.ts/);
if (match) {
+11 -1
View File
@@ -28,6 +28,7 @@ async function main() {
let hasRegression = false;
let detectionRationale = '';
let affectedSuitesStr = '';
let forceRunFilesStr = '';
console.log(
`🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
@@ -44,6 +45,12 @@ async function main() {
if (detection.affectedSuites && detection.affectedSuites.length > 0) {
affectedSuitesStr = detection.affectedSuites.join(',');
if (
detection.modifiedTestFiles &&
detection.modifiedTestFiles.length > 0
) {
forceRunFilesStr = detection.modifiedTestFiles.join(',');
}
detectionRationale = '### 🧪 Related Evaluation Rationale\n\n';
detection.rationales.forEach((r) => {
detectionRationale += `- ${r}\n`;
@@ -76,8 +83,11 @@ async function main() {
const suitesFlag = affectedSuitesStr
? `--suites ${affectedSuitesStr}`
: '';
const forceRunFlag = forceRunFilesStr
? `--force-run-files ${forceRunFilesStr}`
: '';
const output = execSync(
`node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`,
`node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag} ${forceRunFlag}`,
{
encoding: 'utf-8',
stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr