From 573199c58f5a965f2b2c5b8ffd2c1a4748421dce Mon Sep 17 00:00:00 2001 From: Alisa Novikova <62909685+alisa-alisa@users.noreply.github.com> Date: Wed, 8 Apr 2026 21:58:23 -0700 Subject: [PATCH] feat(evals): implement force-run for modified test files --- scripts/changed_prompt.js | 31 ++++++--------------- scripts/get_trustworthy_evals.js | 48 +++++++++++++++++++++++++++----- scripts/run_eval_regression.js | 12 +++++++- 3 files changed, 61 insertions(+), 30 deletions(-) diff --git a/scripts/changed_prompt.js b/scripts/changed_prompt.js index 4c76e90b38..a58e5e10f1 100644 --- a/scripts/changed_prompt.js +++ b/scripts/changed_prompt.js @@ -20,6 +20,7 @@ import { execSync } from 'node:child_process'; import fs from 'node:fs'; +import { minimatch } from 'minimatch'; const CORE_STEERING_PATHS = [ 'packages/core/src/prompts/', @@ -36,25 +37,6 @@ const STEERING_SIGNATURES = [ "kind: 'local'", ]; -function minimatch(file, pattern) { - if (pattern.endsWith('/**')) { - const prefix = pattern.slice(0, -3); - return file.startsWith(prefix); - } - if (pattern.includes('*')) { - const regex = new RegExp( - '^' + - pattern - .replace(/\./g, '\\.') - .replace(/\*\*/g, '.*') - .replace(/\*/g, '[^/]*') + - '$', - ); - return regex.test(file); - } - return file === pattern; -} - function main() { const targetBranch = process.env.GITHUB_BASE_REF || 'main'; const verbose = process.argv.includes('--verbose'); @@ -122,6 +104,7 @@ function main() { const reasons = []; const affectedSuites = new Set(); const rationales = []; + const modifiedTestFiles = []; // Load suites for --related mode let suitesConfig = null; @@ -148,6 +131,7 @@ function main() { ) { detected = true; reasons.push(`Matched test file: ${file}`); + modifiedTestFiles.push(file); } // Related suite detection @@ -157,9 +141,11 @@ function main() { if (suite.patterns.some((pattern) => minimatch(file, pattern))) { affectedSuites.add(suiteName); - rationales.push( - `Testing **${suiteName}** because **${file}** was modified.`, - ); + const isTestFile = file.endsWith('.eval.ts'); + const rationale = isTestFile + ? `Force-testing all tests in **${file}** (part of **${suiteName}** suite) because the file was modified.` + : `Testing **${suiteName}** because **${file}** was modified.`; + rationales.push(rationale); } } } @@ -214,6 +200,7 @@ function main() { reasons, affectedSuites: Array.from(affectedSuites), rationales, + modifiedTestFiles, }, null, 2, diff --git a/scripts/get_trustworthy_evals.js b/scripts/get_trustworthy_evals.js index ae25550134..6997269921 100644 --- a/scripts/get_trustworthy_evals.js +++ b/scripts/get_trustworthy_evals.js @@ -40,10 +40,22 @@ function main() { .map((s) => s.trim()); } + // Parse --force-run-files argument + const forceRunArgIndex = process.argv.indexOf('--force-run-files'); + let forceRunFiles = []; + if (forceRunArgIndex !== -1 && process.argv[forceRunArgIndex + 1]) { + forceRunFiles = process.argv[forceRunArgIndex + 1] + .split(',') + .map((f) => f.trim()); + } + console.error(`๐Ÿ” Identifying trustworthy evals for model: ${targetModel}`); if (requestedSuites) { console.error(`๐Ÿ“‚ Filtering by suites: ${requestedSuites.join(', ')}`); } + if (forceRunFiles.length > 0) { + console.error(`๐Ÿš€ Force-running tests in: ${forceRunFiles.join(', ')}`); + } const history = fetchNightlyHistory(LOOKBACK_COUNT); if (history.length === 0) { @@ -104,17 +116,37 @@ function main() { const volatileTests = []; const newTests = []; + // Add tests from force-run files that might not have history yet + for (const file of forceRunFiles) { + if (fs.existsSync(file)) { + const content = fs.readFileSync(file, 'utf-8'); + const testNameRegex = /name:\s*['"](.*)['"]/g; + let match; + while ((match = testNameRegex.exec(content)) !== null) { + const testName = match[1]; + if (!trustworthyTests.includes(testName)) { + trustworthyTests.push(testName); + trustworthyFiles.add(file); + } + } + } + } + for (const [testName, info] of Object.entries(testHistories)) { const dailyRates = info.dailyRates; const aggregateRate = info.totalPassed / info.totalRuns; + const isForceRunFile = + info.file && forceRunFiles.some((f) => info.file.includes(f)); - // 1. Minimum data points required - if (dailyRates.length < MIN_VALID_RUNS) { - newTests.push(testName); + // 1. Minimum data points required (unless force run) + if (dailyRates.length < MIN_VALID_RUNS && !isForceRunFile) { + if (!trustworthyTests.includes(testName)) { + newTests.push(testName); + } continue; } - // 2. Trustworthy Criterion: + // 2. Trustworthy Criterion (unless force run): // - Every single day must be above the floor (e.g. > 60%) // - The overall aggregate must be high-signal (e.g. > 80%) const isDailyStable = dailyRates.every( @@ -122,10 +154,10 @@ function main() { ); const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD; - if (isDailyStable && isAggregateHighSignal) { + if ((isDailyStable && isAggregateHighSignal) || isForceRunFile) { // Suite filtering logic let isFileAllowed = true; - if (requestedSuites && !runAllStable) { + if (requestedSuites && !runAllStable && !isForceRunFile) { if (info.file) { const match = info.file.match(/evals\/.*\.eval\.ts/); if (match && !allowedFiles.has(match[0])) { @@ -139,7 +171,9 @@ function main() { } if (isFileAllowed) { - trustworthyTests.push(testName); + if (!trustworthyTests.includes(testName)) { + trustworthyTests.push(testName); + } if (info.file) { const match = info.file.match(/evals\/.*\.eval\.ts/); if (match) { diff --git a/scripts/run_eval_regression.js b/scripts/run_eval_regression.js index 03cdde90c1..6efc490973 100644 --- a/scripts/run_eval_regression.js +++ b/scripts/run_eval_regression.js @@ -28,6 +28,7 @@ async function main() { let hasRegression = false; let detectionRationale = ''; let affectedSuitesStr = ''; + let forceRunFilesStr = ''; console.log( `๐Ÿš€ Starting evaluation orchestration for models: ${models.join(', ')}`, @@ -44,6 +45,12 @@ async function main() { if (detection.affectedSuites && detection.affectedSuites.length > 0) { affectedSuitesStr = detection.affectedSuites.join(','); + if ( + detection.modifiedTestFiles && + detection.modifiedTestFiles.length > 0 + ) { + forceRunFilesStr = detection.modifiedTestFiles.join(','); + } detectionRationale = '### ๐Ÿงช Related Evaluation Rationale\n\n'; detection.rationales.forEach((r) => { detectionRationale += `- ${r}\n`; @@ -76,8 +83,11 @@ async function main() { const suitesFlag = affectedSuitesStr ? `--suites ${affectedSuitesStr}` : ''; + const forceRunFlag = forceRunFilesStr + ? `--force-run-files ${forceRunFilesStr}` + : ''; const output = execSync( - `node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`, + `node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag} ${forceRunFlag}`, { encoding: 'utf-8', stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr