From 573199c58f5a965f2b2c5b8ffd2c1a4748421dce Mon Sep 17 00:00:00 2001
From: Alisa Novikova <62909685+alisa-alisa@users.noreply.github.com>
Date: Wed, 8 Apr 2026 21:58:23 -0700
Subject: [PATCH] feat(evals): implement force-run for modified test files

---
 scripts/changed_prompt.js        | 31 ++++++---------------
 scripts/get_trustworthy_evals.js | 48 +++++++++++++++++++++++++++-----
 scripts/run_eval_regression.js   | 12 +++++++-
 3 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/scripts/changed_prompt.js b/scripts/changed_prompt.js
index 4c76e90b38..a58e5e10f1 100644
--- a/scripts/changed_prompt.js
+++ b/scripts/changed_prompt.js
@@ -20,6 +20,7 @@
 
 import { execSync } from 'node:child_process';
 import fs from 'node:fs';
+import { minimatch } from 'minimatch';
 
 const CORE_STEERING_PATHS = [
   'packages/core/src/prompts/',
@@ -36,25 +37,6 @@ const STEERING_SIGNATURES = [
   "kind: 'local'",
 ];
 
-function minimatch(file, pattern) {
-  if (pattern.endsWith('/**')) {
-    const prefix = pattern.slice(0, -3);
-    return file.startsWith(prefix);
-  }
-  if (pattern.includes('*')) {
-    const regex = new RegExp(
-      '^' +
-        pattern
-          .replace(/\./g, '\\.')
-          .replace(/\*\*/g, '.*')
-          .replace(/\*/g, '[^/]*') +
-        '$',
-    );
-    return regex.test(file);
-  }
-  return file === pattern;
-}
-
 function main() {
   const targetBranch = process.env.GITHUB_BASE_REF || 'main';
   const verbose = process.argv.includes('--verbose');
@@ -122,6 +104,7 @@ function main() {
     const reasons = [];
     const affectedSuites = new Set();
     const rationales = [];
+    const modifiedTestFiles = [];
 
     // Load suites for --related mode
     let suitesConfig = null;
@@ -148,6 +131,7 @@ function main() {
       ) {
         detected = true;
         reasons.push(`Matched test file: ${file}`);
+        modifiedTestFiles.push(file);
       }
 
       // Related suite detection
@@ -157,9 +141,11 @@ function main() {
 
           if (suite.patterns.some((pattern) => minimatch(file, pattern))) {
             affectedSuites.add(suiteName);
-            rationales.push(
-              `Testing **${suiteName}** because **${file}** was modified.`,
-            );
+            const isTestFile = file.endsWith('.eval.ts');
+            const rationale = isTestFile
+              ? `Force-testing all tests in **${file}** (part of **${suiteName}** suite) because the file was modified.`
+              : `Testing **${suiteName}** because **${file}** was modified.`;
+            rationales.push(rationale);
           }
         }
       }
@@ -214,6 +200,7 @@ function main() {
             reasons,
             affectedSuites: Array.from(affectedSuites),
             rationales,
+            modifiedTestFiles,
           },
           null,
           2,
diff --git a/scripts/get_trustworthy_evals.js b/scripts/get_trustworthy_evals.js
index ae25550134..6997269921 100644
--- a/scripts/get_trustworthy_evals.js
+++ b/scripts/get_trustworthy_evals.js
@@ -40,10 +40,22 @@ function main() {
       .map((s) => s.trim());
   }
 
+  // Parse --force-run-files; blanks are dropped since '' would match every file
+  const forceRunArgIndex = process.argv.indexOf('--force-run-files');
+  const forceRunArg =
+    forceRunArgIndex === -1 ? '' : process.argv[forceRunArgIndex + 1] || '';
+  const forceRunFiles = forceRunArg
+    .split(',')
+    .map((f) => f.trim())
+    .filter(Boolean);
+
   console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
   if (requestedSuites) {
     console.error(`📂 Filtering by suites: ${requestedSuites.join(', ')}`);
   }
+  if (forceRunFiles.length > 0) {
+    console.error(`🚀 Force-running tests in: ${forceRunFiles.join(', ')}`);
+  }
 
   const history = fetchNightlyHistory(LOOKBACK_COUNT);
   if (history.length === 0) {
@@ -104,17 +116,37 @@ function main() {
   const volatileTests = [];
   const newTests = [];
 
+  // Add tests from force-run files that might not have history yet. A name ends
+  // at the quote that opened it, so later quoted values on the line are ignored.
+  for (const file of forceRunFiles) {
+    if (fs.existsSync(file)) {
+      const content = fs.readFileSync(file, 'utf-8');
+      const testNameRegex = /name:\s*(['"])((?:\\.|(?!\1).)*)\1/g;
+      for (const match of content.matchAll(testNameRegex)) {
+        const testName = match[2];
+        if (!trustworthyTests.includes(testName)) {
+          trustworthyTests.push(testName);
+          trustworthyFiles.add(file);
+        }
+      }
+    }
+  }
+
   for (const [testName, info] of Object.entries(testHistories)) {
     const dailyRates = info.dailyRates;
     const aggregateRate = info.totalPassed / info.totalRuns;
+    const isForceRunFile =
+      info.file && forceRunFiles.some((f) => info.file.includes(f));
 
-    // 1. Minimum data points required
-    if (dailyRates.length < MIN_VALID_RUNS) {
-      newTests.push(testName);
+    // 1. Minimum data points required (unless force run)
+    if (dailyRates.length < MIN_VALID_RUNS && !isForceRunFile) {
+      if (!trustworthyTests.includes(testName)) {
+        newTests.push(testName);
+      }
       continue;
     }
 
-    // 2. Trustworthy Criterion:
+    // 2. Trustworthy Criterion (unless force run):
     // - Every single day must be above the floor (e.g. > 60%)
     // - The overall aggregate must be high-signal (e.g. > 80%)
     const isDailyStable = dailyRates.every(
@@ -122,10 +154,10 @@ function main() {
     );
     const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
 
-    if (isDailyStable && isAggregateHighSignal) {
+    if ((isDailyStable && isAggregateHighSignal) || isForceRunFile) {
       // Suite filtering logic
       let isFileAllowed = true;
-      if (requestedSuites && !runAllStable) {
+      if (requestedSuites && !runAllStable && !isForceRunFile) {
         if (info.file) {
           const match = info.file.match(/evals\/.*\.eval\.ts/);
           if (match && !allowedFiles.has(match[0])) {
@@ -139,7 +171,9 @@ function main() {
       }
 
       if (isFileAllowed) {
-        trustworthyTests.push(testName);
+        if (!trustworthyTests.includes(testName)) {
+          trustworthyTests.push(testName);
+        }
         if (info.file) {
           const match = info.file.match(/evals\/.*\.eval\.ts/);
           if (match) {
diff --git a/scripts/run_eval_regression.js b/scripts/run_eval_regression.js
index 03cdde90c1..6efc490973 100644
--- a/scripts/run_eval_regression.js
+++ b/scripts/run_eval_regression.js
@@ -28,6 +28,7 @@ async function main() {
   let hasRegression = false;
   let detectionRationale = '';
   let affectedSuitesStr = '';
+  let forceRunFilesStr = '';
 
   console.log(
     `🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
@@ -44,6 +45,12 @@ async function main() {
 
       if (detection.affectedSuites && detection.affectedSuites.length > 0) {
         affectedSuitesStr = detection.affectedSuites.join(',');
+        // These names come from the change set and end up inside a shell command
+        // string, so only paths made of shell-inert characters are forwarded.
+        const candidateFiles = detection.modifiedTestFiles || [];
+        forceRunFilesStr = candidateFiles
+          .filter((f) => /^[\w./@+-]+$/.test(f))
+          .join(',');
         detectionRationale = '### 🧪 Related Evaluation Rationale\n\n';
         detection.rationales.forEach((r) => {
           detectionRationale += `- ${r}\n`;
@@ -76,8 +83,11 @@ async function main() {
       const suitesFlag = affectedSuitesStr
         ? `--suites ${affectedSuitesStr}`
         : '';
+      const forceRunFlag = forceRunFilesStr
+        ? `--force-run-files ${forceRunFilesStr}`
+        : '';
       const output = execSync(
-        `node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag}`,
+        `node scripts/get_trustworthy_evals.js "${model}" ${suitesFlag} ${forceRunFlag}`,
         {
           encoding: 'utf-8',
           stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr