From 90ff8feb4c33a86af3f2653a5c8941e0029335d8 Mon Sep 17 00:00:00 2001 From: Alisa Novikova <62909685+alisa-alisa@users.noreply.github.com> Date: Wed, 8 Apr 2026 22:25:31 -0700 Subject: [PATCH] fix(evals): correctly handle new failing test files and fix lint error --- scripts/run_regression_check.js | 54 ++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/scripts/run_regression_check.js b/scripts/run_regression_check.js index 1250671c30..d92585fd49 100644 --- a/scripts/run_regression_check.js +++ b/scripts/run_regression_check.js @@ -183,33 +183,45 @@ async function verifyBaseline(testName, results, files, model) { let baselinePasses = 0; let baselineTotal = 0; - while (baselinePasses === 0 && baselineTotal < 3) { - baselineTotal++; - console.log(` Baseline Attempt ${baselineTotal}...`); - const baselineRun = runTests(files, escapeRegex(testName), model); - if (findAssertion(baselineRun, testName)?.status === 'passed') { - baselinePasses++; - console.log(` ✅ Baseline Attempt ${baselineTotal} passed.`); + // Determine the primary file for this test + const testFile = results[testName].file + ? path.relative(process.cwd(), results[testName].file) + : null; + + if (testFile && !fs.existsSync(testFile)) { + console.log( + ` ℹ️ Test file **${testFile}** does not exist on 'main'. Marking as NEW CONFIRMED REGRESSION.`, + ); + results[testName].status = 'regression'; + } else { + while (baselinePasses === 0 && baselineTotal < 3) { + baselineTotal++; + console.log(` Baseline Attempt ${baselineTotal}...`); + const baselineRun = runTests(files, escapeRegex(testName), model); + if (findAssertion(baselineRun, testName)?.status === 'passed') { + baselinePasses++; + console.log(` ✅ Baseline Attempt ${baselineTotal} passed.`); + } else { + console.log(` ❌ Baseline Attempt ${baselineTotal} failed.`); + } + } + + if (baselinePasses === 0) { + console.log( + ` ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`, + ); + results[testName].status = 'pre-existing'; + results[testName].passed = results[testName].total; // Clear for report } else { - console.log(` ❌ Baseline Attempt ${baselineTotal} failed.`); + console.log( + ` ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`, + ); + results[testName].status = 'regression'; } } execSync('git checkout -', { stdio: 'inherit' }); if (hasStash) execSync('git stash pop', { stdio: 'inherit' }); - - if (baselinePasses === 0) { - console.log( - ` ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`, - ); - results[testName].status = 'pre-existing'; - results[testName].passed = results[testName].total; // Clear for report - } else { - console.log( - ` ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`, - ); - results[testName].status = 'regression'; - } } catch (error) { console.error(` ❌ Failed to verify baseline: ${error.message}`);