mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-06-15 05:47:18 -07:00
fix(evals): correctly handle new failing test files and fix lint error
This commit is contained in:
@@ -183,33 +183,45 @@ async function verifyBaseline(testName, results, files, model) {
|
||||
let baselinePasses = 0;
|
||||
let baselineTotal = 0;
|
||||
|
||||
while (baselinePasses === 0 && baselineTotal < 3) {
|
||||
baselineTotal++;
|
||||
console.log(` Baseline Attempt ${baselineTotal}...`);
|
||||
const baselineRun = runTests(files, escapeRegex(testName), model);
|
||||
if (findAssertion(baselineRun, testName)?.status === 'passed') {
|
||||
baselinePasses++;
|
||||
console.log(` ✅ Baseline Attempt ${baselineTotal} passed.`);
|
||||
// Determine the primary file for this test
|
||||
const testFile = results[testName].file
|
||||
? path.relative(process.cwd(), results[testName].file)
|
||||
: null;
|
||||
|
||||
if (testFile && !fs.existsSync(testFile)) {
|
||||
console.log(
|
||||
` ℹ️ Test file **${testFile}** does not exist on 'main'. Marking as NEW CONFIRMED REGRESSION.`,
|
||||
);
|
||||
results[testName].status = 'regression';
|
||||
} else {
|
||||
while (baselinePasses === 0 && baselineTotal < 3) {
|
||||
baselineTotal++;
|
||||
console.log(` Baseline Attempt ${baselineTotal}...`);
|
||||
const baselineRun = runTests(files, escapeRegex(testName), model);
|
||||
if (findAssertion(baselineRun, testName)?.status === 'passed') {
|
||||
baselinePasses++;
|
||||
console.log(` ✅ Baseline Attempt ${baselineTotal} passed.`);
|
||||
} else {
|
||||
console.log(` ❌ Baseline Attempt ${baselineTotal} failed.`);
|
||||
}
|
||||
}
|
||||
|
||||
if (baselinePasses === 0) {
|
||||
console.log(
|
||||
` ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`,
|
||||
);
|
||||
results[testName].status = 'pre-existing';
|
||||
results[testName].passed = results[testName].total; // Clear for report
|
||||
} else {
|
||||
console.log(` ❌ Baseline Attempt ${baselineTotal} failed.`);
|
||||
console.log(
|
||||
` ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`,
|
||||
);
|
||||
results[testName].status = 'regression';
|
||||
}
|
||||
}
|
||||
|
||||
execSync('git checkout -', { stdio: 'inherit' });
|
||||
if (hasStash) execSync('git stash pop', { stdio: 'inherit' });
|
||||
|
||||
if (baselinePasses === 0) {
|
||||
console.log(
|
||||
` ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`,
|
||||
);
|
||||
results[testName].status = 'pre-existing';
|
||||
results[testName].passed = results[testName].total; // Clear for report
|
||||
} else {
|
||||
console.log(
|
||||
` ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`,
|
||||
);
|
||||
results[testName].status = 'regression';
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(` ❌ Failed to verify baseline: ${error.message}`);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user