mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-26 21:14:35 -07:00
feat: implement high-signal PR regression check for evaluations (#23937)
This commit is contained in:
@@ -1,69 +0,0 @@
|
|||||||
name: 'Evals: PR Guidance'
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
paths:
|
|
||||||
- 'packages/core/src/**/*.ts'
|
|
||||||
- '!**/*.test.ts'
|
|
||||||
- '!**/*.test.tsx'
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
pull-requests: 'write'
|
|
||||||
contents: 'read'
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
provide-guidance:
|
|
||||||
name: 'Model Steering Guidance'
|
|
||||||
runs-on: 'ubuntu-latest'
|
|
||||||
if: "github.repository == 'google-gemini/gemini-cli'"
|
|
||||||
steps:
|
|
||||||
- name: 'Checkout'
|
|
||||||
uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
|
|
||||||
- name: 'Set up Node.js'
|
|
||||||
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4.4.0
|
|
||||||
with:
|
|
||||||
node-version-file: '.nvmrc'
|
|
||||||
cache: 'npm'
|
|
||||||
|
|
||||||
- name: 'Detect Steering Changes'
|
|
||||||
id: 'detect'
|
|
||||||
run: |
|
|
||||||
STEERING_DETECTED=$(node scripts/changed_prompt.js --steering-only)
|
|
||||||
echo "STEERING_DETECTED=$STEERING_DETECTED" >> "$GITHUB_OUTPUT"
|
|
||||||
|
|
||||||
- name: 'Analyze PR Content'
|
|
||||||
if: "steps.detect.outputs.STEERING_DETECTED == 'true'"
|
|
||||||
id: 'analysis'
|
|
||||||
env:
|
|
||||||
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
|
||||||
run: |
|
|
||||||
# Check for behavioral eval changes
|
|
||||||
EVAL_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD | grep "^evals/" || true)
|
|
||||||
if [ -z "$EVAL_CHANGES" ]; then
|
|
||||||
echo "MISSING_EVALS=true" >> "$GITHUB_OUTPUT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check if user is a maintainer (has write/admin access)
|
|
||||||
USER_PERMISSION=$(gh api repos/${{ github.repository }}/collaborators/${{ github.actor }}/permission --jq '.permission')
|
|
||||||
if [[ "$USER_PERMISSION" == "admin" || "$USER_PERMISSION" == "write" ]]; then
|
|
||||||
echo "IS_MAINTAINER=true" >> "$GITHUB_OUTPUT"
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: 'Post Guidance Comment'
|
|
||||||
if: "steps.detect.outputs.STEERING_DETECTED == 'true'"
|
|
||||||
uses: 'thollander/actions-comment-pull-request@65f9e5c9a1f2cd378bd74b2e057c9736982a8e74' # ratchet:thollander/actions-comment-pull-request@v3
|
|
||||||
with:
|
|
||||||
comment-tag: 'eval-guidance-bot'
|
|
||||||
message: |
|
|
||||||
### 🧠 Model Steering Guidance
|
|
||||||
|
|
||||||
This PR modifies files that affect the model's behavior (prompts, tools, or instructions).
|
|
||||||
|
|
||||||
${{ steps.analysis.outputs.MISSING_EVALS == 'true' && '- ⚠️ **Consider adding Evals:** No behavioral evaluations (`evals/*.eval.ts`) were added or updated in this PR. Consider adding a test case to verify the new behavior and prevent regressions.' || '' }}
|
|
||||||
${{ steps.analysis.outputs.IS_MAINTAINER == 'true' && '- 🚀 **Maintainer Reminder:** Please ensure that these changes do not regress results on benchmark evals before merging.' || '' }}
|
|
||||||
|
|
||||||
---
|
|
||||||
*This is an automated guidance message triggered by steering logic signatures.*
|
|
||||||
@@ -0,0 +1,137 @@
|
|||||||
|
name: 'Evals: PR Evaluation & Regression'
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
types: ['opened', 'synchronize', 'reopened', 'ready_for_review']
|
||||||
|
paths:
|
||||||
|
- 'packages/core/src/prompts/**'
|
||||||
|
- 'packages/core/src/tools/**'
|
||||||
|
- 'packages/core/src/agents/**'
|
||||||
|
- 'evals/**'
|
||||||
|
- '!**/*.test.ts'
|
||||||
|
- '!**/*.test.tsx'
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
# Prevents multiple runs for the same PR simultaneously (saves tokens)
|
||||||
|
concurrency:
|
||||||
|
group: '${{ github.workflow }}-${{ github.head_ref || github.ref }}'
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
pull-requests: 'write'
|
||||||
|
contents: 'read'
|
||||||
|
actions: 'read'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
pr-evaluation:
|
||||||
|
name: 'Evaluate Steering & Regressions'
|
||||||
|
runs-on: 'gemini-cli-ubuntu-16-core'
|
||||||
|
if: "github.repository == 'google-gemini/gemini-cli' && (github.event_name != 'pull_request' || (github.event.pull_request.draft == false && github.event.pull_request.head.repo.full_name == github.repository))"
|
||||||
|
# External contributors' PRs will wait for approval in this environment
|
||||||
|
environment: |-
|
||||||
|
${{ (github.event.pull_request.head.repo.full_name == github.repository) && 'internal' || 'external-evals' }}
|
||||||
|
env:
|
||||||
|
# CENTRALIZED MODEL LIST
|
||||||
|
MODEL_LIST: 'gemini-3-flash-preview'
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: 'Checkout'
|
||||||
|
uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: 'Set up Node.js'
|
||||||
|
uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4.4.0
|
||||||
|
with:
|
||||||
|
node-version-file: '.nvmrc'
|
||||||
|
cache: 'npm'
|
||||||
|
|
||||||
|
- name: 'Install dependencies'
|
||||||
|
run: 'npm ci'
|
||||||
|
|
||||||
|
- name: 'Build project'
|
||||||
|
run: 'npm run build'
|
||||||
|
|
||||||
|
- name: 'Detect Steering Changes'
|
||||||
|
id: 'detect'
|
||||||
|
run: |
|
||||||
|
SHOULD_RUN=$(node scripts/changed_prompt.js)
|
||||||
|
STEERING_DETECTED=$(node scripts/changed_prompt.js --steering-only)
|
||||||
|
echo "SHOULD_RUN=$SHOULD_RUN" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "STEERING_DETECTED=$STEERING_DETECTED" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: 'Analyze PR Content (Guidance)'
|
||||||
|
if: "steps.detect.outputs.STEERING_DETECTED == 'true'"
|
||||||
|
id: 'analysis'
|
||||||
|
env:
|
||||||
|
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
||||||
|
run: |
|
||||||
|
# Check for behavioral eval changes
|
||||||
|
EVAL_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD | grep "^evals/" || true)
|
||||||
|
if [ -z "$EVAL_CHANGES" ]; then
|
||||||
|
echo "MISSING_EVALS=true" >> "$GITHUB_OUTPUT"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check if user is a maintainer
|
||||||
|
USER_PERMISSION=$(gh api repos/${{ github.repository }}/collaborators/${{ github.actor }}/permission --jq '.permission')
|
||||||
|
if [[ "$USER_PERMISSION" == "admin" || "$USER_PERMISSION" == "write" ]]; then
|
||||||
|
echo "IS_MAINTAINER=true" >> "$GITHUB_OUTPUT"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: 'Execute Regression Check'
|
||||||
|
if: "steps.detect.outputs.SHOULD_RUN == 'true'"
|
||||||
|
env:
|
||||||
|
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
|
||||||
|
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
||||||
|
MODEL_LIST: '${{ env.MODEL_LIST }}'
|
||||||
|
run: |
|
||||||
|
# Run the regression check loop. The script saves the report to a file.
|
||||||
|
node scripts/run_eval_regression.js
|
||||||
|
|
||||||
|
# Use the generated report file if it exists
|
||||||
|
if [[ -f eval_regression_report.md ]]; then
|
||||||
|
echo "REPORT_FILE=eval_regression_report.md" >> "$GITHUB_ENV"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: 'Post or Update PR Comment'
|
||||||
|
if: "always() && steps.detect.outputs.STEERING_DETECTED == 'true'"
|
||||||
|
env:
|
||||||
|
GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
||||||
|
run: |
|
||||||
|
# 1. Build the full comment body
|
||||||
|
{
|
||||||
|
if [[ -f eval_regression_report.md ]]; then
|
||||||
|
cat eval_regression_report.md
|
||||||
|
echo ""
|
||||||
|
fi
|
||||||
|
echo "### 🧠 Model Steering Guidance"
|
||||||
|
echo ""
|
||||||
|
echo "This PR modifies files that affect the model's behavior (prompts, tools, or instructions)."
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
if [[ "${{ steps.analysis.outputs.MISSING_EVALS }}" == "true" ]]; then
|
||||||
|
echo "- ⚠️ **Consider adding Evals:** No behavioral evaluations (\`evals/*.eval.ts\`) were added or updated in this PR. Consider [adding a test case](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#creating-an-evaluation) to verify the new behavior and prevent regressions."
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "${{ steps.analysis.outputs.IS_MAINTAINER }}" == "true" ]]; then
|
||||||
|
echo "- 🚀 **Maintainer Reminder:** Please ensure that these changes do not regress results on benchmark evals before merging."
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "---"
|
||||||
|
echo "*This is an automated guidance message triggered by steering logic signatures.*"
|
||||||
|
echo "<!-- eval-pr-report -->"
|
||||||
|
} > full_comment.md
|
||||||
|
|
||||||
|
# 2. Find if a comment with our unique tag already exists
|
||||||
|
# We extract the numeric ID from the URL to ensure compatibility with the REST API
|
||||||
|
COMMENT_ID=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("<!-- eval-pr-report -->")) | .url' | grep -oE "[0-9]+$" | head -n 1)
|
||||||
|
|
||||||
|
# 3. Update or Create the comment
|
||||||
|
if [ -n "$COMMENT_ID" ]; then
|
||||||
|
echo "Updating existing comment $COMMENT_ID via API..."
|
||||||
|
gh api -X PATCH "repos/${{ github.repository }}/issues/comments/$COMMENT_ID" -F body=@full_comment.md
|
||||||
|
else
|
||||||
|
echo "Creating new PR comment..."
|
||||||
|
gh pr comment ${{ github.event.pull_request.number }} --body-file full_comment.md
|
||||||
|
fi
|
||||||
@@ -212,6 +212,56 @@ The nightly workflow executes the full evaluation suite multiple times
|
|||||||
(currently 3 attempts) to account for non-determinism. These results are
|
(currently 3 attempts) to account for non-determinism. These results are
|
||||||
aggregated into a **Nightly Summary** attached to the workflow run.
|
aggregated into a **Nightly Summary** attached to the workflow run.
|
||||||
|
|
||||||
|
## Regression Check Scripts
|
||||||
|
|
||||||
|
The project includes several scripts to automate high-signal regression checking
|
||||||
|
in Pull Requests. These can also be run locally for debugging.
|
||||||
|
|
||||||
|
- **`scripts/get_trustworthy_evals.js`**: Analyzes nightly history to identify
|
||||||
|
stable tests (80%+ aggregate pass rate).
|
||||||
|
- **`scripts/run_regression_check.js`**: Runs a specific set of tests using the
|
||||||
|
"Best-of-4" logic and "Dynamic Baseline Verification".
|
||||||
|
- **`scripts/run_eval_regression.js`**: The main orchestrator that loops through
|
||||||
|
models and generates the final PR report.
|
||||||
|
|
||||||
|
### Running Regression Checks Locally
|
||||||
|
|
||||||
|
You can simulate the PR regression check locally to verify your changes before
|
||||||
|
pushing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run the full regression loop for a specific model
|
||||||
|
MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js
|
||||||
|
```
|
||||||
|
|
||||||
|
To debug a specific failing test with the same logic used in CI:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Get the Vitest pattern for trustworthy tests
|
||||||
|
OUTPUT=$(node scripts/get_trustworthy_evals.js "gemini-3-flash-preview")
|
||||||
|
|
||||||
|
# 2. Run the regression logic for those tests
|
||||||
|
node scripts/run_regression_check.js "gemini-3-flash-preview" "$OUTPUT"
|
||||||
|
```
|
||||||
|
|
||||||
|
### The Regression Quality Bar
|
||||||
|
|
||||||
|
Because LLMs are non-deterministic, the PR regression check uses a high-signal
|
||||||
|
probabilistic approach rather than a 100% pass requirement:
|
||||||
|
|
||||||
|
1. **Trustworthiness (60/80 Filter):** Only tests with a proven track record
|
||||||
|
are run. A test must score above **60%** (e.g., 2/3) every single night and
|
||||||
|
exceed an **80% aggregate** pass rate over the last 6 days.
|
||||||
|
2. **The 50% Pass Rule:** In a PR, a test is considered a **Pass** if the model
|
||||||
|
correctly performs the behavior at least half the time (**2 successes** out
|
||||||
|
of up to 4 attempts).
|
||||||
|
3. **Dynamic Baseline Verification:** If a test fails in a PR (e.g., 0/3), the
|
||||||
|
system automatically checks the `main` branch. If it fails there too, it is
|
||||||
|
marked as **Pre-existing** and cleared for the PR, ensuring you are only
|
||||||
|
blocked by regressions caused by your specific changes.
|
||||||
|
|
||||||
|
## Fixing Evaluations
|
||||||
|
|
||||||
#### How to interpret the report:
|
#### How to interpret the report:
|
||||||
|
|
||||||
- **Pass Rate (%)**: Each cell represents the percentage of successful runs for
|
- **Pass Rate (%)**: Each cell represents the percentage of successful runs for
|
||||||
|
|||||||
@@ -0,0 +1,142 @@
|
|||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @fileoverview Compares PR evaluation results against historical nightly baselines.
|
||||||
|
*
|
||||||
|
* This script generates a Markdown report for use in PR comments. It aligns with
|
||||||
|
* the 6-day lookback logic to show accurate historical pass rates and filters out
|
||||||
|
* pre-existing or noisy failures to ensure only actionable regressions are reported.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import fs from 'node:fs';
|
||||||
|
import path from 'node:path';
|
||||||
|
import { fetchNightlyHistory } from './eval_utils.js';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main execution logic.
|
||||||
|
*/
|
||||||
|
/**
 * Main execution logic.
 *
 * Reads the PR evaluation report, compares it against the aggregated
 * nightly baseline for the target model (argv[2]), and writes a Markdown
 * summary to stdout: either a regression table with fix instructions, or
 * a one-line success message. Returns silently when no PR report exists.
 */
function main() {
  const prReportPath = 'evals/logs/pr_final_report.json';
  const targetModel = process.argv[2];

  if (!targetModel) {
    console.error('❌ Error: No target model specified.');
    process.exit(1);
  }

  if (!fs.existsSync(prReportPath)) {
    console.error('No PR report found.');
    return;
  }

  const prReport = JSON.parse(fs.readFileSync(prReportPath, 'utf-8'));
  // Same 6-day lookback window the trustworthiness filter uses.
  const nightlyBaseline = aggregateHistoricalStats(
    fetchNightlyHistory(6),
    targetModel,
  );

  const failing = [];
  const passing = [];

  for (const [testName, prStat] of Object.entries(prReport.results)) {
    const prRate = prStat.passed / prStat.total;
    const isRegression =
      prStat.status === 'regression' || (prRate <= 0.34 && !prStat.status);

    if (!isRegression) {
      passing.push(testName);
      continue;
    }

    // Report paths relative to the workspace root.
    const relativeFile = prStat.file
      ? path.relative(process.cwd(), prStat.file)
      : 'evals/';
    const baseline = nightlyBaseline[testName];

    failing.push({
      name: testName,
      file: relativeFile,
      nightly: baseline ? (baseline.passRate * 100).toFixed(0) + '%' : 'N/A',
      pr: (prRate * 100).toFixed(0) + '%',
    });
  }

  if (failing.length > 0) {
    const out = [];
    out.push('### 🚨 Action Required: Eval Regressions Detected\n\n');
    out.push(`**Model:** \`${targetModel}\`\n\n`);
    out.push(
      'The following trustworthy evaluations passed on **`main`** and in **recent Nightly runs**, but failed in this PR. These regressions must be addressed before merging.\n\n',
    );

    out.push('| Test Name | Nightly | PR Result | Status |\n');
    out.push('| :--- | :---: | :---: | :--- |\n');
    for (const failure of failing) {
      out.push(
        `| ${failure.name} | ${failure.nightly} | ${failure.pr} | ❌ **Regression** |\n`,
      );
    }
    out.push(
      `\n*The check passed or was cleared for ${passing.length} other trustworthy evaluations.*\n\n`,
    );

    out.push('<details>\n');
    out.push(
      '<summary><b>🛠️ Troubleshooting & Fix Instructions</b></summary>\n\n',
    );

    failing.forEach((failure, index) => {
      if (failing.length > 1) {
        out.push(`### Failure ${index + 1}: ${failure.name}\n\n`);
      }

      out.push('#### 1. Ask Gemini CLI to fix it (Recommended)\n');
      out.push('Copy and paste this prompt to the agent:\n');
      out.push('```text\n');
      out.push(
        `The eval "${failure.name}" in ${failure.file} is failing. Investigate and fix it using the behavioral-evals skill.\n`,
      );
      out.push('```\n\n');

      out.push('#### 2. Reproduce Locally\n');
      out.push('Run the following command to see the failure trajectory:\n');
      out.push('```bash\n');
      // Replace apostrophes with a regex wildcard — presumably so the
      // double-quoted shell argument stays intact; verify against usage.
      const namePattern = failure.name.replace(/'/g, '.');
      out.push(
        `GEMINI_MODEL=${targetModel} npm run test:all_evals -- ${failure.file} --testNamePattern="${namePattern}"\n`,
      );
      out.push('```\n\n');

      if (index < failing.length - 1) {
        out.push('---\n\n');
      }
    });

    out.push('#### 3. Manual Fix\n');
    out.push(
      'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n',
    );
    out.push('</details>\n');

    process.stdout.write(out.join(''));
  } else if (passing.length > 0) {
    // Success State: nothing regressed, emit a one-line summary.
    process.stdout.write(
      `✅ **${passing.length}** tests passed successfully on **${targetModel}**.\n`,
    );
  }
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Aggregates stats from history for a specific model.
|
||||||
|
*/
|
||||||
|
/**
 * Aggregates stats from history for a specific model.
 *
 * Sums the per-run passed/total counters for every test seen in any
 * nightly snapshot, then derives a passRate for each test from the
 * summed counters.
 */
function aggregateHistoricalStats(history, model) {
  const aggregated = {};

  for (const snapshot of history) {
    const perTest = snapshot.stats[model];
    if (!perTest) continue;

    for (const [testName, counters] of Object.entries(perTest)) {
      const bucket = aggregated[testName] || { passed: 0, total: 0 };
      bucket.passed += counters.passed;
      bucket.total += counters.total;
      aggregated[testName] = bucket;
    }
  }

  for (const bucket of Object.values(aggregated)) {
    bucket.passRate = bucket.passed / bucket.total;
  }

  return aggregated;
}
|
||||||
|
|
||||||
|
main();
|
||||||
@@ -0,0 +1,136 @@
|
|||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
import fs from 'node:fs';
|
||||||
|
import path from 'node:path';
|
||||||
|
import { execSync } from 'node:child_process';
|
||||||
|
import os from 'node:os';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds all report.json files recursively in a directory.
|
||||||
|
*/
|
||||||
|
/**
 * Finds all report.json files recursively in a directory.
 *
 * Uses readdirSync(withFileTypes) so the common case needs no extra
 * stat() call per entry; symlinks are still resolved explicitly (as the
 * previous statSync-based walk did), but dangling links are skipped
 * instead of throwing and aborting the whole scan.
 *
 * @param {string} dir - Root directory to scan; may not exist.
 * @returns {string[]} Full paths of every report.json found.
 */
export function findReports(dir) {
  const reports = [];
  if (!fs.existsSync(dir)) return reports;

  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    const fullPath = path.join(dir, entry.name);

    // Follow symlinks like statSync did, but tolerate broken links.
    let isDirectory = entry.isDirectory();
    if (entry.isSymbolicLink()) {
      try {
        isDirectory = fs.statSync(fullPath).isDirectory();
      } catch {
        continue; // dangling symlink — skip it
      }
    }

    if (isDirectory) {
      reports.push(...findReports(fullPath));
    } else if (entry.name === 'report.json') {
      reports.push(fullPath);
    }
  }
  return reports;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts the model name from the artifact path.
|
||||||
|
*/
|
||||||
|
/**
 * Extracts the model name from the artifact path.
 *
 * Artifact directories follow the `eval-logs-<model>-<number>` naming
 * convention; the first such path segment determines the model.
 */
export function getModelFromPath(reportPath) {
  for (const segment of reportPath.split(path.sep)) {
    if (!segment.startsWith('eval-logs-')) continue;
    const parsed = /^eval-logs-(.+)-(\d+)$/.exec(segment);
    return parsed ? parsed[1] : 'unknown';
  }
  return 'unknown';
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Escapes special characters in a string for use in a regular expression.
|
||||||
|
*/
|
||||||
|
/**
 * Escapes special characters in a string for use in a regular expression.
 */
export function escapeRegex(string) {
  // Backslash-escape every regex metacharacter so the input matches literally.
  return string.replace(/[.*+?^${}()|[\]\\]/g, (ch) => `\\${ch}`);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Aggregates stats from a list of report.json files.
|
||||||
|
* @returns {Record<string, Record<string, {passed: number, total: number, file?: string}>>} statsByModel
|
||||||
|
*/
|
||||||
|
/**
 * Aggregates stats from a list of report.json files.
 *
 * Each vitest report contributes one attempt per assertion result;
 * counts are grouped first by model (derived from the artifact path)
 * and then by test title.
 * @returns {Record<string, Record<string, {passed: number, total: number, file?: string}>>} statsByModel
 */
export function getStatsFromReports(reports) {
  const statsByModel = {};

  for (const reportPath of reports) {
    try {
      const model = getModelFromPath(reportPath);
      let testStats = statsByModel[model];
      if (!testStats) {
        testStats = statsByModel[model] = {};
      }

      const report = JSON.parse(fs.readFileSync(reportPath, 'utf-8'));

      for (const suite of report.testResults) {
        for (const assertion of suite.assertionResults) {
          let entry = testStats[assertion.title];
          if (!entry) {
            // Remember the suite file so callers can map a test back to
            // the eval file it lives in.
            entry = testStats[assertion.title] = {
              passed: 0,
              total: 0,
              file: suite.name,
            };
          }
          entry.total += 1;
          if (assertion.status === 'passed') {
            entry.passed += 1;
          }
        }
      }
    } catch (error) {
      // A malformed report only loses its own data, never the whole batch.
      console.error(`Error processing report at ${reportPath}:`, error.message);
    }
  }

  return statsByModel;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetches historical nightly data using the GitHub CLI.
|
||||||
|
* @returns {Array<{runId: string, stats: Record<string, any>}>} history
|
||||||
|
*/
|
||||||
|
/**
 * Fetches historical nightly data using the GitHub CLI.
 *
 * Lists recent completed runs of the nightly workflow, downloads each
 * run's eval-log artifacts into a temp directory, aggregates their
 * reports, and cleans up. Any per-run or top-level failure is logged
 * and skipped, so the function always returns (possibly empty) history.
 * @returns {Array<{runId: string, stats: Record<string, any>}>} history
 */
export function fetchNightlyHistory(lookbackCount) {
  const history = [];
  try {
    const listCmd = `gh run list --workflow evals-nightly.yml --branch main --limit ${
      lookbackCount + 2
    } --json databaseId,status`;
    const allRuns = JSON.parse(execSync(listCmd, { encoding: 'utf-8' }));

    // Over-fetched by 2 above; keep only completed runs, trimmed to N.
    const completedRuns = allRuns
      .filter((candidate) => candidate.status === 'completed')
      .slice(0, lookbackCount);

    for (const nightlyRun of completedRuns) {
      const downloadDir = fs.mkdtempSync(
        path.join(os.tmpdir(), `gemini-evals-hist-${nightlyRun.databaseId}-`),
      );
      try {
        execSync(
          `gh run download ${nightlyRun.databaseId} -p "eval-logs-*" -D "${downloadDir}"`,
          { stdio: 'ignore' },
        );

        const reportFiles = findReports(downloadDir);
        if (reportFiles.length > 0) {
          history.push({
            runId: nightlyRun.databaseId,
            stats: getStatsFromReports(reportFiles),
          });
        }
      } catch (error) {
        console.error(
          `Failed to process artifacts for run ${nightlyRun.databaseId}:`,
          error.message,
        );
      } finally {
        // Always reclaim the temp dir, even on download/parse failure.
        fs.rmSync(downloadDir, { recursive: true, force: true });
      }
    }
  } catch (error) {
    console.error('Failed to fetch history:', error.message);
  }
  return history;
}
|
||||||
@@ -0,0 +1,125 @@
|
|||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @fileoverview Identifies "Trustworthy" behavioral evaluations from nightly history.
|
||||||
|
*
|
||||||
|
* This script analyzes the last 6 days of nightly runs to find tests that meet
|
||||||
|
* strict stability criteria (80% aggregate pass rate and 60% daily floor).
|
||||||
|
* It outputs a list of files and a Vitest pattern used by the PR regression check
|
||||||
|
* to ensure high-signal validation and minimize noise.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { fetchNightlyHistory, escapeRegex } from './eval_utils.js';
|
||||||
|
|
||||||
|
const LOOKBACK_COUNT = 6;
|
||||||
|
const MIN_VALID_RUNS = 5; // At least 5 out of 6 must be available
|
||||||
|
const PASS_RATE_THRESHOLD = 0.6; // Daily floor (e.g., 2/3)
|
||||||
|
const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main execution logic.
|
||||||
|
*/
|
||||||
|
/**
 * Main execution logic.
 *
 * Aggregates per-test nightly results for the requested model (argv[2]),
 * classifies each test as trustworthy / volatile / too-new, logs the
 * classification to stderr, and — when stdout is piped (CI) — prints the
 * file list and vitest name pattern for consumption by shell scripts.
 */
function main() {
  const targetModel = process.argv[2];
  if (!targetModel) {
    console.error('❌ Error: No target model specified.');
    process.exit(1);
  }
  console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);

  const history = fetchNightlyHistory(LOOKBACK_COUNT);
  if (history.length === 0) {
    console.error('❌ No historical data found.');
    process.exit(1);
  }

  // Fold every nightly snapshot into one per-test record:
  // { totalPassed, totalRuns, dailyRates[], file }.
  const testHistories = {};
  for (const snapshot of history) {
    const perTest = snapshot.stats[targetModel];
    if (!perTest) continue;

    for (const [testName, stat] of Object.entries(perTest)) {
      let record = testHistories[testName];
      if (!record) {
        record = testHistories[testName] = {
          totalPassed: 0,
          totalRuns: 0,
          dailyRates: [],
          file: stat.file,
        };
      }
      record.totalPassed += stat.passed;
      record.totalRuns += stat.total;
      record.dailyRates.push(stat.passed / stat.total);
    }
  }

  const trustworthyTests = [];
  const trustworthyFiles = new Set();
  const volatileTests = [];
  const newTests = [];

  for (const [testName, record] of Object.entries(testHistories)) {
    // 1. Too few data points: the test is new, not yet classifiable.
    if (record.dailyRates.length < MIN_VALID_RUNS) {
      newTests.push(testName);
      continue;
    }

    // 2. Trustworthy = every single night clears the daily floor AND the
    // overall aggregate pass rate is high-signal.
    const clearsDailyFloor = record.dailyRates.every(
      (rate) => rate > PASS_RATE_THRESHOLD,
    );
    const clearsAggregate =
      record.totalPassed / record.totalRuns > AGGREGATE_PASS_RATE_THRESHOLD;

    if (!clearsDailyFloor || !clearsAggregate) {
      volatileTests.push(testName);
      continue;
    }

    trustworthyTests.push(testName);
    if (record.file) {
      const fileMatch = record.file.match(/evals\/.*\.eval\.ts/);
      if (fileMatch) {
        trustworthyFiles.add(fileMatch[0]);
      }
    }
  }

  console.error(
    `✅ Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`,
  );
  trustworthyTests.sort().forEach((name) => console.error(` - ${name}`));
  console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
  console.error(
    `🆕 Ignored ${newTests.length} tests with insufficient history.`,
  );

  // vitest -t accepts a regex; join the escaped names into one pattern.
  const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|');

  // Unique file paths as a space-separated string.
  const files = Array.from(trustworthyFiles).join(' ');

  // Emit machine-readable output only when piped (CI); keep interactive
  // terminals clean.
  if (!process.stdout.isTTY) {
    // Format: FILE_LIST --test-pattern TEST_PATTERN
    // This allows the workflow to easily use it
    process.stdout.write(`${files} --test-pattern ${pattern || ''}\n`);
  } else {
    console.error(
      '\n💡 Note: Raw regex pattern and file list are hidden in interactive terminal. It will be printed when piped or in CI.',
    );
  }
}
|
||||||
|
|
||||||
|
main();
|
||||||
@@ -0,0 +1,107 @@
|
|||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @fileoverview Orchestrates the PR evaluation process across multiple models.
|
||||||
|
*
|
||||||
|
* This script loops through a provided list of models, identifies trustworthy
|
||||||
|
* tests for each, executes the frugal regression check, and collects results
|
||||||
|
* into a single unified report. It exits with code 1 if any confirmed
|
||||||
|
* regressions are detected.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { execSync } from 'node:child_process';
|
||||||
|
import fs from 'node:fs';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main execution logic.
|
||||||
|
*/
|
||||||
|
/**
 * Main execution logic.
 *
 * For each model in MODEL_LIST: identifies trustworthy tests, runs the
 * frugal regression check, and collects the per-model Markdown reports.
 * The combined report is written to eval_regression_report.md; the
 * process always exits 0 so the workflow surfaces results via the PR
 * comment rather than a failed job.
 */
async function main() {
  const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview';
  const models = modelList.split(',').map((m) => m.trim());

  const reportSections = [];
  let regressionFound = false;

  console.log(
    `🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
  );

  for (const model of models) {
    console.log(`\n--- Processing Model: ${model} ---`);

    try {
      // 1. Identify trustworthy evals (stdout captured; stdin/stderr pass through).
      console.log(`🔍 Identifying trustworthy tests for ${model}...`);
      const trustworthy = execSync(
        `node scripts/get_trustworthy_evals.js "${model}"`,
        {
          encoding: 'utf-8',
          stdio: ['inherit', 'pipe', 'inherit'],
        },
      ).trim();

      if (!trustworthy) {
        console.log(`ℹ️ No trustworthy tests found for ${model}. Skipping.`);
        continue;
      }

      // 2. Run the frugal regression check.
      console.log(`🧪 Running regression check for ${model}...`);
      execSync(
        `node scripts/run_regression_check.js "${model}" "${trustworthy}"`,
        { stdio: 'inherit' },
      );

      // 3. Generate the per-model Markdown report.
      console.log(`📊 Generating report for ${model}...`);
      const report = execSync(`node scripts/compare_evals.js "${model}"`, {
        encoding: 'utf-8',
        stdio: ['inherit', 'pipe', 'inherit'],
      }).trim();

      if (report) {
        reportSections.push(report);
        // 4. The "Action Required" marker signals a confirmed regression.
        if (report.includes('Action Required')) {
          regressionFound = true;
        }
      }
    } catch (error) {
      // A critical failure for any model is treated like a regression.
      console.error(`❌ Error processing model ${model}:`, error.message);
      regressionFound = true;
    }
  }

  // Always persist the combined report so the workflow can capture it cleanly.
  if (reportSections.length > 0) {
    fs.writeFileSync(
      'eval_regression_report.md',
      reportSections.join('\n\n---\n\n'),
    );
    console.log(
      '\n📊 Final Markdown report saved to eval_regression_report.md',
    );
  }

  // Log status for CI visibility, but don't exit with error.
  if (regressionFound) {
    console.error(
      '\n⚠️ Confirmed regressions detected across one or more models. See PR comment for details.',
    );
  } else {
    console.log('\n✅ All evaluations passed successfully (or were cleared).');
  }

  process.exit(0);
}
|
||||||
|
|
||||||
|
// Entry point guard: surface any unhandled rejection from the orchestration
// run and fail the process so CI records the error.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
||||||
@@ -0,0 +1,305 @@
|
|||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @fileoverview Executes a high-signal regression check for behavioral evaluations.
|
||||||
|
*
|
||||||
|
* This script runs a targeted set of stable tests in an optimistic first pass.
|
||||||
|
* If failures occur, it employs a "Best-of-4" retry logic to handle natural flakiness.
|
||||||
|
* For confirmed failures (0/3), it performs Dynamic Baseline Verification by
|
||||||
|
* checking the failure against the 'main' branch to distinguish between
|
||||||
|
* model drift and PR-introduced regressions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { execSync } from 'node:child_process';
|
||||||
|
import fs from 'node:fs';
|
||||||
|
import path from 'node:path';
|
||||||
|
import { quote } from 'shell-quote';
|
||||||
|
import { escapeRegex } from './eval_utils.js';
|
||||||
|
|
||||||
|
/**
 * Runs the given eval files through Vitest with a test-name pattern and
 * returns the parsed JSON report.
 *
 * @param {string} files - Space-separated file paths (falls back to 'evals/').
 * @param {string} pattern - Regex passed to Vitest's `-t` name filter.
 * @param {string} model - Value exported as GEMINI_MODEL for the run.
 * @returns {object|null} Parsed Vitest JSON report, or null when no report
 *   file was produced.
 */
function runTests(files, pattern, model) {
  const outputDir = path.resolve(
    process.cwd(),
    `evals/logs/pr-run-${Date.now()}`,
  );
  fs.mkdirSync(outputDir, { recursive: true });

  const filesToRun = files || 'evals/';
  console.log(
    `🚀 Running tests in ${filesToRun} with pattern: ${pattern?.slice(0, 100)}...`,
  );

  const reportPath = path.join(outputDir, 'report.json');
  try {
    // Build the command as an argv list and shell-escape it with shell-quote
    // (already used by parseArgs) so regex metacharacters, quotes, or spaces
    // in the test-name pattern cannot break or inject into the shell command.
    // String(pattern) preserves the previous template-literal coercion.
    const argv = [
      'npx',
      'vitest',
      'run',
      '--config',
      'evals/vitest.config.ts',
      ...filesToRun.split(/\s+/).filter(Boolean),
      '-t',
      String(pattern),
      '--reporter=json',
      '--reporter=default',
      `--outputFile=${reportPath}`,
    ];
    execSync(quote(argv), {
      stdio: 'inherit',
      env: { ...process.env, RUN_EVALS: '1', GEMINI_MODEL: model },
    });
  } catch {
    // Vitest returns a non-zero exit code when tests fail. This is expected.
    // We continue execution and handle the failures by parsing the JSON report.
  }

  return fs.existsSync(reportPath)
    ? JSON.parse(fs.readFileSync(reportPath, 'utf-8'))
    : null;
}
|
||||||
|
|
||||||
|
/**
 * Searches every file result in a Vitest JSON report for an assertion whose
 * title exactly matches the given test name.
 *
 * @param {object|null} report - Parsed Vitest JSON report (may be null).
 * @param {string} testName - Exact assertion title to look for.
 * @returns {object|null} The first matching assertion result, or null.
 */
function findAssertion(report, testName) {
  const fileResults = report?.testResults ?? [];
  for (const fileResult of fileResults) {
    const match = fileResult.assertionResults.find(
      (candidate) => candidate.title === testName,
    );
    if (match) {
      return match;
    }
  }
  return null;
}
|
||||||
|
|
||||||
|
/**
 * Resolves the model, eval files, and test-name pattern from process.argv.
 *
 * Supports two invocation styles:
 *   1. CI mode:     <model> [files...] --test-pattern <pattern...>
 *   2. Manual mode: <pattern> <model>  (files are discovered via grep)
 *
 * @returns {{model: string, files: string, pattern: string, isManual?: boolean}}
 */
function parseArgs() {
  const [, , modelArg, ...rest] = process.argv;

  // CI mode, well-formed: '--test-pattern' appears as its own argument.
  const markerIndex = rest.indexOf('--test-pattern');
  if (markerIndex !== -1) {
    return {
      model: modelArg,
      files: rest.slice(0, markerIndex).join(' '),
      pattern: rest.slice(markerIndex + 1).join(' '),
    };
  }

  // CI mode, degraded: the marker got glued into a neighbouring argument
  // (e.g. by shell re-quoting), so split the joined argument string on it.
  const joined = rest.join(' ');
  if (joined.includes('--test-pattern')) {
    const [filesPart, patternPart] = joined.split('--test-pattern');
    return {
      model: modelArg,
      files: filesPart.trim(),
      pattern: patternPart.trim(),
    };
  }

  // Manual mode: argv is '<pattern> <model>'.
  const manualPattern = process.argv[2];
  const manualModel = process.argv[3];
  if (!manualModel) {
    console.error('❌ Error: No target model specified.');
    process.exit(1);
  }

  // Narrow the file set to evals that mention the pattern; grep exits with
  // code 1 when nothing matches, in which case we scan all of evals/.
  let manualFiles = 'evals/';
  try {
    const grepResult = execSync(
      `grep -l ${quote([manualPattern])} evals/*.eval.ts`,
      { encoding: 'utf-8' },
    );
    manualFiles = grepResult.split('\n').filter(Boolean).join(' ');
  } catch {
    // No matching files; fall back to the whole evals/ directory.
  }

  return {
    model: manualModel,
    files: manualFiles,
    pattern: manualPattern,
    isManual: true,
  };
}
|
||||||
|
|
||||||
|
/**
 * Applies Best-of-4 retry logic to a single failing test.
 *
 * Keeps re-running the test until it either accumulates two passes (cleared
 * as a "Noisy Pass") or three failures (escalated to baseline verification),
 * capped at four attempts total. Mutates the score entry in `results`.
 *
 * @param {string} testName - Title of the failing test.
 * @param {object} results - Per-test score map (mutated in place).
 * @param {string} files - Space-separated eval files to run.
 * @param {string} model - Model identifier for the run.
 */
async function runRetries(testName, results, files, model) {
  console.log(`\nRe-evaluating: ${testName}`);

  const score = results[testName];
  const shouldRetry = () =>
    score.passed < 2 && score.total - score.passed < 3 && score.total < 4;

  while (shouldRetry()) {
    const attempt = score.total + 1;
    console.log(`  Running attempt ${attempt}...`);

    const rerun = runTests(files, escapeRegex(testName), model);
    const outcome = findAssertion(rerun, testName);

    score.total++;
    if (outcome?.status === 'passed') {
      score.passed++;
      console.log(
        `  ✅ Attempt ${attempt} passed. Score: ${score.passed}/${score.total}`,
      );
    } else {
      console.log(
        `  ❌ Attempt ${attempt} failed (${outcome?.status || 'unknown'}). Score: ${score.passed}/${score.total}`,
      );
    }

    // Two passes clear the test as flaky noise; three failures trigger a
    // baseline check against 'main'. Either outcome ends the retry loop.
    if (score.passed >= 2) {
      console.log(
        `  ✅ Test cleared as Noisy Pass (${score.passed}/${score.total})`,
      );
    } else if (score.total - score.passed >= 3) {
      await verifyBaseline(testName, results, files, model);
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Verifies a confirmed PR failure against the 'main' branch to decide whether
 * it is a pre-existing failure (model drift) or a PR-introduced regression.
 *
 * Stashes any working-tree changes, checks out 'main', reruns the test up to
 * three times (a single pass proves 'main' is healthy), then restores the
 * original branch and stash. Mutates `results[testName].status` to either
 * 'pre-existing' or 'regression'.
 *
 * @param {string} testName - Title of the failing test to verify.
 * @param {object} results - Per-test score map (mutated in place).
 * @param {string} files - Space-separated eval files to run.
 * @param {string} model - Model identifier passed through to the test run.
 */
async function verifyBaseline(testName, results, files, model) {
  console.log('\n--- Step 3: Dynamic Baseline Verification ---');
  console.log(
    `⚠️ Potential regression detected. Verifying baseline on 'main'...`,
  );

  // Declared outside the try so the catch handler can also restore the
  // user's stashed changes after a partial failure (previously the flag was
  // scoped to the try block, stranding the stash on error).
  let stashed = false;
  try {
    execSync('git stash push -m "eval-regression-check-stash"', {
      stdio: 'inherit',
    });
    // 'git stash push' is a no-op on a clean tree, so confirm a stash entry
    // was actually created before we later try to pop it.
    stashed = execSync('git stash list')
      .toString()
      .includes('eval-regression-check-stash');
    execSync('git checkout main', { stdio: 'inherit' });

    console.log(
      `\n--- Running Baseline Verification on 'main' (Best-of-3) ---`,
    );
    let baselinePasses = 0;
    let baselineTotal = 0;

    while (baselinePasses === 0 && baselineTotal < 3) {
      baselineTotal++;
      console.log(`  Baseline Attempt ${baselineTotal}...`);
      const baselineRun = runTests(files, escapeRegex(testName), model);
      if (findAssertion(baselineRun, testName)?.status === 'passed') {
        baselinePasses++;
        console.log(`  ✅ Baseline Attempt ${baselineTotal} passed.`);
      } else {
        console.log(`  ❌ Baseline Attempt ${baselineTotal} failed.`);
      }
    }

    execSync('git checkout -', { stdio: 'inherit' });
    if (stashed) {
      execSync('git stash pop', { stdio: 'inherit' });
      stashed = false;
    }

    if (baselinePasses === 0) {
      console.log(
        `  ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`,
      );
      results[testName].status = 'pre-existing';
      results[testName].passed = results[testName].total; // Clear for report
    } else {
      console.log(
        `  ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`,
      );
      results[testName].status = 'regression';
    }
  } catch (error) {
    console.error(`  ❌ Failed to verify baseline: ${error.message}`);

    // Best-effort cleanup: return to the original branch and restore any
    // stashed changes so the PR working tree is not left stranded.
    try {
      execSync('git checkout -', { stdio: 'ignore' });
      if (stashed) execSync('git stash pop', { stdio: 'ignore' });
    } catch {
      // Ignore cleanup errors to avoid hiding the original error.
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Aggregates first-pass Vitest results, retries failures (Best-of-4), and
 * persists the final score report.
 *
 * @param {object|null} firstPass - Parsed Vitest JSON report, or null.
 * @param {string} pattern - Test-name pattern (not read here; kept for
 *   signature compatibility with callers).
 * @param {string} model - Model identifier forwarded to retry runs.
 * @param {string} files - Space-separated eval files forwarded to retry runs.
 * @returns {Promise<boolean>} true when at least one test was processed.
 */
async function processResults(firstPass, pattern, model, files) {
  if (!firstPass) return false;

  const results = {};
  const failingTests = [];
  let totalProcessed = 0;

  // Flatten the per-file report into (assertion, file) pairs, keeping only
  // tests that actually executed (skipped/todo entries are ignored).
  const executed = firstPass.testResults.flatMap((fileResult) =>
    fileResult.assertionResults
      .filter((a) => a.status === 'passed' || a.status === 'failed')
      .map((assertion) => ({ assertion, file: fileResult.name })),
  );

  for (const { assertion, file } of executed) {
    results[assertion.title] = {
      passed: assertion.status === 'passed' ? 1 : 0,
      total: 1,
      file,
    };
    if (assertion.status === 'failed') failingTests.push(assertion.title);
    totalProcessed++;
  }

  if (totalProcessed === 0) {
    console.error('❌ Error: No matching tests were found or executed.');
    return false;
  }

  if (failingTests.length === 0) {
    console.log('✅ All trustworthy tests passed on the first try!');
  } else {
    console.log('\n--- Step 2: Best-of-4 Retries ---');
    console.log(
      `⚠️ ${failingTests.length} tests failed the optimistic run. Starting retries...`,
    );
    // Sequential on purpose: baseline verification switches git branches and
    // must never overlap with another test run.
    for (const testName of failingTests) {
      await runRetries(testName, results, files, model);
    }
  }

  saveResults(results);
  return true;
}
|
||||||
|
|
||||||
|
/**
 * Writes the final per-test score map to evals/logs/pr_final_report.json.
 *
 * @param {object} results - Per-test score map to persist.
 */
function saveResults(results) {
  const reportPath = 'evals/logs/pr_final_report.json';
  const finalReport = { timestamp: new Date().toISOString(), results };
  // Ensure the log directory exists: previously it only existed as a side
  // effect of runTests' mkdir, which is fragile if call order ever changes.
  fs.mkdirSync(path.dirname(reportPath), { recursive: true });
  fs.writeFileSync(reportPath, JSON.stringify(finalReport, null, 2));
  console.log('\nFinal report saved to evals/logs/pr_final_report.json');
}
|
||||||
|
|
||||||
|
/**
 * Entry point: parses CLI arguments, runs the optimistic first pass, and
 * exits 0 on success or 1 when processing failed.
 */
async function main() {
  const { model, files, pattern, isManual } = parseArgs();

  // Manual invocations always run; CI invocations are skipped entirely when
  // there is no trustworthy-test pattern to check.
  if (!isManual) {
    if (!pattern) {
      console.log('No trustworthy tests to run.');
      process.exit(0);
    }
    console.log('\n--- Step 1: Optimistic Run (N=1) ---');
  }

  const firstPass = runTests(files, pattern, model);
  const success = await processResults(firstPass, pattern, model, files);
  process.exit(success ? 0 : 1);
}
|
||||||
|
|
||||||
|
// Entry point guard: surface any unhandled rejection and fail the process so
// the CI workflow records the error.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
||||||
Reference in New Issue
Block a user