diff --git a/.github/workflows/eval-guidance.yml b/.github/workflows/eval-guidance.yml deleted file mode 100644 index e1f1ab3168..0000000000 --- a/.github/workflows/eval-guidance.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: 'Evals: PR Guidance' - -on: - pull_request: - paths: - - 'packages/core/src/**/*.ts' - - '!**/*.test.ts' - - '!**/*.test.tsx' - -permissions: - pull-requests: 'write' - contents: 'read' - -jobs: - provide-guidance: - name: 'Model Steering Guidance' - runs-on: 'ubuntu-latest' - if: "github.repository == 'google-gemini/gemini-cli'" - steps: - - name: 'Checkout' - uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v4 - with: - fetch-depth: 0 - - - name: 'Set up Node.js' - uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4.4.0 - with: - node-version-file: '.nvmrc' - cache: 'npm' - - - name: 'Detect Steering Changes' - id: 'detect' - run: | - STEERING_DETECTED=$(node scripts/changed_prompt.js --steering-only) - echo "STEERING_DETECTED=$STEERING_DETECTED" >> "$GITHUB_OUTPUT" - - - name: 'Analyze PR Content' - if: "steps.detect.outputs.STEERING_DETECTED == 'true'" - id: 'analysis' - env: - GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' - run: | - # Check for behavioral eval changes - EVAL_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD | grep "^evals/" || true) - if [ -z "$EVAL_CHANGES" ]; then - echo "MISSING_EVALS=true" >> "$GITHUB_OUTPUT" - fi - - # Check if user is a maintainer (has write/admin access) - USER_PERMISSION=$(gh api repos/${{ github.repository }}/collaborators/${{ github.actor }}/permission --jq '.permission') - if [[ "$USER_PERMISSION" == "admin" || "$USER_PERMISSION" == "write" ]]; then - echo "IS_MAINTAINER=true" >> "$GITHUB_OUTPUT" - fi - - - name: 'Post Guidance Comment' - if: "steps.detect.outputs.STEERING_DETECTED == 'true'" - uses: 'thollander/actions-comment-pull-request@65f9e5c9a1f2cd378bd74b2e057c9736982a8e74' # 
ratchet:thollander/actions-comment-pull-request@v3 - with: - comment-tag: 'eval-guidance-bot' - message: | - ### ๐Ÿง  Model Steering Guidance - - This PR modifies files that affect the model's behavior (prompts, tools, or instructions). - - ${{ steps.analysis.outputs.MISSING_EVALS == 'true' && '- โš ๏ธ **Consider adding Evals:** No behavioral evaluations (`evals/*.eval.ts`) were added or updated in this PR. Consider adding a test case to verify the new behavior and prevent regressions.' || '' }} - ${{ steps.analysis.outputs.IS_MAINTAINER == 'true' && '- ๐Ÿš€ **Maintainer Reminder:** Please ensure that these changes do not regress results on benchmark evals before merging.' || '' }} - - --- - *This is an automated guidance message triggered by steering logic signatures.* diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml new file mode 100644 index 0000000000..e0f839e667 --- /dev/null +++ b/.github/workflows/eval-pr.yml @@ -0,0 +1,137 @@ +name: 'Evals: PR Evaluation & Regression' + +on: + pull_request: + types: ['opened', 'synchronize', 'reopened', 'ready_for_review'] + paths: + - 'packages/core/src/prompts/**' + - 'packages/core/src/tools/**' + - 'packages/core/src/agents/**' + - 'evals/**' + - '!**/*.test.ts' + - '!**/*.test.tsx' + workflow_dispatch: + +# Prevents multiple runs for the same PR simultaneously (saves tokens) +concurrency: + group: '${{ github.workflow }}-${{ github.head_ref || github.ref }}' + cancel-in-progress: true + +permissions: + pull-requests: 'write' + contents: 'read' + actions: 'read' + +jobs: + pr-evaluation: + name: 'Evaluate Steering & Regressions' + runs-on: 'gemini-cli-ubuntu-16-core' + if: "github.repository == 'google-gemini/gemini-cli' && (github.event_name != 'pull_request' || (github.event.pull_request.draft == false && github.event.pull_request.head.repo.full_name == github.repository))" + # External contributors' PRs will wait for approval in this environment + environment: |- + ${{ 
(github.event.pull_request.head.repo.full_name == github.repository) && 'internal' || 'external-evals' }} + env: + # CENTRALIZED MODEL LIST + MODEL_LIST: 'gemini-3-flash-preview' + + steps: + - name: 'Checkout' + uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5 + with: + fetch-depth: 0 + + - name: 'Set up Node.js' + uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4.4.0 + with: + node-version-file: '.nvmrc' + cache: 'npm' + + - name: 'Install dependencies' + run: 'npm ci' + + - name: 'Build project' + run: 'npm run build' + + - name: 'Detect Steering Changes' + id: 'detect' + run: | + SHOULD_RUN=$(node scripts/changed_prompt.js) + STEERING_DETECTED=$(node scripts/changed_prompt.js --steering-only) + echo "SHOULD_RUN=$SHOULD_RUN" >> "$GITHUB_OUTPUT" + echo "STEERING_DETECTED=$STEERING_DETECTED" >> "$GITHUB_OUTPUT" + + - name: 'Analyze PR Content (Guidance)' + if: "steps.detect.outputs.STEERING_DETECTED == 'true'" + id: 'analysis' + env: + GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + run: | + # Check for behavioral eval changes + EVAL_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD | grep "^evals/" || true) + if [ -z "$EVAL_CHANGES" ]; then + echo "MISSING_EVALS=true" >> "$GITHUB_OUTPUT" + fi + + # Check if user is a maintainer + USER_PERMISSION=$(gh api repos/${{ github.repository }}/collaborators/${{ github.actor }}/permission --jq '.permission') + if [[ "$USER_PERMISSION" == "admin" || "$USER_PERMISSION" == "write" ]]; then + echo "IS_MAINTAINER=true" >> "$GITHUB_OUTPUT" + fi + + - name: 'Execute Regression Check' + if: "steps.detect.outputs.SHOULD_RUN == 'true'" + env: + GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}' + GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + MODEL_LIST: '${{ env.MODEL_LIST }}' + run: | + # Run the regression check loop. The script saves the report to a file. 
+          node scripts/run_eval_regression.js
+
+          # Use the generated report file if it exists
+          if [[ -f eval_regression_report.md ]]; then
+            echo "REPORT_FILE=eval_regression_report.md" >> "$GITHUB_ENV"
+          fi
+
+      - name: 'Post or Update PR Comment'
+        if: "always() && steps.detect.outputs.STEERING_DETECTED == 'true'"
+        env:
+          GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
+        run: |
+          # 1. Build the full comment body.
+          # The hidden HTML marker on the first line makes this comment
+          # discoverable on later runs, so we update it in place instead of
+          # posting a duplicate on every push.
+          {
+            echo "<!-- eval-guidance-bot -->"
+            if [[ -f eval_regression_report.md ]]; then
+              cat eval_regression_report.md
+              echo ""
+            fi
+            echo "### 🧠 Model Steering Guidance"
+            echo ""
+            echo "This PR modifies files that affect the model's behavior (prompts, tools, or instructions)."
+            echo ""
+
+            if [[ "${{ steps.analysis.outputs.MISSING_EVALS }}" == "true" ]]; then
+              echo "- ⚠️ **Consider adding Evals:** No behavioral evaluations (\`evals/*.eval.ts\`) were added or updated in this PR. Consider [adding a test case](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#creating-an-evaluation) to verify the new behavior and prevent regressions."
+            fi
+
+            if [[ "${{ steps.analysis.outputs.IS_MAINTAINER }}" == "true" ]]; then
+              echo "- 🚀 **Maintainer Reminder:** Please ensure that these changes do not regress results on benchmark evals before merging."
+            fi
+
+            echo ""
+            echo "---"
+            echo "*This is an automated guidance message triggered by steering logic signatures.*"
+            echo ""
+          } > full_comment.md
+
+          # 2. Find if a comment with our unique tag already exists.
+          # FIX: the previous jq filter used contains("") which matches every
+          # comment on the PR, so an arbitrary unrelated comment could be
+          # overwritten. We now search for the hidden marker emitted above.
+          # We extract the numeric ID from the URL to ensure compatibility with the REST API
+          COMMENT_ID=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("<!-- eval-guidance-bot -->")) | .url' | grep -oE "[0-9]+$" | head -n 1)
+
+          # 3. Update or Create the comment
+          if [ -n "$COMMENT_ID" ]; then
+            echo "Updating existing comment $COMMENT_ID via API..."
+            gh api -X PATCH "repos/${{ github.repository }}/issues/comments/$COMMENT_ID" -F body=@full_comment.md
+          else
+            echo "Creating new PR comment..."
+ gh pr comment ${{ github.event.pull_request.number }} --body-file full_comment.md + fi diff --git a/evals/README.md b/evals/README.md index 9e3697a6b8..aebfe38ebc 100644 --- a/evals/README.md +++ b/evals/README.md @@ -212,6 +212,56 @@ The nightly workflow executes the full evaluation suite multiple times (currently 3 attempts) to account for non-determinism. These results are aggregated into a **Nightly Summary** attached to the workflow run. +## Regression Check Scripts + +The project includes several scripts to automate high-signal regression checking +in Pull Requests. These can also be run locally for debugging. + +- **`scripts/get_trustworthy_evals.js`**: Analyzes nightly history to identify + stable tests (80%+ aggregate pass rate). +- **`scripts/run_regression_check.js`**: Runs a specific set of tests using the + "Best-of-4" logic and "Dynamic Baseline Verification". +- **`scripts/run_eval_regression.js`**: The main orchestrator that loops through + models and generates the final PR report. + +### Running Regression Checks Locally + +You can simulate the PR regression check locally to verify your changes before +pushing: + +```bash +# Run the full regression loop for a specific model +MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js +``` + +To debug a specific failing test with the same logic used in CI: + +```bash +# 1. Get the Vitest pattern for trustworthy tests +OUTPUT=$(node scripts/get_trustworthy_evals.js "gemini-3-flash-preview") + +# 2. Run the regression logic for those tests +node scripts/run_regression_check.js "gemini-3-flash-preview" "$OUTPUT" +``` + +### The Regression Quality Bar + +Because LLMs are non-deterministic, the PR regression check uses a high-signal +probabilistic approach rather than a 100% pass requirement: + +1. **Trustworthiness (60/80 Filter):** Only tests with a proven track record + are run. 
A test must score at least **60% (2/3)** every single night and + maintain an **80% aggregate** pass rate over the last 6 days. +2. **The 50% Pass Rule:** In a PR, a test is considered a **Pass** if the model + correctly performs the behavior at least half the time (**2 successes** out + of up to 4 attempts). +3. **Dynamic Baseline Verification:** If a test fails in a PR (e.g., 0/3), the + system automatically checks the `main` branch. If it fails there too, it is + marked as **Pre-existing** and cleared for the PR, ensuring you are only + blocked by regressions caused by your specific changes. + +## Fixing Evaluations + #### How to interpret the report: - **Pass Rate (%)**: Each cell represents the percentage of successful runs for diff --git a/scripts/compare_evals.js b/scripts/compare_evals.js new file mode 100644 index 0000000000..a5ea15361f --- /dev/null +++ b/scripts/compare_evals.js @@ -0,0 +1,142 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @fileoverview Compares PR evaluation results against historical nightly baselines. + * + * This script generates a Markdown report for use in PR comments. It aligns with + * the 6-day lookback logic to show accurate historical pass rates and filters out + * pre-existing or noisy failures to ensure only actionable regressions are reported. + */ + +import fs from 'node:fs'; +import path from 'node:path'; +import { fetchNightlyHistory } from './eval_utils.js'; + +/** + * Main execution logic. 
+ */ +function main() { + const prReportPath = 'evals/logs/pr_final_report.json'; + const targetModel = process.argv[2]; + + if (!targetModel) { + console.error('โŒ Error: No target model specified.'); + process.exit(1); + } + + if (!fs.existsSync(prReportPath)) { + console.error('No PR report found.'); + return; + } + + const prReport = JSON.parse(fs.readFileSync(prReportPath, 'utf-8')); + const history = fetchNightlyHistory(6); // Use same 6-day lookback + const latestNightly = aggregateHistoricalStats(history, targetModel); + + const regressions = []; + const passes = []; + + for (const [testName, pr] of Object.entries(prReport.results)) { + const prRate = pr.passed / pr.total; + if (pr.status === 'regression' || (prRate <= 0.34 && !pr.status)) { + // Use relative path from workspace root + const relativeFile = pr.file + ? path.relative(process.cwd(), pr.file) + : 'evals/'; + + regressions.push({ + name: testName, + file: relativeFile, + nightly: latestNightly[testName] + ? (latestNightly[testName].passRate * 100).toFixed(0) + '%' + : 'N/A', + pr: (prRate * 100).toFixed(0) + '%', + }); + } else { + passes.push(testName); + } + } + + if (regressions.length > 0) { + let markdown = '### ๐Ÿšจ Action Required: Eval Regressions Detected\n\n'; + markdown += `**Model:** \`${targetModel}\`\n\n`; + markdown += + 'The following trustworthy evaluations passed on **`main`** and in **recent Nightly runs**, but failed in this PR. These regressions must be addressed before merging.\n\n'; + + markdown += '| Test Name | Nightly | PR Result | Status |\n'; + markdown += '| :--- | :---: | :---: | :--- |\n'; + for (const r of regressions) { + markdown += `| ${r.name} | ${r.nightly} | ${r.pr} | โŒ **Regression** |\n`; + } + markdown += `\n*The check passed or was cleared for ${passes.length} other trustworthy evaluations.*\n\n`; + + markdown += '
\n'; + markdown += + '๐Ÿ› ๏ธ Troubleshooting & Fix Instructions\n\n'; + + for (let i = 0; i < regressions.length; i++) { + const r = regressions[i]; + if (regressions.length > 1) { + markdown += `### Failure ${i + 1}: ${r.name}\n\n`; + } + + markdown += '#### 1. Ask Gemini CLI to fix it (Recommended)\n'; + markdown += 'Copy and paste this prompt to the agent:\n'; + markdown += '```text\n'; + markdown += `The eval "${r.name}" in ${r.file} is failing. Investigate and fix it using the behavioral-evals skill.\n`; + markdown += '```\n\n'; + + markdown += '#### 2. Reproduce Locally\n'; + markdown += 'Run the following command to see the failure trajectory:\n'; + markdown += '```bash\n'; + const pattern = r.name.replace(/'/g, '.'); + markdown += `GEMINI_MODEL=${targetModel} npm run test:all_evals -- ${r.file} --testNamePattern="${pattern}"\n`; + + markdown += '```\n\n'; + + if (i < regressions.length - 1) { + markdown += '---\n\n'; + } + } + + markdown += '#### 3. Manual Fix\n'; + markdown += + 'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n'; + markdown += '
\n'; + + process.stdout.write(markdown); + } else if (passes.length > 0) { + // Success State + process.stdout.write( + `โœ… **${passes.length}** tests passed successfully on **${targetModel}**.\n`, + ); + } +} + +/** + * Aggregates stats from history for a specific model. + */ +function aggregateHistoricalStats(history, model) { + const stats = {}; + for (const item of history) { + const modelStats = item.stats[model]; + if (!modelStats) continue; + + for (const [testName, stat] of Object.entries(modelStats)) { + if (!stats[testName]) stats[testName] = { passed: 0, total: 0 }; + stats[testName].passed += stat.passed; + stats[testName].total += stat.total; + } + } + + for (const name in stats) { + stats[name].passRate = stats[name].passed / stats[name].total; + } + return stats; +} + +main(); diff --git a/scripts/eval_utils.js b/scripts/eval_utils.js new file mode 100644 index 0000000000..6d13f11891 --- /dev/null +++ b/scripts/eval_utils.js @@ -0,0 +1,136 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import fs from 'node:fs'; +import path from 'node:path'; +import { execSync } from 'node:child_process'; +import os from 'node:os'; + +/** + * Finds all report.json files recursively in a directory. + */ +export function findReports(dir) { + const reports = []; + if (!fs.existsSync(dir)) return reports; + + const files = fs.readdirSync(dir); + for (const file of files) { + const fullPath = path.join(dir, file); + const stat = fs.statSync(fullPath); + if (stat.isDirectory()) { + reports.push(...findReports(fullPath)); + } else if (file === 'report.json') { + reports.push(fullPath); + } + } + return reports; +} + +/** + * Extracts the model name from the artifact path. 
+ */
+export function getModelFromPath(reportPath) {
+  const parts = reportPath.split(path.sep);
+  // Look for the directory that follows the 'eval-logs-' pattern
+  const artifactDir = parts.find((p) => p.startsWith('eval-logs-'));
+  if (!artifactDir) return 'unknown';
+
+  // Artifact directories are named "eval-logs-<model>-<runNumber>"; the
+  // greedy group keeps hyphenated model names (e.g. gemini-3-flash-preview)
+  // intact because the trailing \d+ anchors on the run number.
+  const match = artifactDir.match(/^eval-logs-(.+)-(\d+)$/);
+  return match ? match[1] : 'unknown';
+}
+
+/**
+ * Escapes special characters in a string for use in a regular expression.
+ * @param {string} string - Raw text (e.g. a test title).
+ * @returns {string} The text with all regex metacharacters backslash-escaped.
+ */
+export function escapeRegex(string) {
+  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+}
+
+/**
+ * Aggregates stats from a list of report.json files.
+ *
+ * Reports that fail to read or parse are logged and skipped; partial results
+ * from the remaining reports are still returned.
+ * @param {string[]} reports - Paths to Vitest JSON report files.
+ * @returns {Record<string, Record<string, {passed: number, total: number, file: string}>>} statsByModel
+ */
+export function getStatsFromReports(reports) {
+  const statsByModel = {};
+
+  for (const reportPath of reports) {
+    try {
+      const model = getModelFromPath(reportPath);
+      if (!statsByModel[model]) {
+        statsByModel[model] = {};
+      }
+      const testStats = statsByModel[model];
+
+      const content = fs.readFileSync(reportPath, 'utf-8');
+      const json = JSON.parse(content);
+
+      // Vitest JSON reporter shape: testResults[] (per file) each holding
+      // assertionResults[] (per test title).
+      for (const testResult of json.testResults) {
+        const filePath = testResult.name;
+        for (const assertion of testResult.assertionResults) {
+          const name = assertion.title;
+          if (!testStats[name]) {
+            testStats[name] = { passed: 0, total: 0, file: filePath };
+          }
+          testStats[name].total++;
+          if (assertion.status === 'passed') {
+            testStats[name].passed++;
+          }
+        }
+      }
+    } catch (error) {
+      console.error(`Error processing report at ${reportPath}:`, error.message);
+    }
+  }
+  return statsByModel;
+}
+
+/**
+ * Fetches historical nightly data using the GitHub CLI.
+ * @returns {Array<{runId: string, stats: Record}>} history + */ +export function fetchNightlyHistory(lookbackCount) { + const history = []; + try { + const cmd = `gh run list --workflow evals-nightly.yml --branch main --limit ${ + lookbackCount + 2 + } --json databaseId,status`; + const runsJson = execSync(cmd, { encoding: 'utf-8' }); + let runs = JSON.parse(runsJson); + + // Filter for completed runs and take the top N + runs = runs.filter((r) => r.status === 'completed').slice(0, lookbackCount); + + for (const run of runs) { + const tmpDir = fs.mkdtempSync( + path.join(os.tmpdir(), `gemini-evals-hist-${run.databaseId}-`), + ); + try { + execSync( + `gh run download ${run.databaseId} -p "eval-logs-*" -D "${tmpDir}"`, + { stdio: 'ignore' }, + ); + + const runReports = findReports(tmpDir); + if (runReports.length > 0) { + history.push({ + runId: run.databaseId, + stats: getStatsFromReports(runReports), + }); + } + } catch (error) { + console.error( + `Failed to process artifacts for run ${run.databaseId}:`, + error.message, + ); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + } + } catch (error) { + console.error('Failed to fetch history:', error.message); + } + return history; +} diff --git a/scripts/get_trustworthy_evals.js b/scripts/get_trustworthy_evals.js new file mode 100644 index 0000000000..c87d148e7a --- /dev/null +++ b/scripts/get_trustworthy_evals.js @@ -0,0 +1,125 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @fileoverview Identifies "Trustworthy" behavioral evaluations from nightly history. + * + * This script analyzes the last 6 days of nightly runs to find tests that meet + * strict stability criteria (80% aggregate pass rate and 60% daily floor). + * It outputs a list of files and a Vitest pattern used by the PR regression check + * to ensure high-signal validation and minimize noise. 
+ */
+
+import { fetchNightlyHistory, escapeRegex } from './eval_utils.js';
+
+// Tuning knobs for the trustworthiness filter (documented in evals/README.md).
+const LOOKBACK_COUNT = 6;
+const MIN_VALID_RUNS = 5; // At least 5 out of 6 must be available
+const PASS_RATE_THRESHOLD = 0.6; // Daily floor (e.g., 2/3)
+const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18)
+
+/**
+ * Main execution logic.
+ *
+ * Reads the target model from argv[2], aggregates per-test nightly stats,
+ * and classifies each test as trustworthy, volatile, or too new. All
+ * human-readable progress goes to stderr; the machine-readable
+ * "FILES --test-pattern PATTERN" line goes to stdout (only when piped).
+ */
+function main() {
+  const targetModel = process.argv[2];
+  if (!targetModel) {
+    console.error('❌ Error: No target model specified.');
+    process.exit(1);
+  }
+  console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
+
+  const history = fetchNightlyHistory(LOOKBACK_COUNT);
+  if (history.length === 0) {
+    console.error('❌ No historical data found.');
+    process.exit(1);
+  }
+
+  // Aggregate results for the target model across all history
+  const testHistories = {}; // { [testName]: { totalPassed: 0, totalRuns: 0, dailyRates: [], file: string } }
+
+  for (const item of history) {
+    const modelStats = item.stats[targetModel];
+    if (!modelStats) continue;
+
+    for (const [testName, stat] of Object.entries(modelStats)) {
+      if (!testHistories[testName]) {
+        testHistories[testName] = {
+          totalPassed: 0,
+          totalRuns: 0,
+          dailyRates: [],
+          file: stat.file,
+        };
+      }
+      testHistories[testName].totalPassed += stat.passed;
+      testHistories[testName].totalRuns += stat.total;
+      testHistories[testName].dailyRates.push(stat.passed / stat.total);
+    }
+  }
+
+  const trustworthyTests = [];
+  const trustworthyFiles = new Set();
+  const volatileTests = [];
+  const newTests = [];
+
+  for (const [testName, info] of Object.entries(testHistories)) {
+    const dailyRates = info.dailyRates;
+    const aggregateRate = info.totalPassed / info.totalRuns;
+
+    // 1. Minimum data points required
+    if (dailyRates.length < MIN_VALID_RUNS) {
+      newTests.push(testName);
+      continue;
+    }
+
+    // 2. Trustworthy Criterion:
+    //    - Every single day must be above the floor (e.g. > 60%)
+    //    - The overall aggregate must be high-signal (e.g. > 80%)
+    const isDailyStable = dailyRates.every(
+      (rate) => rate > PASS_RATE_THRESHOLD,
+    );
+    const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
+
+    if (isDailyStable && isAggregateHighSignal) {
+      trustworthyTests.push(testName);
+      if (info.file) {
+        // Normalize absolute artifact paths down to the repo-relative eval file.
+        const match = info.file.match(/evals\/.*\.eval\.ts/);
+        if (match) {
+          trustworthyFiles.add(match[0]);
+        }
+      }
+    } else {
+      volatileTests.push(testName);
+    }
+  }
+
+  console.error(
+    `✅ Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`,
+  );
+  trustworthyTests.sort().forEach((name) => console.error(`  - ${name}`));
+  console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
+  console.error(
+    `🆕 Ignored ${newTests.length} tests with insufficient history.`,
+  );
+
+  // Output the list of names as a regex-friendly pattern for vitest -t
+  const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|');
+
+  // Also output unique file paths as a space-separated string
+  const files = Array.from(trustworthyFiles).join(' ');
+
+  // Print the combined output to stdout for use in shell scripts (only if piped/CI)
+  if (!process.stdout.isTTY) {
+    // Format: FILE_LIST --test-pattern TEST_PATTERN
+    // This allows the workflow to easily use it
+    process.stdout.write(`${files} --test-pattern ${pattern || ''}\n`);
+  } else {
+    console.error(
+      '\n💡 Note: Raw regex pattern and file list are hidden in interactive terminal. It will be printed when piped or in CI.',
+    );
+  }
+}
+
+main();
diff --git a/scripts/run_eval_regression.js b/scripts/run_eval_regression.js
new file mode 100644
index 0000000000..7a64a6a2f9
--- /dev/null
+++ b/scripts/run_eval_regression.js
@@ -0,0 +1,107 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Orchestrates the PR evaluation process across multiple models.
+ *
+ * This script loops through a provided list of models, identifies trustworthy
+ * tests for each, executes the frugal regression check, and collects results
+ * into a single unified report. It always exits with code 0; confirmed
+ * regressions are surfaced through eval_regression_report.md (posted as a PR
+ * comment by the workflow) rather than through a failing exit code.
+ */
+
+import { execSync } from 'node:child_process';
+import fs from 'node:fs';
+
+/**
+ * Main execution logic.
+ *
+ * Models come from the comma-separated MODEL_LIST env var. Per model:
+ * identify trustworthy tests, run the regression check, then render the
+ * Markdown report. Per-model failures are caught so one broken model does
+ * not abort the others.
+ */
+async function main() {
+  const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview';
+  const models = modelList.split(',').map((m) => m.trim());
+
+  let combinedReport = '';
+  let hasRegression = false;
+
+  console.log(
+    `🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
+  );
+
+  for (const model of models) {
+    console.log(`\n--- Processing Model: ${model} ---`);
+
+    try {
+      // 1. Identify Trustworthy Evals
+      console.log(`🔍 Identifying trustworthy tests for ${model}...`);
+      const output = execSync(
+        `node scripts/get_trustworthy_evals.js "${model}"`,
+        {
+          encoding: 'utf-8',
+          stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr
+        },
+      ).trim();
+
+      if (!output) {
+        console.log(`ℹ️ No trustworthy tests found for ${model}. Skipping.`);
+        continue;
+      }
+
+      // 2. Run Frugal Regression Check
+      console.log(`🧪 Running regression check for ${model}...`);
+      execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, {
+        stdio: 'inherit',
+      });
+
+      // 3. Generate Report
+      console.log(`📊 Generating report for ${model}...`);
+      const report = execSync(`node scripts/compare_evals.js "${model}"`, {
+        encoding: 'utf-8',
+        stdio: ['inherit', 'pipe', 'inherit'],
+      }).trim();
+
+      if (report) {
+        if (combinedReport) {
+          combinedReport += '\n\n---\n\n';
+        }
+        combinedReport += report;
+
+        // 4. Check for Regressions
+        // If the report contains the "Action Required" marker, it means a confirmed regression was found.
+        if (report.includes('Action Required')) {
+          hasRegression = true;
+        }
+      }
+    } catch (error) {
+      console.error(`❌ Error processing model ${model}:`, error.message);
+      // We flag a failure if any model encountered a critical error
+      hasRegression = true;
+    }
+  }
+
+  // Always save the combined report to a file so the workflow can capture it cleanly
+  if (combinedReport) {
+    fs.writeFileSync('eval_regression_report.md', combinedReport);
+    console.log(
+      '\n📊 Final Markdown report saved to eval_regression_report.md',
+    );
+  }
+
+  // Log status for CI visibility, but don't exit with error
+  if (hasRegression) {
+    console.error(
+      '\n⚠️ Confirmed regressions detected across one or more models. See PR comment for details.',
+    );
+  } else {
+    console.log('\n✅ All evaluations passed successfully (or were cleared).');
+  }
+
+  process.exit(0);
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/scripts/run_regression_check.js b/scripts/run_regression_check.js
new file mode 100644
index 0000000000..1250671c30
--- /dev/null
+++ b/scripts/run_regression_check.js
@@ -0,0 +1,305 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Executes a high-signal regression check for behavioral evaluations.
+ *
+ * This script runs a targeted set of stable tests in an optimistic first pass.
+ * If failures occur, it employs a "Best-of-4" retry logic to handle natural flakiness.
+ * For confirmed failures (0/3), it performs Dynamic Baseline Verification by
+ * checking the failure against the 'main' branch to distinguish between
+ * model drift and PR-introduced regressions.
+ */
+
+import { execSync } from 'node:child_process';
+import fs from 'node:fs';
+import path from 'node:path';
+import { quote } from 'shell-quote';
+import { escapeRegex } from './eval_utils.js';
+
+/**
+ * Runs a set of tests using Vitest and returns the results.
+ */ +function runTests(files, pattern, model) { + const outputDir = path.resolve( + process.cwd(), + `evals/logs/pr-run-${Date.now()}`, + ); + fs.mkdirSync(outputDir, { recursive: true }); + + const filesToRun = files || 'evals/'; + console.log( + `๐Ÿš€ Running tests in ${filesToRun} with pattern: ${pattern?.slice(0, 100)}...`, + ); + + try { + const cmd = `npx vitest run --config evals/vitest.config.ts ${filesToRun} -t "${pattern}" --reporter=json --reporter=default --outputFile="${path.join(outputDir, 'report.json')}"`; + execSync(cmd, { + stdio: 'inherit', + env: { ...process.env, RUN_EVALS: '1', GEMINI_MODEL: model }, + }); + } catch { + // Vitest returns a non-zero exit code when tests fail. This is expected. + // We continue execution and handle the failures by parsing the JSON report. + } + + const reportPath = path.join(outputDir, 'report.json'); + return fs.existsSync(reportPath) + ? JSON.parse(fs.readFileSync(reportPath, 'utf-8')) + : null; +} + +/** + * Helper to find a specific assertion by name across all test files. + */ +function findAssertion(report, testName) { + if (!report?.testResults) return null; + for (const fileResult of report.testResults) { + const assertion = fileResult.assertionResults.find( + (a) => a.title === testName, + ); + if (assertion) return assertion; + } + return null; +} + +/** + * Parses command line arguments to identify model, files, and test pattern. 
+ */
+function parseArgs() {
+  const modelArg = process.argv[2];
+  const remainingArgs = process.argv.slice(3);
+  const fullArgsString = remainingArgs.join(' ');
+  const testPatternIndex = remainingArgs.indexOf('--test-pattern');
+
+  // CI mode A: model, files and pattern arrive as separate argv entries,
+  // with a literal '--test-pattern' separator argument.
+  if (testPatternIndex !== -1) {
+    return {
+      model: modelArg,
+      files: remainingArgs.slice(0, testPatternIndex).join(' '),
+      pattern: remainingArgs.slice(testPatternIndex + 1).join(' '),
+    };
+  }
+
+  // CI mode B: the whole "FILES --test-pattern PATTERN" string arrives as a
+  // single quoted argument (this is how run_eval_regression.js invokes us).
+  if (fullArgsString.includes('--test-pattern')) {
+    const parts = fullArgsString.split('--test-pattern');
+    return {
+      model: modelArg,
+      files: parts[0].trim(),
+      pattern: parts[1].trim(),
+    };
+  }
+
+  // Fallback for manual mode: Pattern Model
+  // NOTE(review): argument order flips here — argv[2] is the pattern and
+  // argv[3] the model, matching the README's debugging examples.
+  const manualPattern = process.argv[2];
+  const manualModel = process.argv[3];
+  if (!manualModel) {
+    console.error('❌ Error: No target model specified.');
+    process.exit(1);
+  }
+
+  // Narrow the file set with grep so manual runs stay fast.
+  let manualFiles = 'evals/';
+  try {
+    const grepResult = execSync(
+      `grep -l ${quote([manualPattern])} evals/*.eval.ts`,
+      { encoding: 'utf-8' },
+    );
+    manualFiles = grepResult.split('\n').filter(Boolean).join(' ');
+  } catch {
+    // Grep returns exit code 1 if no files match the pattern.
+    // In this case, we fall back to scanning all files in the evals/ directory.
+  }
+
+  return {
+    model: manualModel,
+    files: manualFiles,
+    pattern: manualPattern,
+    isManual: true,
+  };
+}
+
+/**
+ * Runs the targeted retry logic (Best-of-4) for a failing test.
+ */ +async function runRetries(testName, results, files, model) { + console.log(`\nRe-evaluating: ${testName}`); + + while ( + results[testName].passed < 2 && + results[testName].total - results[testName].passed < 3 && + results[testName].total < 4 + ) { + const attemptNum = results[testName].total + 1; + console.log(` Running attempt ${attemptNum}...`); + + const retry = runTests(files, escapeRegex(testName), model); + const retryAssertion = findAssertion(retry, testName); + + results[testName].total++; + if (retryAssertion?.status === 'passed') { + results[testName].passed++; + console.log( + ` โœ… Attempt ${attemptNum} passed. Score: ${results[testName].passed}/${results[testName].total}`, + ); + } else { + console.log( + ` โŒ Attempt ${attemptNum} failed (${retryAssertion?.status || 'unknown'}). Score: ${results[testName].passed}/${results[testName].total}`, + ); + } + + if (results[testName].passed >= 2) { + console.log( + ` โœ… Test cleared as Noisy Pass (${results[testName].passed}/${results[testName].total})`, + ); + } else if (results[testName].total - results[testName].passed >= 3) { + await verifyBaseline(testName, results, files, model); + } + } +} + +/** + * Verifies a potential regression against the 'main' branch. + */ +async function verifyBaseline(testName, results, files, model) { + console.log('\n--- Step 3: Dynamic Baseline Verification ---'); + console.log( + `โš ๏ธ Potential regression detected. 
Verifying baseline on 'main'...`, + ); + + try { + execSync('git stash push -m "eval-regression-check-stash"', { + stdio: 'inherit', + }); + const hasStash = execSync('git stash list') + .toString() + .includes('eval-regression-check-stash'); + execSync('git checkout main', { stdio: 'inherit' }); + + console.log( + `\n--- Running Baseline Verification on 'main' (Best-of-3) ---`, + ); + let baselinePasses = 0; + let baselineTotal = 0; + + while (baselinePasses === 0 && baselineTotal < 3) { + baselineTotal++; + console.log(` Baseline Attempt ${baselineTotal}...`); + const baselineRun = runTests(files, escapeRegex(testName), model); + if (findAssertion(baselineRun, testName)?.status === 'passed') { + baselinePasses++; + console.log(` โœ… Baseline Attempt ${baselineTotal} passed.`); + } else { + console.log(` โŒ Baseline Attempt ${baselineTotal} failed.`); + } + } + + execSync('git checkout -', { stdio: 'inherit' }); + if (hasStash) execSync('git stash pop', { stdio: 'inherit' }); + + if (baselinePasses === 0) { + console.log( + ` โ„น๏ธ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`, + ); + results[testName].status = 'pre-existing'; + results[testName].passed = results[testName].total; // Clear for report + } else { + console.log( + ` โŒ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`, + ); + results[testName].status = 'regression'; + } + } catch (error) { + console.error(` โŒ Failed to verify baseline: ${error.message}`); + + // Best-effort cleanup: try to return to the original branch. + try { + execSync('git checkout -', { stdio: 'ignore' }); + } catch { + // Ignore checkout errors during cleanup to avoid hiding the original error. + } + } +} + +/** + * Processes initial results and orchestrates retries/baseline checks. 
+ */ +async function processResults(firstPass, pattern, model, files) { + if (!firstPass) return false; + + const results = {}; + const failingTests = []; + let totalProcessed = 0; + + for (const fileResult of firstPass.testResults) { + for (const assertion of fileResult.assertionResults) { + if (assertion.status !== 'passed' && assertion.status !== 'failed') { + continue; + } + + const name = assertion.title; + results[name] = { + passed: assertion.status === 'passed' ? 1 : 0, + total: 1, + file: fileResult.name, + }; + if (assertion.status === 'failed') failingTests.push(name); + totalProcessed++; + } + } + + if (totalProcessed === 0) { + console.error('โŒ Error: No matching tests were found or executed.'); + return false; + } + + if (failingTests.length === 0) { + console.log('โœ… All trustworthy tests passed on the first try!'); + } else { + console.log('\n--- Step 2: Best-of-4 Retries ---'); + console.log( + `โš ๏ธ ${failingTests.length} tests failed the optimistic run. Starting retries...`, + ); + for (const testName of failingTests) { + await runRetries(testName, results, files, model); + } + } + + saveResults(results); + return true; +} + +function saveResults(results) { + const finalReport = { timestamp: new Date().toISOString(), results }; + fs.writeFileSync( + 'evals/logs/pr_final_report.json', + JSON.stringify(finalReport, null, 2), + ); + console.log('\nFinal report saved to evals/logs/pr_final_report.json'); +} + +async function main() { + const { model, files, pattern, isManual } = parseArgs(); + + if (isManual) { + const firstPass = runTests(files, pattern, model); + const success = await processResults(firstPass, pattern, model, files); + process.exit(success ? 
0 : 1); + } + + if (!pattern) { + console.log('No trustworthy tests to run.'); + process.exit(0); + } + + console.log('\n--- Step 1: Optimistic Run (N=1) ---'); + const firstPass = runTests(files, pattern, model); + const success = await processResults(firstPass, pattern, model, files); + process.exit(success ? 0 : 1); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +});