diff --git a/.github/workflows/eval-guidance.yml b/.github/workflows/eval-guidance.yml
deleted file mode 100644
index e1f1ab3168..0000000000
--- a/.github/workflows/eval-guidance.yml
+++ /dev/null
@@ -1,69 +0,0 @@
-name: 'Evals: PR Guidance'
-
-on:
- pull_request:
- paths:
- - 'packages/core/src/**/*.ts'
- - '!**/*.test.ts'
- - '!**/*.test.tsx'
-
-permissions:
- pull-requests: 'write'
- contents: 'read'
-
-jobs:
- provide-guidance:
- name: 'Model Steering Guidance'
- runs-on: 'ubuntu-latest'
- if: "github.repository == 'google-gemini/gemini-cli'"
- steps:
- - name: 'Checkout'
- uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v4
- with:
- fetch-depth: 0
-
- - name: 'Set up Node.js'
- uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4.4.0
- with:
- node-version-file: '.nvmrc'
- cache: 'npm'
-
- - name: 'Detect Steering Changes'
- id: 'detect'
- run: |
- STEERING_DETECTED=$(node scripts/changed_prompt.js --steering-only)
- echo "STEERING_DETECTED=$STEERING_DETECTED" >> "$GITHUB_OUTPUT"
-
- - name: 'Analyze PR Content'
- if: "steps.detect.outputs.STEERING_DETECTED == 'true'"
- id: 'analysis'
- env:
- GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
- run: |
- # Check for behavioral eval changes
- EVAL_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD | grep "^evals/" || true)
- if [ -z "$EVAL_CHANGES" ]; then
- echo "MISSING_EVALS=true" >> "$GITHUB_OUTPUT"
- fi
-
- # Check if user is a maintainer (has write/admin access)
- USER_PERMISSION=$(gh api repos/${{ github.repository }}/collaborators/${{ github.actor }}/permission --jq '.permission')
- if [[ "$USER_PERMISSION" == "admin" || "$USER_PERMISSION" == "write" ]]; then
- echo "IS_MAINTAINER=true" >> "$GITHUB_OUTPUT"
- fi
-
- - name: 'Post Guidance Comment'
- if: "steps.detect.outputs.STEERING_DETECTED == 'true'"
- uses: 'thollander/actions-comment-pull-request@65f9e5c9a1f2cd378bd74b2e057c9736982a8e74' # ratchet:thollander/actions-comment-pull-request@v3
- with:
- comment-tag: 'eval-guidance-bot'
- message: |
-          ### 🧠 Model Steering Guidance
-
- This PR modifies files that affect the model's behavior (prompts, tools, or instructions).
-
-          ${{ steps.analysis.outputs.MISSING_EVALS == 'true' && '- ⚠️ **Consider adding Evals:** No behavioral evaluations (`evals/*.eval.ts`) were added or updated in this PR. Consider adding a test case to verify the new behavior and prevent regressions.' || '' }}
-          ${{ steps.analysis.outputs.IS_MAINTAINER == 'true' && '- 📊 **Maintainer Reminder:** Please ensure that these changes do not regress results on benchmark evals before merging.' || '' }}
-
- ---
- *This is an automated guidance message triggered by steering logic signatures.*
diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml
new file mode 100644
index 0000000000..e0f839e667
--- /dev/null
+++ b/.github/workflows/eval-pr.yml
@@ -0,0 +1,137 @@
+name: 'Evals: PR Evaluation & Regression'
+
+on:
+ pull_request:
+ types: ['opened', 'synchronize', 'reopened', 'ready_for_review']
+ paths:
+ - 'packages/core/src/prompts/**'
+ - 'packages/core/src/tools/**'
+ - 'packages/core/src/agents/**'
+ - 'evals/**'
+ - '!**/*.test.ts'
+ - '!**/*.test.tsx'
+ workflow_dispatch:
+
+# Prevents multiple runs for the same PR simultaneously (saves tokens)
+concurrency:
+ group: '${{ github.workflow }}-${{ github.head_ref || github.ref }}'
+ cancel-in-progress: true
+
+permissions:
+ pull-requests: 'write'
+ contents: 'read'
+ actions: 'read'
+
+jobs:
+ pr-evaluation:
+ name: 'Evaluate Steering & Regressions'
+ runs-on: 'gemini-cli-ubuntu-16-core'
+ if: "github.repository == 'google-gemini/gemini-cli' && (github.event_name != 'pull_request' || (github.event.pull_request.draft == false && github.event.pull_request.head.repo.full_name == github.repository))"
+ # External contributors' PRs will wait for approval in this environment
+ environment: |-
+ ${{ (github.event.pull_request.head.repo.full_name == github.repository) && 'internal' || 'external-evals' }}
+ env:
+ # CENTRALIZED MODEL LIST
+ MODEL_LIST: 'gemini-3-flash-preview'
+
+ steps:
+ - name: 'Checkout'
+ uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
+ with:
+ fetch-depth: 0
+
+ - name: 'Set up Node.js'
+ uses: 'actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020' # ratchet:actions/setup-node@v4.4.0
+ with:
+ node-version-file: '.nvmrc'
+ cache: 'npm'
+
+ - name: 'Install dependencies'
+ run: 'npm ci'
+
+ - name: 'Build project'
+ run: 'npm run build'
+
+ - name: 'Detect Steering Changes'
+ id: 'detect'
+ run: |
+ SHOULD_RUN=$(node scripts/changed_prompt.js)
+ STEERING_DETECTED=$(node scripts/changed_prompt.js --steering-only)
+ echo "SHOULD_RUN=$SHOULD_RUN" >> "$GITHUB_OUTPUT"
+ echo "STEERING_DETECTED=$STEERING_DETECTED" >> "$GITHUB_OUTPUT"
+
+ - name: 'Analyze PR Content (Guidance)'
+ if: "steps.detect.outputs.STEERING_DETECTED == 'true'"
+ id: 'analysis'
+ env:
+ GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
+ run: |
+ # Check for behavioral eval changes
+ EVAL_CHANGES=$(git diff --name-only origin/${{ github.base_ref }}...HEAD | grep "^evals/" || true)
+ if [ -z "$EVAL_CHANGES" ]; then
+ echo "MISSING_EVALS=true" >> "$GITHUB_OUTPUT"
+ fi
+
+ # Check if user is a maintainer
+ USER_PERMISSION=$(gh api repos/${{ github.repository }}/collaborators/${{ github.actor }}/permission --jq '.permission')
+ if [[ "$USER_PERMISSION" == "admin" || "$USER_PERMISSION" == "write" ]]; then
+ echo "IS_MAINTAINER=true" >> "$GITHUB_OUTPUT"
+ fi
+
+ - name: 'Execute Regression Check'
+ if: "steps.detect.outputs.SHOULD_RUN == 'true'"
+ env:
+ GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
+ GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
+ MODEL_LIST: '${{ env.MODEL_LIST }}'
+ run: |
+ # Run the regression check loop. The script saves the report to a file.
+ node scripts/run_eval_regression.js
+
+ # Use the generated report file if it exists
+ if [[ -f eval_regression_report.md ]]; then
+ echo "REPORT_FILE=eval_regression_report.md" >> "$GITHUB_ENV"
+ fi
+
+ - name: 'Post or Update PR Comment'
+ if: "always() && steps.detect.outputs.STEERING_DETECTED == 'true'"
+ env:
+ GH_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
+ run: |
+ # 1. Build the full comment body
+ {
+ if [[ -f eval_regression_report.md ]]; then
+ cat eval_regression_report.md
+ echo ""
+ fi
+          echo "### 🧠 Model Steering Guidance"
+ echo ""
+ echo "This PR modifies files that affect the model's behavior (prompts, tools, or instructions)."
+ echo ""
+
+          if [[ "${{ steps.analysis.outputs.MISSING_EVALS }}" == "true" ]]; then
+            echo "- ⚠️ **Consider adding Evals:** No behavioral evaluations (\`evals/*.eval.ts\`) were added or updated in this PR. Consider [adding a test case](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#creating-an-evaluation) to verify the new behavior and prevent regressions."
+          fi
+
+          if [[ "${{ steps.analysis.outputs.IS_MAINTAINER }}" == "true" ]]; then
+            echo "- 📊 **Maintainer Reminder:** Please ensure that these changes do not regress results on benchmark evals before merging."
+          fi
+
+ echo ""
+ echo "---"
+ echo "*This is an automated guidance message triggered by steering logic signatures.*"
+ echo ""
+ } > full_comment.md
+
+ # 2. Find if a comment with our unique tag already exists
+ # We extract the numeric ID from the URL to ensure compatibility with the REST API
+          COMMENT_ID=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("Model Steering Guidance")) | .url' | grep -oE "[0-9]+$" | head -n 1)
+
+ # 3. Update or Create the comment
+ if [ -n "$COMMENT_ID" ]; then
+ echo "Updating existing comment $COMMENT_ID via API..."
+ gh api -X PATCH "repos/${{ github.repository }}/issues/comments/$COMMENT_ID" -F body=@full_comment.md
+ else
+ echo "Creating new PR comment..."
+ gh pr comment ${{ github.event.pull_request.number }} --body-file full_comment.md
+ fi
diff --git a/evals/README.md b/evals/README.md
index 9e3697a6b8..aebfe38ebc 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -212,6 +212,56 @@ The nightly workflow executes the full evaluation suite multiple times
(currently 3 attempts) to account for non-determinism. These results are
aggregated into a **Nightly Summary** attached to the workflow run.
+## Regression Check Scripts
+
+The project includes several scripts to automate high-signal regression checking
+in Pull Requests. These can also be run locally for debugging.
+
+- **`scripts/get_trustworthy_evals.js`**: Analyzes nightly history to identify
+ stable tests (80%+ aggregate pass rate).
+- **`scripts/run_regression_check.js`**: Runs a specific set of tests using the
+ "Best-of-4" logic and "Dynamic Baseline Verification".
+- **`scripts/run_eval_regression.js`**: The main orchestrator that loops through
+ models and generates the final PR report.
+
+### Running Regression Checks Locally
+
+You can simulate the PR regression check locally to verify your changes before
+pushing:
+
+```bash
+# Run the full regression loop for a specific model
+MODEL_LIST=gemini-3-flash-preview node scripts/run_eval_regression.js
+```
+
+To debug a specific failing test with the same logic used in CI:
+
+```bash
+# 1. Get the Vitest pattern for trustworthy tests
+OUTPUT=$(node scripts/get_trustworthy_evals.js "gemini-3-flash-preview")
+
+# 2. Run the regression logic for those tests
+node scripts/run_regression_check.js "gemini-3-flash-preview" "$OUTPUT"
+```
+
+### The Regression Quality Bar
+
+Because LLMs are non-deterministic, the PR regression check uses a high-signal
+probabilistic approach rather than a 100% pass requirement:
+
+1. **Trustworthiness (60/80 Filter):** Only tests with a proven track record
+ are run. A test must score at least **60% (2/3)** every single night and
+ maintain an **80% aggregate** pass rate over the last 6 days.
+2. **The 50% Pass Rule:** In a PR, a test is considered a **Pass** if the model
+ correctly performs the behavior at least half the time (**2 successes** out
+ of up to 4 attempts).
+3. **Dynamic Baseline Verification:** If a test fails in a PR (e.g., 0/3), the
+ system automatically checks the `main` branch. If it fails there too, it is
+ marked as **Pre-existing** and cleared for the PR, ensuring you are only
+ blocked by regressions caused by your specific changes.
+
+## Fixing Evaluations
+
#### How to interpret the report:
- **Pass Rate (%)**: Each cell represents the percentage of successful runs for
diff --git a/scripts/compare_evals.js b/scripts/compare_evals.js
new file mode 100644
index 0000000000..a5ea15361f
--- /dev/null
+++ b/scripts/compare_evals.js
@@ -0,0 +1,142 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Compares PR evaluation results against historical nightly baselines.
+ *
+ * This script generates a Markdown report for use in PR comments. It aligns with
+ * the 6-day lookback logic to show accurate historical pass rates and filters out
+ * pre-existing or noisy failures to ensure only actionable regressions are reported.
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fetchNightlyHistory } from './eval_utils.js';
+
+/**
+ * Main execution logic.
+ */
+function main() {
+ const prReportPath = 'evals/logs/pr_final_report.json';
+ const targetModel = process.argv[2];
+
+ if (!targetModel) {
+ console.error('โ Error: No target model specified.');
+ process.exit(1);
+ }
+
+ if (!fs.existsSync(prReportPath)) {
+ console.error('No PR report found.');
+ return;
+ }
+
+ const prReport = JSON.parse(fs.readFileSync(prReportPath, 'utf-8'));
+ const history = fetchNightlyHistory(6); // Use same 6-day lookback
+ const latestNightly = aggregateHistoricalStats(history, targetModel);
+
+ const regressions = [];
+ const passes = [];
+
+ for (const [testName, pr] of Object.entries(prReport.results)) {
+ const prRate = pr.passed / pr.total;
+ if (pr.status === 'regression' || (prRate <= 0.34 && !pr.status)) {
+ // Use relative path from workspace root
+ const relativeFile = pr.file
+ ? path.relative(process.cwd(), pr.file)
+ : 'evals/';
+
+ regressions.push({
+ name: testName,
+ file: relativeFile,
+ nightly: latestNightly[testName]
+ ? (latestNightly[testName].passRate * 100).toFixed(0) + '%'
+ : 'N/A',
+ pr: (prRate * 100).toFixed(0) + '%',
+ });
+ } else {
+ passes.push(testName);
+ }
+ }
+
+ if (regressions.length > 0) {
+    let markdown = '### 🚨 Action Required: Eval Regressions Detected\n\n';
+ markdown += `**Model:** \`${targetModel}\`\n\n`;
+ markdown +=
+ 'The following trustworthy evaluations passed on **`main`** and in **recent Nightly runs**, but failed in this PR. These regressions must be addressed before merging.\n\n';
+
+ markdown += '| Test Name | Nightly | PR Result | Status |\n';
+ markdown += '| :--- | :---: | :---: | :--- |\n';
+ for (const r of regressions) {
+      markdown += `| ${r.name} | ${r.nightly} | ${r.pr} | ❌ **Regression** |\n`;
+ }
+ markdown += `\n*The check passed or was cleared for ${passes.length} other trustworthy evaluations.*\n\n`;
+
+ markdown += '\n';
+ markdown +=
+    '🛠️ Troubleshooting & Fix Instructions\n\n';
+
+ for (let i = 0; i < regressions.length; i++) {
+ const r = regressions[i];
+ if (regressions.length > 1) {
+ markdown += `### Failure ${i + 1}: ${r.name}\n\n`;
+ }
+
+ markdown += '#### 1. Ask Gemini CLI to fix it (Recommended)\n';
+ markdown += 'Copy and paste this prompt to the agent:\n';
+ markdown += '```text\n';
+ markdown += `The eval "${r.name}" in ${r.file} is failing. Investigate and fix it using the behavioral-evals skill.\n`;
+ markdown += '```\n\n';
+
+ markdown += '#### 2. Reproduce Locally\n';
+ markdown += 'Run the following command to see the failure trajectory:\n';
+ markdown += '```bash\n';
+ const pattern = r.name.replace(/'/g, '.');
+ markdown += `GEMINI_MODEL=${targetModel} npm run test:all_evals -- ${r.file} --testNamePattern="${pattern}"\n`;
+
+ markdown += '```\n\n';
+
+ if (i < regressions.length - 1) {
+ markdown += '---\n\n';
+ }
+ }
+
+ markdown += '#### 3. Manual Fix\n';
+ markdown +=
+ 'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n';
+ markdown += ' \n';
+
+ process.stdout.write(markdown);
+ } else if (passes.length > 0) {
+ // Success State
+ process.stdout.write(
+      `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n`,
+ );
+ }
+}
+
+/**
+ * Aggregates stats from history for a specific model.
+ */
+function aggregateHistoricalStats(history, model) {
+ const stats = {};
+ for (const item of history) {
+ const modelStats = item.stats[model];
+ if (!modelStats) continue;
+
+ for (const [testName, stat] of Object.entries(modelStats)) {
+ if (!stats[testName]) stats[testName] = { passed: 0, total: 0 };
+ stats[testName].passed += stat.passed;
+ stats[testName].total += stat.total;
+ }
+ }
+
+ for (const name in stats) {
+ stats[name].passRate = stats[name].passed / stats[name].total;
+ }
+ return stats;
+}
+
+main();
diff --git a/scripts/eval_utils.js b/scripts/eval_utils.js
new file mode 100644
index 0000000000..6d13f11891
--- /dev/null
+++ b/scripts/eval_utils.js
@@ -0,0 +1,136 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { execSync } from 'node:child_process';
+import os from 'node:os';
+
+/**
+ * Finds all report.json files recursively in a directory.
+ */
+export function findReports(dir) {
+ const reports = [];
+ if (!fs.existsSync(dir)) return reports;
+
+ const files = fs.readdirSync(dir);
+ for (const file of files) {
+ const fullPath = path.join(dir, file);
+ const stat = fs.statSync(fullPath);
+ if (stat.isDirectory()) {
+ reports.push(...findReports(fullPath));
+ } else if (file === 'report.json') {
+ reports.push(fullPath);
+ }
+ }
+ return reports;
+}
+
+/**
+ * Extracts the model name from the artifact path.
+ */
+export function getModelFromPath(reportPath) {
+ const parts = reportPath.split(path.sep);
+ // Look for the directory that follows the 'eval-logs-' pattern
+ const artifactDir = parts.find((p) => p.startsWith('eval-logs-'));
+ if (!artifactDir) return 'unknown';
+
+ const match = artifactDir.match(/^eval-logs-(.+)-(\d+)$/);
+ return match ? match[1] : 'unknown';
+}
+
+/**
+ * Escapes special characters in a string for use in a regular expression.
+ */
+export function escapeRegex(string) {
+ return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+}
+
+/**
+ * Aggregates stats from a list of report.json files.
+ * @returns {Record>} statsByModel
+ */
+export function getStatsFromReports(reports) {
+ const statsByModel = {};
+
+ for (const reportPath of reports) {
+ try {
+ const model = getModelFromPath(reportPath);
+ if (!statsByModel[model]) {
+ statsByModel[model] = {};
+ }
+ const testStats = statsByModel[model];
+
+ const content = fs.readFileSync(reportPath, 'utf-8');
+ const json = JSON.parse(content);
+
+ for (const testResult of json.testResults) {
+ const filePath = testResult.name;
+ for (const assertion of testResult.assertionResults) {
+ const name = assertion.title;
+ if (!testStats[name]) {
+ testStats[name] = { passed: 0, total: 0, file: filePath };
+ }
+ testStats[name].total++;
+ if (assertion.status === 'passed') {
+ testStats[name].passed++;
+ }
+ }
+ }
+ } catch (error) {
+ console.error(`Error processing report at ${reportPath}:`, error.message);
+ }
+ }
+ return statsByModel;
+}
+
+/**
+ * Fetches historical nightly data using the GitHub CLI.
+ * @returns {Array<{runId: string, stats: Record}>} history
+ */
+export function fetchNightlyHistory(lookbackCount) {
+ const history = [];
+ try {
+ const cmd = `gh run list --workflow evals-nightly.yml --branch main --limit ${
+ lookbackCount + 2
+ } --json databaseId,status`;
+ const runsJson = execSync(cmd, { encoding: 'utf-8' });
+ let runs = JSON.parse(runsJson);
+
+ // Filter for completed runs and take the top N
+ runs = runs.filter((r) => r.status === 'completed').slice(0, lookbackCount);
+
+ for (const run of runs) {
+ const tmpDir = fs.mkdtempSync(
+ path.join(os.tmpdir(), `gemini-evals-hist-${run.databaseId}-`),
+ );
+ try {
+ execSync(
+ `gh run download ${run.databaseId} -p "eval-logs-*" -D "${tmpDir}"`,
+ { stdio: 'ignore' },
+ );
+
+ const runReports = findReports(tmpDir);
+ if (runReports.length > 0) {
+ history.push({
+ runId: run.databaseId,
+ stats: getStatsFromReports(runReports),
+ });
+ }
+ } catch (error) {
+ console.error(
+ `Failed to process artifacts for run ${run.databaseId}:`,
+ error.message,
+ );
+ } finally {
+ fs.rmSync(tmpDir, { recursive: true, force: true });
+ }
+ }
+ } catch (error) {
+ console.error('Failed to fetch history:', error.message);
+ }
+ return history;
+}
diff --git a/scripts/get_trustworthy_evals.js b/scripts/get_trustworthy_evals.js
new file mode 100644
index 0000000000..c87d148e7a
--- /dev/null
+++ b/scripts/get_trustworthy_evals.js
@@ -0,0 +1,125 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Identifies "Trustworthy" behavioral evaluations from nightly history.
+ *
+ * This script analyzes the last 6 days of nightly runs to find tests that meet
+ * strict stability criteria (80% aggregate pass rate and 60% daily floor).
+ * It outputs a list of files and a Vitest pattern used by the PR regression check
+ * to ensure high-signal validation and minimize noise.
+ */
+
+import { fetchNightlyHistory, escapeRegex } from './eval_utils.js';
+
+const LOOKBACK_COUNT = 6;
+const MIN_VALID_RUNS = 5; // At least 5 out of 6 must be available
+const PASS_RATE_THRESHOLD = 0.6; // Daily floor (e.g., 2/3)
+const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18)
+
+/**
+ * Main execution logic.
+ */
+function main() {
+ const targetModel = process.argv[2];
+ if (!targetModel) {
+    console.error('❌ Error: No target model specified.');
+    process.exit(1);
+  }
+  console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
+
+ const history = fetchNightlyHistory(LOOKBACK_COUNT);
+ if (history.length === 0) {
+    console.error('❌ No historical data found.');
+ process.exit(1);
+ }
+
+ // Aggregate results for the target model across all history
+ const testHistories = {}; // { [testName]: { totalPassed: 0, totalRuns: 0, dailyRates: [], file: string } }
+
+ for (const item of history) {
+ const modelStats = item.stats[targetModel];
+ if (!modelStats) continue;
+
+ for (const [testName, stat] of Object.entries(modelStats)) {
+ if (!testHistories[testName]) {
+ testHistories[testName] = {
+ totalPassed: 0,
+ totalRuns: 0,
+ dailyRates: [],
+ file: stat.file,
+ };
+ }
+ testHistories[testName].totalPassed += stat.passed;
+ testHistories[testName].totalRuns += stat.total;
+ testHistories[testName].dailyRates.push(stat.passed / stat.total);
+ }
+ }
+
+ const trustworthyTests = [];
+ const trustworthyFiles = new Set();
+ const volatileTests = [];
+ const newTests = [];
+
+ for (const [testName, info] of Object.entries(testHistories)) {
+ const dailyRates = info.dailyRates;
+ const aggregateRate = info.totalPassed / info.totalRuns;
+
+ // 1. Minimum data points required
+ if (dailyRates.length < MIN_VALID_RUNS) {
+ newTests.push(testName);
+ continue;
+ }
+
+ // 2. Trustworthy Criterion:
+ // - Every single day must be above the floor (e.g. > 60%)
+ // - The overall aggregate must be high-signal (e.g. > 80%)
+ const isDailyStable = dailyRates.every(
+ (rate) => rate > PASS_RATE_THRESHOLD,
+ );
+ const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
+
+ if (isDailyStable && isAggregateHighSignal) {
+ trustworthyTests.push(testName);
+ if (info.file) {
+ const match = info.file.match(/evals\/.*\.eval\.ts/);
+ if (match) {
+ trustworthyFiles.add(match[0]);
+ }
+ }
+ } else {
+ volatileTests.push(testName);
+ }
+ }
+
+ console.error(
+    `✅ Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`,
+ );
+ trustworthyTests.sort().forEach((name) => console.error(` - ${name}`));
+  console.error(`\n⏪ Ignored ${volatileTests.length} volatile tests.`);
+ console.error(
+ `๐ Ignored ${newTests.length} tests with insufficient history.`,
+ );
+
+ // Output the list of names as a regex-friendly pattern for vitest -t
+ const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|');
+
+ // Also output unique file paths as a space-separated string
+ const files = Array.from(trustworthyFiles).join(' ');
+
+ // Print the combined output to stdout for use in shell scripts (only if piped/CI)
+ if (!process.stdout.isTTY) {
+ // Format: FILE_LIST --test-pattern TEST_PATTERN
+ // This allows the workflow to easily use it
+ process.stdout.write(`${files} --test-pattern ${pattern || ''}\n`);
+ } else {
+ console.error(
+      '\n💡 Note: Raw regex pattern and file list are hidden in interactive terminal. It will be printed when piped or in CI.',
+ );
+ }
+}
+
+main();
diff --git a/scripts/run_eval_regression.js b/scripts/run_eval_regression.js
new file mode 100644
index 0000000000..7a64a6a2f9
--- /dev/null
+++ b/scripts/run_eval_regression.js
@@ -0,0 +1,107 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Orchestrates the PR evaluation process across multiple models.
+ *
+ * This script loops through a provided list of models, identifies trustworthy
+ * tests for each, executes the frugal regression check, and collects results
+ * into a single unified report. It exits with code 1 if any confirmed
+ * regressions are detected.
+ */
+
+import { execSync } from 'node:child_process';
+import fs from 'node:fs';
+
+/**
+ * Main execution logic.
+ */
+async function main() {
+ const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview';
+ const models = modelList.split(',').map((m) => m.trim());
+
+ let combinedReport = '';
+ let hasRegression = false;
+
+ console.log(
+    `🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
+ );
+
+ for (const model of models) {
+ console.log(`\n--- Processing Model: ${model} ---`);
+
+ try {
+ // 1. Identify Trustworthy Evals
+      console.log(`🔍 Identifying trustworthy tests for ${model}...`);
+ const output = execSync(
+ `node scripts/get_trustworthy_evals.js "${model}"`,
+ {
+ encoding: 'utf-8',
+ stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr
+ },
+ ).trim();
+
+ if (!output) {
+        console.log(`ℹ️ No trustworthy tests found for ${model}. Skipping.`);
+ continue;
+ }
+
+ // 2. Run Frugal Regression Check
+      console.log(`🧪 Running regression check for ${model}...`);
+ execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, {
+ stdio: 'inherit',
+ });
+
+ // 3. Generate Report
+      console.log(`📊 Generating report for ${model}...`);
+ const report = execSync(`node scripts/compare_evals.js "${model}"`, {
+ encoding: 'utf-8',
+ stdio: ['inherit', 'pipe', 'inherit'],
+ }).trim();
+
+ if (report) {
+ if (combinedReport) {
+ combinedReport += '\n\n---\n\n';
+ }
+ combinedReport += report;
+
+ // 4. Check for Regressions
+ // If the report contains the "Action Required" marker, it means a confirmed regression was found.
+ if (report.includes('Action Required')) {
+ hasRegression = true;
+ }
+ }
+ } catch (error) {
+      console.error(`❌ Error processing model ${model}:`, error.message);
+ // We flag a failure if any model encountered a critical error
+ hasRegression = true;
+ }
+ }
+
+ // Always save the combined report to a file so the workflow can capture it cleanly
+ if (combinedReport) {
+ fs.writeFileSync('eval_regression_report.md', combinedReport);
+ console.log(
+      '\n📄 Final Markdown report saved to eval_regression_report.md',
+ );
+ }
+
+ // Log status for CI visibility, but don't exit with error
+ if (hasRegression) {
+ console.error(
+      '\n⚠️ Confirmed regressions detected across one or more models. See PR comment for details.',
+ );
+ } else {
+    console.log('\n✅ All evaluations passed successfully (or were cleared).');
+ }
+
+ process.exit(0);
+}
+
+main().catch((err) => {
+ console.error(err);
+ process.exit(1);
+});
diff --git a/scripts/run_regression_check.js b/scripts/run_regression_check.js
new file mode 100644
index 0000000000..1250671c30
--- /dev/null
+++ b/scripts/run_regression_check.js
@@ -0,0 +1,305 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Executes a high-signal regression check for behavioral evaluations.
+ *
+ * This script runs a targeted set of stable tests in an optimistic first pass.
+ * If failures occur, it employs a "Best-of-4" retry logic to handle natural flakiness.
+ * For confirmed failures (0/3), it performs Dynamic Baseline Verification by
+ * checking the failure against the 'main' branch to distinguish between
+ * model drift and PR-introduced regressions.
+ */
+
+import { execSync } from 'node:child_process';
+import fs from 'node:fs';
+import path from 'node:path';
+import { quote } from 'shell-quote';
+import { escapeRegex } from './eval_utils.js';
+
+/**
+ * Runs a set of tests using Vitest and returns the results.
+ */
+function runTests(files, pattern, model) {
+ const outputDir = path.resolve(
+ process.cwd(),
+ `evals/logs/pr-run-${Date.now()}`,
+ );
+ fs.mkdirSync(outputDir, { recursive: true });
+
+ const filesToRun = files || 'evals/';
+ console.log(
+ `๐ Running tests in ${filesToRun} with pattern: ${pattern?.slice(0, 100)}...`,
+ );
+
+ try {
+ const cmd = `npx vitest run --config evals/vitest.config.ts ${filesToRun} -t "${pattern}" --reporter=json --reporter=default --outputFile="${path.join(outputDir, 'report.json')}"`;
+ execSync(cmd, {
+ stdio: 'inherit',
+ env: { ...process.env, RUN_EVALS: '1', GEMINI_MODEL: model },
+ });
+ } catch {
+ // Vitest returns a non-zero exit code when tests fail. This is expected.
+ // We continue execution and handle the failures by parsing the JSON report.
+ }
+
+ const reportPath = path.join(outputDir, 'report.json');
+ return fs.existsSync(reportPath)
+ ? JSON.parse(fs.readFileSync(reportPath, 'utf-8'))
+ : null;
+}
+
+/**
+ * Helper to find a specific assertion by name across all test files.
+ */
+function findAssertion(report, testName) {
+ if (!report?.testResults) return null;
+ for (const fileResult of report.testResults) {
+ const assertion = fileResult.assertionResults.find(
+ (a) => a.title === testName,
+ );
+ if (assertion) return assertion;
+ }
+ return null;
+}
+
+/**
+ * Parses command line arguments to identify model, files, and test pattern.
+ */
+function parseArgs() {
+ const modelArg = process.argv[2];
+ const remainingArgs = process.argv.slice(3);
+ const fullArgsString = remainingArgs.join(' ');
+ const testPatternIndex = remainingArgs.indexOf('--test-pattern');
+
+ if (testPatternIndex !== -1) {
+ return {
+ model: modelArg,
+ files: remainingArgs.slice(0, testPatternIndex).join(' '),
+ pattern: remainingArgs.slice(testPatternIndex + 1).join(' '),
+ };
+ }
+
+ if (fullArgsString.includes('--test-pattern')) {
+ const parts = fullArgsString.split('--test-pattern');
+ return {
+ model: modelArg,
+ files: parts[0].trim(),
+ pattern: parts[1].trim(),
+ };
+ }
+
+ // Fallback for manual mode: Pattern Model
+ const manualPattern = process.argv[2];
+ const manualModel = process.argv[3];
+ if (!manualModel) {
+    console.error('❌ Error: No target model specified.');
+ process.exit(1);
+ }
+
+ let manualFiles = 'evals/';
+ try {
+ const grepResult = execSync(
+ `grep -l ${quote([manualPattern])} evals/*.eval.ts`,
+ { encoding: 'utf-8' },
+ );
+ manualFiles = grepResult.split('\n').filter(Boolean).join(' ');
+ } catch {
+ // Grep returns exit code 1 if no files match the pattern.
+ // In this case, we fall back to scanning all files in the evals/ directory.
+ }
+
+ return {
+ model: manualModel,
+ files: manualFiles,
+ pattern: manualPattern,
+ isManual: true,
+ };
+}
+
+/**
+ * Runs the targeted retry logic (Best-of-4) for a failing test.
+ */
+async function runRetries(testName, results, files, model) {
+ console.log(`\nRe-evaluating: ${testName}`);
+
+ while (
+ results[testName].passed < 2 &&
+ results[testName].total - results[testName].passed < 3 &&
+ results[testName].total < 4
+ ) {
+ const attemptNum = results[testName].total + 1;
+ console.log(` Running attempt ${attemptNum}...`);
+
+ const retry = runTests(files, escapeRegex(testName), model);
+ const retryAssertion = findAssertion(retry, testName);
+
+ results[testName].total++;
+ if (retryAssertion?.status === 'passed') {
+ results[testName].passed++;
+ console.log(
+        ` ✅ Attempt ${attemptNum} passed. Score: ${results[testName].passed}/${results[testName].total}`,
+ );
+ } else {
+ console.log(
+        ` ❌ Attempt ${attemptNum} failed (${retryAssertion?.status || 'unknown'}). Score: ${results[testName].passed}/${results[testName].total}`,
+ );
+ }
+
+ if (results[testName].passed >= 2) {
+ console.log(
+        ` ✅ Test cleared as Noisy Pass (${results[testName].passed}/${results[testName].total})`,
+ );
+ } else if (results[testName].total - results[testName].passed >= 3) {
+ await verifyBaseline(testName, results, files, model);
+ }
+ }
+}
+
+/**
+ * Verifies a potential regression against the 'main' branch.
+ */
+async function verifyBaseline(testName, results, files, model) {
+ console.log('\n--- Step 3: Dynamic Baseline Verification ---');
+ console.log(
+    `⚠️ Potential regression detected. Verifying baseline on 'main'...`,
+ );
+
+ try {
+ execSync('git stash push -m "eval-regression-check-stash"', {
+ stdio: 'inherit',
+ });
+ const hasStash = execSync('git stash list')
+ .toString()
+ .includes('eval-regression-check-stash');
+ execSync('git checkout main', { stdio: 'inherit' });
+
+ console.log(
+ `\n--- Running Baseline Verification on 'main' (Best-of-3) ---`,
+ );
+ let baselinePasses = 0;
+ let baselineTotal = 0;
+
+ while (baselinePasses === 0 && baselineTotal < 3) {
+ baselineTotal++;
+ console.log(` Baseline Attempt ${baselineTotal}...`);
+ const baselineRun = runTests(files, escapeRegex(testName), model);
+ if (findAssertion(baselineRun, testName)?.status === 'passed') {
+ baselinePasses++;
+        console.log(`  ✅ Baseline Attempt ${baselineTotal} passed.`);
+ } else {
+        console.log(`  ❌ Baseline Attempt ${baselineTotal} failed.`);
+ }
+ }
+
+ execSync('git checkout -', { stdio: 'inherit' });
+ if (hasStash) execSync('git stash pop', { stdio: 'inherit' });
+
+ if (baselinePasses === 0) {
+ console.log(
+        ` ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`,
+ );
+ results[testName].status = 'pre-existing';
+ results[testName].passed = results[testName].total; // Clear for report
+ } else {
+ console.log(
+        ` ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`,
+ );
+ results[testName].status = 'regression';
+ }
+ } catch (error) {
+    console.error(` ❌ Failed to verify baseline: ${error.message}`);
+
+ // Best-effort cleanup: try to return to the original branch.
+ try {
+ execSync('git checkout -', { stdio: 'ignore' });
+ } catch {
+ // Ignore checkout errors during cleanup to avoid hiding the original error.
+ }
+ }
+}
+
+/**
+ * Processes initial results and orchestrates retries/baseline checks.
+ */
+async function processResults(firstPass, pattern, model, files) {
+ if (!firstPass) return false;
+
+ const results = {};
+ const failingTests = [];
+ let totalProcessed = 0;
+
+ for (const fileResult of firstPass.testResults) {
+ for (const assertion of fileResult.assertionResults) {
+ if (assertion.status !== 'passed' && assertion.status !== 'failed') {
+ continue;
+ }
+
+ const name = assertion.title;
+ results[name] = {
+ passed: assertion.status === 'passed' ? 1 : 0,
+ total: 1,
+ file: fileResult.name,
+ };
+ if (assertion.status === 'failed') failingTests.push(name);
+ totalProcessed++;
+ }
+ }
+
+ if (totalProcessed === 0) {
+    console.error('❌ Error: No matching tests were found or executed.');
+ return false;
+ }
+
+ if (failingTests.length === 0) {
+    console.log('✅ All trustworthy tests passed on the first try!');
+ } else {
+ console.log('\n--- Step 2: Best-of-4 Retries ---');
+ console.log(
+      `⚠️ ${failingTests.length} tests failed the optimistic run. Starting retries...`,
+ );
+ for (const testName of failingTests) {
+ await runRetries(testName, results, files, model);
+ }
+ }
+
+ saveResults(results);
+ return true;
+}
+
+function saveResults(results) {
+ const finalReport = { timestamp: new Date().toISOString(), results };
+ fs.writeFileSync(
+ 'evals/logs/pr_final_report.json',
+ JSON.stringify(finalReport, null, 2),
+ );
+ console.log('\nFinal report saved to evals/logs/pr_final_report.json');
+}
+
+async function main() {
+ const { model, files, pattern, isManual } = parseArgs();
+
+ if (isManual) {
+ const firstPass = runTests(files, pattern, model);
+ const success = await processResults(firstPass, pattern, model, files);
+ process.exit(success ? 0 : 1);
+ }
+
+ if (!pattern) {
+ console.log('No trustworthy tests to run.');
+ process.exit(0);
+ }
+
+ console.log('\n--- Step 1: Optimistic Run (N=1) ---');
+ const firstPass = runTests(files, pattern, model);
+ const success = await processResults(firstPass, pattern, model, files);
+ process.exit(success ? 0 : 1);
+}
+
+main().catch((err) => {
+ console.error(err);
+ process.exit(1);
+});