Merge remote-tracking branch 'origin/main' into feature/simulator-knowledge-update

# Conflicts: # package-lock.json # package.json # packages/cli/src/interactiveCli.tsx # packages/core/src/telemetry/llmRole.ts
2026-06-12 20:37:08 -07:00 · 2026-04-16 22:05:57 -07:00
parent 40f9db30ce 22fb83320e
commit bfbdae8e3a
914 changed files with 892464 additions and 16022 deletions
@@ -36,7 +36,8 @@ function main() {
    });

    // Get changed files using the triple-dot syntax which correctly handles merge commits
-    const changedFiles = execSync(`git diff --name-only FETCH_HEAD...HEAD`, {
+    const head = process.env.PR_HEAD_SHA || 'HEAD';
+    const changedFiles = execSync(`git diff --name-only FETCH_HEAD...${head}`, {
      encoding: 'utf-8',
    })
      .split('\n')
@@ -70,7 +71,7 @@ function main() {
      if (coreChanges.length > 0) {
        // Get the actual diff content for core files
        const diff = execSync(
-          `git diff -U0 FETCH_HEAD...HEAD -- packages/core/src/`,
+          `git diff -U0 FETCH_HEAD...${head} -- packages/core/src/`,
          { encoding: 'utf-8' },
        );
        for (const sig of STEERING_SIGNATURES) {
@@ -0,0 +1,142 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Compares PR evaluation results against historical nightly baselines.
+ *
+ * This script generates a Markdown report for use in PR comments. It aligns with
+ * the 6-day lookback logic to show accurate historical pass rates and filters out
+ * pre-existing or noisy failures to ensure only actionable regressions are reported.
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { fetchNightlyHistory } from './eval_utils.js';
+
+/**
+ * Main execution logic.
+ */
+function main() {
+  const prReportPath = 'evals/logs/pr_final_report.json';
+  const targetModel = process.argv[2];
+
+  if (!targetModel) {
+    console.error('❌ Error: No target model specified.');
+    process.exit(1);
+  }
+
+  if (!fs.existsSync(prReportPath)) {
+    console.error('No PR report found.');
+    return;
+  }
+
+  const prReport = JSON.parse(fs.readFileSync(prReportPath, 'utf-8'));
+  const history = fetchNightlyHistory(6); // Use same 6-day lookback
+  const latestNightly = aggregateHistoricalStats(history, targetModel);
+
+  const regressions = [];
+  const passes = [];
+
+  for (const [testName, pr] of Object.entries(prReport.results)) {
+    const prRate = pr.passed / pr.total;
+    if (pr.status === 'regression' || (prRate <= 0.34 && !pr.status)) {
+      // Use relative path from workspace root
+      const relativeFile = pr.file
+        ? path.relative(process.cwd(), pr.file)
+        : 'evals/';
+
+      regressions.push({
+        name: testName,
+        file: relativeFile,
+        nightly: latestNightly[testName]
+          ? (latestNightly[testName].passRate * 100).toFixed(0) + '%'
+          : 'N/A',
+        pr: (prRate * 100).toFixed(0) + '%',
+      });
+    } else {
+      passes.push(testName);
+    }
+  }
+
+  if (regressions.length > 0) {
+    let markdown = '### 🚨 Action Required: Eval Regressions Detected\n\n';
+    markdown += `**Model:** \`${targetModel}\`\n\n`;
+    markdown +=
+      'The following trustworthy evaluations passed on **`main`** and in **recent Nightly runs**, but failed in this PR. These regressions must be addressed before merging.\n\n';
+
+    markdown += '| Test Name | Nightly | PR Result | Status |\n';
+    markdown += '| :--- | :---: | :---: | :--- |\n';
+    for (const r of regressions) {
+      markdown += `| ${r.name} | ${r.nightly} | ${r.pr} | ❌ **Regression** |\n`;
+    }
+    markdown += `\n*The check passed or was cleared for ${passes.length} other trustworthy evaluations.*\n\n`;
+
+    markdown += '<details>\n';
+    markdown +=
+      '<summary><b>🛠️ Troubleshooting & Fix Instructions</b></summary>\n\n';
+
+    for (let i = 0; i < regressions.length; i++) {
+      const r = regressions[i];
+      if (regressions.length > 1) {
+        markdown += `### Failure ${i + 1}: ${r.name}\n\n`;
+      }
+
+      markdown += '#### 1. Ask Gemini CLI to fix it (Recommended)\n';
+      markdown += 'Copy and paste this prompt to the agent:\n';
+      markdown += '```text\n';
+      markdown += `The eval "${r.name}" in ${r.file} is failing. Investigate and fix it using the behavioral-evals skill.\n`;
+      markdown += '```\n\n';
+
+      markdown += '#### 2. Reproduce Locally\n';
+      markdown += 'Run the following command to see the failure trajectory:\n';
+      markdown += '```bash\n';
+      const pattern = r.name.replace(/'/g, '.');
+      markdown += `GEMINI_MODEL=${targetModel} npm run test:all_evals -- ${r.file} --testNamePattern="${pattern}"\n`;
+
+      markdown += '```\n\n';
+
+      if (i < regressions.length - 1) {
+        markdown += '---\n\n';
+      }
+    }
+
+    markdown += '#### 3. Manual Fix\n';
+    markdown +=
+      'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n';
+    markdown += '</details>\n';
+
+    process.stdout.write(markdown);
+  } else if (passes.length > 0) {
+    // Success State
+    process.stdout.write(
+      `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n`,
+    );
+  }
+}
+
+/**
+ * Aggregates stats from history for a specific model.
+ */
+function aggregateHistoricalStats(history, model) {
+  const stats = {};
+  for (const item of history) {
+    const modelStats = item.stats[model];
+    if (!modelStats) continue;
+
+    for (const [testName, stat] of Object.entries(modelStats)) {
+      if (!stats[testName]) stats[testName] = { passed: 0, total: 0 };
+      stats[testName].passed += stat.passed;
+      stats[testName].total += stat.total;
+    }
+  }
+
+  for (const name in stats) {
+    stats[name].passRate = stats[name].passed / stats[name].total;
+  }
+  return stats;
+}
+
+main();
@@ -98,9 +98,43 @@ if (existsSync(devtoolsDistSrc)) {
 // 6. Copy bundled chrome-devtools-mcp
 const bundleMcpSrc = join(root, 'packages/core/dist/bundled');
 const bundleMcpDest = join(bundleDir, 'bundled');
-if (existsSync(bundleMcpSrc)) {
-  cpSync(bundleMcpSrc, bundleMcpDest, { recursive: true, dereference: true });
-  console.log('Copied bundled chrome-devtools-mcp to bundle/bundled/');
+if (!existsSync(bundleMcpSrc)) {
+  console.error(
+    `Error: chrome-devtools-mcp bundle not found at ${bundleMcpSrc}.\n` +
+      `Run "npm run bundle:browser-mcp -w @google/gemini-cli-core" first.`,
+  );
+  process.exit(1);
+}
+cpSync(bundleMcpSrc, bundleMcpDest, { recursive: true, dereference: true });
+console.log('Copied bundled chrome-devtools-mcp to bundle/bundled/');
+
+// 7. Copy pre-built ripgrep vendor binaries
+const ripgrepVendorSrc = join(root, 'packages/core/vendor/ripgrep');
+const ripgrepVendorDest = join(bundleDir, 'vendor', 'ripgrep');
+if (existsSync(ripgrepVendorSrc)) {
+  mkdirSync(ripgrepVendorDest, { recursive: true });
+  cpSync(ripgrepVendorSrc, ripgrepVendorDest, {
+    recursive: true,
+    dereference: true,
+  });
+  console.log('Copied ripgrep vendor binaries to bundle/vendor/ripgrep/');
+}
+
+// 8. Copy Extension Examples
+const extensionExamplesSrc = join(
+  root,
+  'packages/cli/src/commands/extensions/examples',
+);
+const extensionExamplesDest = join(bundleDir, 'examples');
+const EXCLUDED_EXAMPLE_DIRS = ['node_modules', 'dist'];
+
+if (existsSync(extensionExamplesSrc)) {
+  cpSync(extensionExamplesSrc, extensionExamplesDest, {
+    recursive: true,
+    dereference: true,
+    filter: (src) => !EXCLUDED_EXAMPLE_DIRS.some((dir) => src.includes(dir)),
+  });
+  console.log('Copied extension examples to bundle/examples/');
 }

 console.log('Assets copied to bundle/');
@@ -0,0 +1,146 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview This script downloads pre-built ripgrep binaries for all supported
+ * architectures and platforms. These binaries are checked into the repository
+ * under packages/core/vendor/ripgrep.
+ *
+ * Maintainers should periodically run this script to upgrade the version
+ * of ripgrep being distributed.
+ *
+ * Usage: npx tsx scripts/download-ripgrep-binaries.ts
+ */
+
+import fs from 'node:fs';
+import fsPromises from 'node:fs/promises';
+import path from 'node:path';
+import { pipeline } from 'node:stream/promises';
+import { fileURLToPath } from 'node:url';
+import { createWriteStream } from 'node:fs';
+import { Readable } from 'node:stream';
+import type { ReadableStream } from 'node:stream/web';
+import { execFileSync } from 'node:child_process';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const CORE_VENDOR_DIR = path.join(__dirname, '../packages/core/vendor/ripgrep');
+const VERSION = 'v13.0.0-10';
+
+interface Target {
+  platform: string;
+  arch: string;
+  file: string;
+}
+
+const targets: Target[] = [
+  { platform: 'darwin', arch: 'arm64', file: 'aarch64-apple-darwin.tar.gz' },
+  { platform: 'darwin', arch: 'x64', file: 'x86_64-apple-darwin.tar.gz' },
+  {
+    platform: 'linux',
+    arch: 'arm64',
+    file: 'aarch64-unknown-linux-gnu.tar.gz',
+  },
+  { platform: 'linux', arch: 'x64', file: 'x86_64-unknown-linux-musl.tar.gz' },
+  { platform: 'win32', arch: 'x64', file: 'x86_64-pc-windows-msvc.zip' },
+];
+
+/**
+ * Moves an extracted ripgrep executable to its final, platform-specific name.
+ *
+ * Looks for `exeName` directly inside the vendor directory first, then inside
+ * a `ripgrep-<VERSION>-<target>` sub-directory, which is removed after the move.
+ *
+ * @param target Release target that was just extracted.
+ * @param archiveSuffix Archive extension stripped from `target.file` to derive
+ *   the sub-directory name (`.tar.gz` or `.zip`).
+ * @param exeName Executable name inside the archive (`rg` or `rg.exe`).
+ * @param finalBinPath Destination path for the executable.
+ * @throws Error when the executable is found in neither location.
+ */
+async function moveExtractedBinary(
+  target: Target,
+  archiveSuffix: string,
+  exeName: string,
+  finalBinPath: string,
+): Promise<void> {
+  // Microsoft's ripgrep release extracts directly to `rg` inside the current directory sometimes
+  const sourceBin = path.join(CORE_VENDOR_DIR, exeName);
+  if (fs.existsSync(sourceBin)) {
+    await fsPromises.rename(sourceBin, finalBinPath);
+    return;
+  }
+
+  // Fallback for sub-directory if it happens
+  const extractedDirName = `ripgrep-${VERSION}-${target.file.replace(archiveSuffix, '')}`;
+  const extractedDir = path.join(CORE_VENDOR_DIR, extractedDirName);
+  const fallbackSourceBin = path.join(extractedDir, exeName);
+  if (!fs.existsSync(fallbackSourceBin)) {
+    throw new Error(
+      `Could not find extracted '${exeName}' binary for ${target.platform} ${target.arch}`,
+    );
+  }
+  await fsPromises.rename(fallbackSourceBin, finalBinPath);
+  await fsPromises.rm(extractedDir, { recursive: true, force: true });
+}
+
+/**
+ * Downloads and extracts the ripgrep binary for every entry in `targets`.
+ *
+ * Each binary is stored in the vendor directory as
+ * `rg-<platform>-<arch>[.exe]`; targets whose binary already exists are
+ * skipped. The downloaded archive is removed whether or not extraction
+ * succeeds, so a failed run does not leave archives in the vendor directory.
+ *
+ * @throws Error when a download fails, the response has no body, the archive
+ *   type is not supported, or the extracted executable cannot be found.
+ */
+async function downloadBinary() {
+  await fsPromises.mkdir(CORE_VENDOR_DIR, { recursive: true });
+
+  for (const target of targets) {
+    const url = `https://github.com/microsoft/ripgrep-prebuilt/releases/download/${VERSION}/ripgrep-${VERSION}-${target.file}`;
+    const archivePath = path.join(CORE_VENDOR_DIR, target.file);
+    const binName = `rg-${target.platform}-${target.arch}${target.platform === 'win32' ? '.exe' : ''}`;
+    const finalBinPath = path.join(CORE_VENDOR_DIR, binName);
+
+    if (fs.existsSync(finalBinPath)) {
+      console.log(`[Cache] ${binName} already exists.`);
+      continue;
+    }
+
+    console.log(`[Download] ${url} -> ${archivePath}`);
+    try {
+      const response = await fetch(url);
+      if (!response.ok) {
+        // statusText can be empty, so include the numeric status as well.
+        throw new Error(
+          `Failed to fetch ${url}: ${response.status} ${response.statusText}`,
+        );
+      }
+
+      if (!response.body) {
+        throw new Error(`Response body is null for ${url}`);
+      }
+
+      // Node 18+ global fetch response.body is a ReadableStream (web stream)
+      // pipeline(Readable.fromWeb(response.body), fileStream) works in Node 18+
+      await pipeline(
+        Readable.fromWeb(response.body as ReadableStream),
+        createWriteStream(archivePath),
+      );
+
+      console.log(`[Extract] Extracting ${archivePath}...`);
+      // Extract using external tools for simplicity
+      if (target.file.endsWith('.tar.gz')) {
+        execFileSync('tar', ['-xzf', archivePath, '-C', CORE_VENDOR_DIR]);
+        await moveExtractedBinary(target, '.tar.gz', 'rg', finalBinPath);
+      } else if (target.file.endsWith('.zip')) {
+        execFileSync('unzip', ['-o', '-q', archivePath, '-d', CORE_VENDOR_DIR]);
+        await moveExtractedBinary(target, '.zip', 'rg.exe', finalBinPath);
+      } else {
+        // Without this, an unknown archive type would be reported as a success
+        // even though no binary was produced.
+        throw new Error(`Unsupported archive type: ${target.file}`);
+      }
+    } finally {
+      // Clean up archive, including after a failed download or extraction.
+      // `force` tolerates the archive never having been created.
+      await fsPromises.rm(archivePath, { force: true });
+    }
+
+    console.log(`[Success] Saved to ${finalBinPath}`);
+  }
+}
+
+downloadBinary().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
@@ -0,0 +1,136 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { execSync } from 'node:child_process';
+import os from 'node:os';
+
+/**
+ * Finds all report.json files recursively in a directory.
+ */
+export function findReports(dir) {
+  const reports = [];
+  if (!fs.existsSync(dir)) return reports;
+
+  const files = fs.readdirSync(dir);
+  for (const file of files) {
+    const fullPath = path.join(dir, file);
+    const stat = fs.statSync(fullPath);
+    if (stat.isDirectory()) {
+      reports.push(...findReports(fullPath));
+    } else if (file === 'report.json') {
+      reports.push(fullPath);
+    }
+  }
+  return reports;
+}
+
+/**
+ * Extracts the model name from the artifact path.
+ */
+export function getModelFromPath(reportPath) {
+  const parts = reportPath.split(path.sep);
+  // Look for the directory that follows the 'eval-logs-' pattern
+  const artifactDir = parts.find((p) => p.startsWith('eval-logs-'));
+  if (!artifactDir) return 'unknown';
+
+  const match = artifactDir.match(/^eval-logs-(.+)-(\d+)$/);
+  return match ? match[1] : 'unknown';
+}
+
+/**
+ * Escapes special characters in a string for use in a regular expression.
+ */
+export function escapeRegex(string) {
+  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+}
+
+/**
+ * Aggregates stats from a list of report.json files.
+ * @returns {Record<string, Record<string, {passed: number, total: number, file?: string}>>} statsByModel
+ */
+export function getStatsFromReports(reports) {
+  const statsByModel = {};
+
+  for (const reportPath of reports) {
+    try {
+      const model = getModelFromPath(reportPath);
+      if (!statsByModel[model]) {
+        statsByModel[model] = {};
+      }
+      const testStats = statsByModel[model];
+
+      const content = fs.readFileSync(reportPath, 'utf-8');
+      const json = JSON.parse(content);
+
+      for (const testResult of json.testResults) {
+        const filePath = testResult.name;
+        for (const assertion of testResult.assertionResults) {
+          const name = assertion.title;
+          if (!testStats[name]) {
+            testStats[name] = { passed: 0, total: 0, file: filePath };
+          }
+          testStats[name].total++;
+          if (assertion.status === 'passed') {
+            testStats[name].passed++;
+          }
+        }
+      }
+    } catch (error) {
+      console.error(`Error processing report at ${reportPath}:`, error.message);
+    }
+  }
+  return statsByModel;
+}
+
+/**
+ * Fetches historical nightly data using the GitHub CLI.
+ * @returns {Array<{runId: string, stats: Record<string, any>}>} history
+ */
+export function fetchNightlyHistory(lookbackCount) {
+  const history = [];
+  try {
+    const cmd = `gh run list --workflow evals-nightly.yml --branch main --limit ${
+      lookbackCount + 2
+    } --json databaseId,status`;
+    const runsJson = execSync(cmd, { encoding: 'utf-8' });
+    let runs = JSON.parse(runsJson);
+
+    // Filter for completed runs and take the top N
+    runs = runs.filter((r) => r.status === 'completed').slice(0, lookbackCount);
+
+    for (const run of runs) {
+      const tmpDir = fs.mkdtempSync(
+        path.join(os.tmpdir(), `gemini-evals-hist-${run.databaseId}-`),
+      );
+      try {
+        execSync(
+          `gh run download ${run.databaseId} -p "eval-logs-*" -D "${tmpDir}"`,
+          { stdio: 'ignore' },
+        );
+
+        const runReports = findReports(tmpDir);
+        if (runReports.length > 0) {
+          history.push({
+            runId: run.databaseId,
+            stats: getStatsFromReports(runReports),
+          });
+        }
+      } catch (error) {
+        console.error(
+          `Failed to process artifacts for run ${run.databaseId}:`,
+          error.message,
+        );
+      } finally {
+        fs.rmSync(tmpDir, { recursive: true, force: true });
+      }
+    }
+  } catch (error) {
+    console.error('Failed to fetch history:', error.message);
+  }
+  return history;
+}
@@ -195,7 +195,7 @@ function doesVersionExist({ args, version } = {}) {
      console.error(`Version ${version} already exists on NPM.`);
      return true;
    }
-  } catch (_error) {
+  } catch {
    // This is expected if the version doesn't exist.
  }

@@ -285,7 +285,7 @@ function promoteNightlyVersion({ args } = {}) {
  const date = new Date().toISOString().slice(0, 10).replace(/-/g, '');
  const gitShortHash = execSync('git rev-parse --short HEAD').toString().trim();
  return {
-    releaseVersion: `${major}.${nextMinor}.0-nightly.${date}.${gitShortHash}`,
+    releaseVersion: `${major}.${nextMinor}.0-nightly.${date}.g${gitShortHash}`,
    npmTag: TAG_NIGHTLY,
    previousReleaseTag: previousNightlyTag,
  };
@@ -296,7 +296,7 @@ function getNightlyVersion() {
  const baseVersion = packageJson.version.split('-')[0];
  const date = new Date().toISOString().slice(0, 10).replace(/-/g, '');
  const gitShortHash = execSync('git rev-parse --short HEAD').toString().trim();
-  const releaseVersion = `${baseVersion}-nightly.${date}.${gitShortHash}`;
+  const releaseVersion = `${baseVersion}-nightly.${date}.g${gitShortHash}`;
  const previousReleaseTag = getLatestTag('v*-nightly*');

  return {
@@ -0,0 +1,125 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Identifies "Trustworthy" behavioral evaluations from nightly history.
+ *
+ * This script analyzes the last 6 days of nightly runs to find tests that meet
+ * strict stability criteria (80% aggregate pass rate and 60% daily floor).
+ * It outputs a list of files and a Vitest pattern used by the PR regression check
+ * to ensure high-signal validation and minimize noise.
+ */
+
+import { fetchNightlyHistory, escapeRegex } from './eval_utils.js';
+
+const LOOKBACK_COUNT = 6;
+const MIN_VALID_RUNS = 5; // At least 5 out of 6 must be available
+const PASS_RATE_THRESHOLD = 0.6; // Daily floor (e.g., 2/3)
+const AGGREGATE_PASS_RATE_THRESHOLD = 0.8; // Weekly signal (e.g., 15/18)
+
+/**
+ * Main execution logic.
+ */
+function main() {
+  const targetModel = process.argv[2];
+  if (!targetModel) {
+    console.error('❌ Error: No target model specified.');
+    process.exit(1);
+  }
+  console.error(`🔍 Identifying trustworthy evals for model: ${targetModel}`);
+
+  const history = fetchNightlyHistory(LOOKBACK_COUNT);
+  if (history.length === 0) {
+    console.error('❌ No historical data found.');
+    process.exit(1);
+  }
+
+  // Aggregate results for the target model across all history
+  const testHistories = {}; // { [testName]: { totalPassed: 0, totalRuns: 0, dailyRates: [], file: string } }
+
+  for (const item of history) {
+    const modelStats = item.stats[targetModel];
+    if (!modelStats) continue;
+
+    for (const [testName, stat] of Object.entries(modelStats)) {
+      if (!testHistories[testName]) {
+        testHistories[testName] = {
+          totalPassed: 0,
+          totalRuns: 0,
+          dailyRates: [],
+          file: stat.file,
+        };
+      }
+      testHistories[testName].totalPassed += stat.passed;
+      testHistories[testName].totalRuns += stat.total;
+      testHistories[testName].dailyRates.push(stat.passed / stat.total);
+    }
+  }
+
+  const trustworthyTests = [];
+  const trustworthyFiles = new Set();
+  const volatileTests = [];
+  const newTests = [];
+
+  for (const [testName, info] of Object.entries(testHistories)) {
+    const dailyRates = info.dailyRates;
+    const aggregateRate = info.totalPassed / info.totalRuns;
+
+    // 1. Minimum data points required
+    if (dailyRates.length < MIN_VALID_RUNS) {
+      newTests.push(testName);
+      continue;
+    }
+
+    // 2. Trustworthy Criterion:
+    // - Every single day must be above the floor (e.g. > 60%)
+    // - The overall aggregate must be high-signal (e.g. > 80%)
+    const isDailyStable = dailyRates.every(
+      (rate) => rate > PASS_RATE_THRESHOLD,
+    );
+    const isAggregateHighSignal = aggregateRate > AGGREGATE_PASS_RATE_THRESHOLD;
+
+    if (isDailyStable && isAggregateHighSignal) {
+      trustworthyTests.push(testName);
+      if (info.file) {
+        const match = info.file.match(/evals\/.*\.eval\.ts/);
+        if (match) {
+          trustworthyFiles.add(match[0]);
+        }
+      }
+    } else {
+      volatileTests.push(testName);
+    }
+  }
+
+  console.error(
+    `✅ Found ${trustworthyTests.length} trustworthy tests across ${trustworthyFiles.size} files:`,
+  );
+  trustworthyTests.sort().forEach((name) => console.error(`   - ${name}`));
+  console.error(`\n⚪ Ignored ${volatileTests.length} volatile tests.`);
+  console.error(
+    `🆕 Ignored ${newTests.length} tests with insufficient history.`,
+  );
+
+  // Output the list of names as a regex-friendly pattern for vitest -t
+  const pattern = trustworthyTests.map((name) => escapeRegex(name)).join('|');
+
+  // Also output unique file paths as a space-separated string
+  const files = Array.from(trustworthyFiles).join(' ');
+
+  // Print the combined output to stdout for use in shell scripts (only if piped/CI)
+  if (!process.stdout.isTTY) {
+    // Format: FILE_LIST --test-pattern TEST_PATTERN
+    // This allows the workflow to easily use it
+    process.stdout.write(`${files} --test-pattern ${pattern || ''}\n`);
+  } else {
+    console.error(
+      '\n💡 Note: Raw regex pattern and file list are hidden in interactive terminal. It will be printed when piped or in CI.',
+    );
+  }
+}
+
+main();
@@ -177,7 +177,7 @@ function runCommand(command, stdio = 'inherit') {
    ].join(sep);
    execSync(command, { stdio, env, shell: true });
    return true;
-  } catch (_e) {
+  } catch {
    return false;
  }
 }
@@ -267,7 +267,7 @@ export function runSensitiveKeywordLinter() {
        .trim()
        .split('\n')
        .filter(Boolean);
-    } catch (_error) {
+    } catch {
      console.error(`Could not get changed files against origin/${baseRef}.`);
      try {
        console.log('Falling back to diff against HEAD~1');
@@ -276,7 +276,7 @@ export function runSensitiveKeywordLinter() {
          .trim()
          .split('\n')
          .filter(Boolean);
-      } catch (_fallbackError) {
+      } catch {
        console.error('Could not get changed files against HEAD~1 either.');
        process.exit(1);
      }
@@ -105,11 +105,11 @@ async function main() {
  try {
    execSync('pkill -f "otelcol-contrib"');
    console.log('✅ Stopped existing otelcol-contrib process.');
-  } catch (_e) {} // eslint-disable-line no-empty
+  } catch {} // eslint-disable-line no-empty
  try {
    execSync('pkill -f "jaeger"');
    console.log('✅ Stopped existing jaeger process.');
-  } catch (_e) {} // eslint-disable-line no-empty
+  } catch {} // eslint-disable-line no-empty
  try {
    if (fileExists(OTEL_LOG_FILE)) fs.unlinkSync(OTEL_LOG_FILE);
    console.log('✅ Deleted old collector log.');
@@ -155,7 +155,7 @@ async function main() {
  try {
    await waitForPort(JAEGER_PORT);
    console.log(`✅ Jaeger started successfully.`);
-  } catch (_) {
+  } catch {
    console.error(`🛑 Error: Jaeger failed to start on port ${JAEGER_PORT}.`);
    if (jaegerProcess && jaegerProcess.pid) {
      process.kill(jaegerProcess.pid, 'SIGKILL');
@@ -180,7 +180,7 @@ async function main() {
  try {
    await waitForPort(4317);
    console.log(`✅ OTEL collector started successfully.`);
-  } catch (_) {
+  } catch {
    console.error(`🛑 Error: OTEL collector failed to start on port 4317.`);
    if (collectorProcess && collectorProcess.pid) {
      process.kill(collectorProcess.pid, 'SIGKILL');
@@ -180,7 +180,7 @@ async function main() {
          // Re-throw if it's not a conflict error
          throw error;
        }
-      } catch (_statusError) {
+      } catch {
        // Re-throw original error if we can't determine the status
        throw error;
      }
@@ -268,7 +268,7 @@ function branchExists(branchName) {
  try {
    execSync(`git ls-remote --exit-code --heads origin ${branchName}`);
    return true;
-  } catch (_e) {
+  } catch {
    return false;
  }
 }
@@ -374,7 +374,7 @@ No output was generated during patch creation.
      // Clean up temp file
      try {
        unlinkSync(tmpFile);
-      } catch (_e) {
+      } catch {
        // Ignore cleanup errors
      }
    }
@@ -0,0 +1,107 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Orchestrates the PR evaluation process across multiple models.
+ *
+ * This script loops through a provided list of models, identifies trustworthy
+ * tests for each, executes the frugal regression check, and collects results
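+ * into a single unified report. Confirmed regressions are logged and written to
+ * the report, but the script still exits with code 0; it exits with code 1 only
+ * when an unexpected error escapes the orchestration itself.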
+ */
+
+import { execFileSync, execSync } from 'node:child_process';
+import fs from 'node:fs';
+
+/**
+ * Main execution logic.
+ */
+async function main() {
+  const modelList = process.env.MODEL_LIST || 'gemini-3-flash-preview';
+  const models = modelList.split(',').map((m) => m.trim());
+
+  let combinedReport = '';
+  let hasRegression = false;
+
+  console.log(
+    `🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
+  );
+
+  for (const model of models) {
+    console.log(`\n--- Processing Model: ${model} ---`);
+
+    try {
+      // 1. Identify Trustworthy Evals
+      console.log(`🔍 Identifying trustworthy tests for ${model}...`);
+      const output = execSync(
+        `node scripts/get_trustworthy_evals.js "${model}"`,
+        {
+          encoding: 'utf-8',
+          stdio: ['inherit', 'pipe', 'inherit'], // Capture stdout but pass stdin/stderr
+        },
+      ).trim();
+
+      if (!output) {
+        console.log(`ℹ️ No trustworthy tests found for ${model}. Skipping.`);
+        continue;
+      }
+
+      // 2. Run Frugal Regression Check
+      console.log(`🧪 Running regression check for ${model}...`);
+      execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, {
+        stdio: 'inherit',
+      });
+
+      // 3. Generate Report
+      console.log(`📊 Generating report for ${model}...`);
+      const report = execSync(`node scripts/compare_evals.js "${model}"`, {
+        encoding: 'utf-8',
+        stdio: ['inherit', 'pipe', 'inherit'],
+      }).trim();
+
+      if (report) {
+        if (combinedReport) {
+          combinedReport += '\n\n---\n\n';
+        }
+        combinedReport += report;
+
+        // 4. Check for Regressions
+        // If the report contains the "Action Required" marker, it means a confirmed regression was found.
+        if (report.includes('Action Required')) {
+          hasRegression = true;
+        }
+      }
+    } catch (error) {
+      console.error(`❌ Error processing model ${model}:`, error.message);
+      // We flag a failure if any model encountered a critical error
+      hasRegression = true;
+    }
+  }
+
+  // Always save the combined report to a file so the workflow can capture it cleanly
+  if (combinedReport) {
+    fs.writeFileSync('eval_regression_report.md', combinedReport);
+    console.log(
+      '\n📊 Final Markdown report saved to eval_regression_report.md',
+    );
+  }
+
+  // Log status for CI visibility, but don't exit with error
+  if (hasRegression) {
+    console.error(
+      '\n⚠️ Confirmed regressions detected across one or more models. See PR comment for details.',
+    );
+  } else {
+    console.log('\n✅ All evaluations passed successfully (or were cleared).');
+  }
+
+  process.exit(0);
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
@@ -0,0 +1,305 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Executes a high-signal regression check for behavioral evaluations.
+ *
+ * This script runs a targeted set of stable tests in an optimistic first pass.
+ * If failures occur, it employs a "Best-of-4" retry logic to handle natural flakiness.
+ * For confirmed failures (three failed attempts), it performs Dynamic Baseline Verification by
+ * checking the failure against the 'main' branch to distinguish between
+ * model drift and PR-introduced regressions.
+ */
+
+import { execSync } from 'node:child_process';
+import fs from 'node:fs';
+import path from 'node:path';
+import { quote } from 'shell-quote';
+import { escapeRegex } from './eval_utils.js';
+
+/**
+ * Runs a set of tests using Vitest and returns the results.
+ */
+function runTests(files, pattern, model) {
+  const outputDir = path.resolve(
+    process.cwd(),
+    `evals/logs/pr-run-${Date.now()}`,
+  );
+  fs.mkdirSync(outputDir, { recursive: true });
+
+  const filesToRun = files || 'evals/';
+  console.log(
+    `🚀 Running tests in ${filesToRun} with pattern: ${pattern?.slice(0, 100)}...`,
+  );
+
+  try {
+    const cmd = `npx vitest run --config evals/vitest.config.ts ${filesToRun} -t "${pattern}" --reporter=json --reporter=default --outputFile="${path.join(outputDir, 'report.json')}"`;
+    execSync(cmd, {
+      stdio: 'inherit',
+      env: { ...process.env, RUN_EVALS: '1', GEMINI_MODEL: model },
+    });
+  } catch {
+    // Vitest returns a non-zero exit code when tests fail. This is expected.
+    // We continue execution and handle the failures by parsing the JSON report.
+  }
+
+  const reportPath = path.join(outputDir, 'report.json');
+  return fs.existsSync(reportPath)
+    ? JSON.parse(fs.readFileSync(reportPath, 'utf-8'))
+    : null;
+}
+
+/**
+ * Helper to find a specific assertion by name across all test files.
+ */
+function findAssertion(report, testName) {
+  if (!report?.testResults) return null;
+  for (const fileResult of report.testResults) {
+    const assertion = fileResult.assertionResults.find(
+      (a) => a.title === testName,
+    );
+    if (assertion) return assertion;
+  }
+  return null;
+}
+
+/**
+ * Parses command line arguments to identify model, files, and test pattern.
+ */
+function parseArgs() {
+  const modelArg = process.argv[2];
+  const remainingArgs = process.argv.slice(3);
+  const fullArgsString = remainingArgs.join(' ');
+  const testPatternIndex = remainingArgs.indexOf('--test-pattern');
+
+  if (testPatternIndex !== -1) {
+    return {
+      model: modelArg,
+      files: remainingArgs.slice(0, testPatternIndex).join(' '),
+      pattern: remainingArgs.slice(testPatternIndex + 1).join(' '),
+    };
+  }
+
+  if (fullArgsString.includes('--test-pattern')) {
+    const parts = fullArgsString.split('--test-pattern');
+    return {
+      model: modelArg,
+      files: parts[0].trim(),
+      pattern: parts[1].trim(),
+    };
+  }
+
+  // Fallback for manual mode: Pattern Model
+  const manualPattern = process.argv[2];
+  const manualModel = process.argv[3];
+  if (!manualModel) {
+    console.error('❌ Error: No target model specified.');
+    process.exit(1);
+  }
+
+  let manualFiles = 'evals/';
+  try {
+    const grepResult = execSync(
+      `grep -l ${quote([manualPattern])} evals/*.eval.ts`,
+      { encoding: 'utf-8' },
+    );
+    manualFiles = grepResult.split('\n').filter(Boolean).join(' ');
+  } catch {
+    // Grep returns exit code 1 if no files match the pattern.
+    // In this case, we fall back to scanning all files in the evals/ directory.
+  }
+
+  return {
+    model: manualModel,
+    files: manualFiles,
+    pattern: manualPattern,
+    isManual: true,
+  };
+}
+
+/**
+ * Runs the targeted retry logic (Best-of-4) for a failing test.
+ */
+async function runRetries(testName, results, files, model) {
+  console.log(`\nRe-evaluating: ${testName}`);
+
+  while (
+    results[testName].passed < 2 &&
+    results[testName].total - results[testName].passed < 3 &&
+    results[testName].total < 4
+  ) {
+    const attemptNum = results[testName].total + 1;
+    console.log(`  Running attempt ${attemptNum}...`);
+
+    const retry = runTests(files, escapeRegex(testName), model);
+    const retryAssertion = findAssertion(retry, testName);
+
+    results[testName].total++;
+    if (retryAssertion?.status === 'passed') {
+      results[testName].passed++;
+      console.log(
+        `  ✅ Attempt ${attemptNum} passed. Score: ${results[testName].passed}/${results[testName].total}`,
+      );
+    } else {
+      console.log(
+        `  ❌ Attempt ${attemptNum} failed (${retryAssertion?.status || 'unknown'}). Score: ${results[testName].passed}/${results[testName].total}`,
+      );
+    }
+
+    if (results[testName].passed >= 2) {
+      console.log(
+        `  ✅ Test cleared as Noisy Pass (${results[testName].passed}/${results[testName].total})`,
+      );
+    } else if (results[testName].total - results[testName].passed >= 3) {
+      await verifyBaseline(testName, results, files, model);
+    }
+  }
+}
+
+/**
+ * Verifies a potential regression against the 'main' branch.
+ */
+async function verifyBaseline(testName, results, files, model) {
+  console.log('\n--- Step 3: Dynamic Baseline Verification ---');
+  console.log(
+    `⚠️ Potential regression detected. Verifying baseline on 'main'...`,
+  );
+
+  try {
+    execSync('git stash push -m "eval-regression-check-stash"', {
+      stdio: 'inherit',
+    });
+    const hasStash = execSync('git stash list')
+      .toString()
+      .includes('eval-regression-check-stash');
+    execSync('git checkout main', { stdio: 'inherit' });
+
+    console.log(
+      `\n--- Running Baseline Verification on 'main' (Best-of-3) ---`,
+    );
+    let baselinePasses = 0;
+    let baselineTotal = 0;
+
+    while (baselinePasses === 0 && baselineTotal < 3) {
+      baselineTotal++;
+      console.log(`  Baseline Attempt ${baselineTotal}...`);
+      const baselineRun = runTests(files, escapeRegex(testName), model);
+      if (findAssertion(baselineRun, testName)?.status === 'passed') {
+        baselinePasses++;
+        console.log(`  ✅ Baseline Attempt ${baselineTotal} passed.`);
+      } else {
+        console.log(`  ❌ Baseline Attempt ${baselineTotal} failed.`);
+      }
+    }
+
+    execSync('git checkout -', { stdio: 'inherit' });
+    if (hasStash) execSync('git stash pop', { stdio: 'inherit' });
+
+    if (baselinePasses === 0) {
+      console.log(
+        `  ℹ️ Test also fails on 'main'. Marking as PRE-EXISTING (Cleared).`,
+      );
+      results[testName].status = 'pre-existing';
+      results[testName].passed = results[testName].total; // Clear for report
+    } else {
+      console.log(
+        `  ❌ Test passes on 'main' but fails in PR. Marking as CONFIRMED REGRESSION.`,
+      );
+      results[testName].status = 'regression';
+    }
+  } catch (error) {
+    console.error(`  ❌ Failed to verify baseline: ${error.message}`);
+
+    // Best-effort cleanup: try to return to the original branch.
+    try {
+      execSync('git checkout -', { stdio: 'ignore' });
+    } catch {
+      // Ignore checkout errors during cleanup to avoid hiding the original error.
+    }
+  }
+}
+
+/**
+ * Processes initial results and orchestrates retries/baseline checks.
+ */
+async function processResults(firstPass, pattern, model, files) {
+  if (!firstPass) return false;
+
+  const results = {};
+  const failingTests = [];
+  let totalProcessed = 0;
+
+  for (const fileResult of firstPass.testResults) {
+    for (const assertion of fileResult.assertionResults) {
+      if (assertion.status !== 'passed' && assertion.status !== 'failed') {
+        continue;
+      }
+
+      const name = assertion.title;
+      results[name] = {
+        passed: assertion.status === 'passed' ? 1 : 0,
+        total: 1,
+        file: fileResult.name,
+      };
+      if (assertion.status === 'failed') failingTests.push(name);
+      totalProcessed++;
+    }
+  }
+
+  if (totalProcessed === 0) {
+    console.error('❌ Error: No matching tests were found or executed.');
+    return false;
+  }
+
+  if (failingTests.length === 0) {
+    console.log('✅ All trustworthy tests passed on the first try!');
+  } else {
+    console.log('\n--- Step 2: Best-of-4 Retries ---');
+    console.log(
+      `⚠️ ${failingTests.length} tests failed the optimistic run. Starting retries...`,
+    );
+    for (const testName of failingTests) {
+      await runRetries(testName, results, files, model);
+    }
+  }
+
+  saveResults(results);
+  return true;
+}
+
+function saveResults(results) {
+  const finalReport = { timestamp: new Date().toISOString(), results };
+  fs.writeFileSync(
+    'evals/logs/pr_final_report.json',
+    JSON.stringify(finalReport, null, 2),
+  );
+  console.log('\nFinal report saved to evals/logs/pr_final_report.json');
+}
+
+async function main() {
+  const { model, files, pattern, isManual } = parseArgs();
+
+  if (isManual) {
+    const firstPass = runTests(files, pattern, model);
+    const success = await processResults(firstPass, pattern, model, files);
+    process.exit(success ? 0 : 1);
+  }
+
+  if (!pattern) {
+    console.log('No trustworthy tests to run.');
+    process.exit(0);
+  }
+
+  console.log('\n--- Step 1: Optimistic Run (N=1) ---');
+  const firstPass = runTests(files, pattern, model);
+  const success = await processResults(firstPass, pattern, model, files);
+  process.exit(success ? 0 : 1);
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
@@ -32,7 +32,7 @@ function runCommand(command) {
      stdio: ['ignore', 'pipe', 'ignore'],
      maxBuffer: 10 * 1024 * 1024,
    });
-  } catch (_e) {
+  } catch {
    return null;
  }
 }
@@ -118,7 +118,7 @@ async function main() {
  try {
    execSync('pkill -f "otelcol-contrib"');
    console.log('✅ Stopped existing otelcol-contrib process.');
-  } catch (_e) {
+  } catch {
    /* no-op */
  }
  try {
@@ -438,7 +438,7 @@ export function registerCleanup(
      if (fd) {
        try {
          fs.closeSync(fd);
-        } catch (_) {
+        } catch {
          /* no-op */
        }
      }
@@ -93,7 +93,7 @@ describe('getVersion', () => {
      vi.mocked(execSync).mockImplementation(mockExecSync);
      const result = getVersion({ type: 'nightly' });
      // Note: The base version now comes from package.json, not the previous nightly tag.
-      expect(result.releaseVersion).toBe('0.8.0-nightly.20250917.d3bf8a3d');
+      expect(result.releaseVersion).toBe('0.8.0-nightly.20250917.gd3bf8a3d');
      expect(result.npmTag).toBe('nightly');
      expect(result.previousReleaseTag).toBe('v0.8.0-nightly.20250916.abcdef');
    });
@@ -191,5 +191,19 @@ describe('getVersion', () => {
      // Should have skipped preview.0 and landed on preview.1
      expect(result.releaseVersion).toBe('0.8.0-preview.1');
    });
+
+    it('should preserve a git hash with a leading zero via the g prefix', () => {
+      const mockWithLeadingZeroHash = (command) => {
+        // Return an all-numeric hash with a leading zero
+        if (command.includes('git rev-parse --short HEAD')) return '017972622';
+        return mockExecSync(command);
+      };
+      vi.mocked(execSync).mockImplementation(mockWithLeadingZeroHash);
+
+      const result = getVersion({ type: 'nightly' });
+      // The 'g' prefix forces semver to treat this as an alphanumeric
+      // identifier, preventing it from stripping the leading zero.
+      expect(result.releaseVersion).toBe('0.8.0-nightly.20250917.g017972622');
+    });
  });
 });