feat(evals): implement production-grade PR impact analysis with isolation and policy-aware filtering

2026-05-16 14:53:19 -07:00 · 2026-03-19 14:23:40 -07:00
parent 8f9f412327
commit c1083b91c6
2 changed files with 114 additions and 44 deletions
@@ -34,24 +34,41 @@ jobs:
      - name: 'Build project'
        run: 'npm run build'

-      - name: 'Run Evals (3 Attempts)'
+      - name: 'Run Evals (3 Attempts with Clean State)'
        env:
          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
          RUN_EVALS: 'true'
        run: |
          MODELS=("gemini-3.1-pro-preview-customtools" "gemini-3-flash-preview")
-          mkdir -p evals/logs
+          # Stage logs outside the workspace so `git clean` cannot remove them.
+          # Prefer the runner's per-job temp dir and always start from an empty
+          # directory, so reports left behind by an earlier run on a reused
+          # runner can never be picked up by the aggregation script.
+          FINAL_LOGS_DIR="${RUNNER_TEMP:-/tmp}/eval-impact-logs"
+          rm -rf "$FINAL_LOGS_DIR"
+          mkdir -p "$FINAL_LOGS_DIR"
          
          for model in "${MODELS[@]}"; do
            for attempt in {1..3}; do
              echo "::group::Running $model (Attempt $attempt)"
              DIR_NAME="eval-logs-$model-$attempt"
-              mkdir -p "evals/logs/$DIR_NAME"
-              # Run sequentially to keep one clean job in the UI
-              GEMINI_MODEL=$model npm run test:all_evals -- --outputFile.json="evals/logs/$DIR_NAME/report.json" || true
+              
+              # Run the tests
+              GEMINI_MODEL=$model npm run test:all_evals -- --outputFile.json="report.json" || true
+              
+              # Move the report to the persistent location
+              mkdir -p "$FINAL_LOGS_DIR/$DIR_NAME"
+              if [ -f "report.json" ]; then
+                mv report.json "$FINAL_LOGS_DIR/$DIR_NAME/report.json"
+              fi
+              
+              # FORCE CLEAN: remove untracked and ignored files left by the run,
+              # but keep installed dependencies. A bare `git clean -xfd` also
+              # deletes node_modules, which would make the rebuild below (and
+              # every following attempt) fail.
+              git clean -xfd -e node_modules
+              npm run build
+              
              echo "::endgroup::"
            done
          done
+          
+          # Move all logs back into the workspace for the aggregation script
+          mkdir -p evals/logs
+          cp -r "$FINAL_LOGS_DIR"/* evals/logs/

      - name: 'Generate Impact Report'
        id: 'generate-report'
@@ -66,6 +83,11 @@ jobs:
          echo "<!-- eval-impact-report -->" > report.md
          node scripts/aggregate_evals.js evals/logs --compare-main --pr-comment >> report.md
          cat report.md >> "$GITHUB_STEP_SUMMARY"
+          
+          # Only the report's overall status line marks a blocking regression.
+          # Individual table rows also use the red marker for informational
+          # (USUALLY_PASSES) drops, so matching the bare emoji would block PRs
+          # that the aggregation script itself reported as stable.
+          if grep -qF "Status: 🔴 Regression Detected (Blocking)" report.md; then
+            echo "BLOCKER_DETECTED=true" >> "$GITHUB_ENV"
+          fi

      - name: 'Comment on PR'
        if: 'always()'
@@ -83,3 +105,9 @@ jobs:
          else
            gh pr comment $PR_NUMBER --body-file report.md
          fi
+
+      # Fails the job when the report step flagged a blocking regression.
+      # BLOCKER_DETECTED is exported through GITHUB_ENV by the report step.
+      - name: 'Block PR on Stable Regression'
+        if: "env.BLOCKER_DETECTED == 'true'"
+        run: |
+          # Blocking covers ALWAYS_PASSES failures and large drops in highly
+          # stable USUALLY_PASSES evals, so the message must not name only one.
+          echo "::error::Blocking regressions detected in behavioral evaluations. See the impact report in the job summary."
+          exit 1
@@ -16,6 +16,28 @@ const artifactsDir = args.find((arg) => !arg.startsWith('--')) || '.';
 const isPrComment = args.includes('--pr-comment');
 const MAX_HISTORY = 7;

+/**
+ * Builds a map of eval test name -> pass policy by scanning the eval sources.
+ *
+ * Reads every `*.eval.ts` file directly inside `evals/` (resolved against the
+ * current working directory) and collects each
+ * `evalTest('<POLICY>', { name: '<test name>' ...` declaration whose policy is
+ * `ALWAYS_PASSES` or `USUALLY_PASSES`. If a name is declared more than once,
+ * the last declaration wins.
+ *
+ * Extraction is best-effort: on any error a warning is written to stderr
+ * (stdout carries the generated report) and whatever was collected so far is
+ * returned.
+ *
+ * @returns {Record<string, string>} Test name mapped to its policy string.
+ */
+function getTestPolicies() {
+  const policies = {};
+  // The name's closing quote must be the same character that opened it (\2),
+  // so an apostrophe inside a double-quoted name no longer truncates the name.
+  const declaration =
+    /evalTest\s*\(\s*['"](ALWAYS_PASSES|USUALLY_PASSES)['"]\s*,\s*\{\s*name:\s*(['"`])((?:\\.|(?!\2)[^\\\n])+)\2/g;
+  try {
+    const evalFiles = fs
+      .readdirSync('evals')
+      .filter((f) => f.endsWith('.eval.ts'));
+    for (const file of evalFiles) {
+      const content = fs.readFileSync(path.join('evals', file), 'utf-8');
+      for (const [, policy, , rawName] of content.matchAll(declaration)) {
+        // Drop the backslash from escaped quotes/backslashes so the key equals
+        // the test name as it appears at runtime.
+        policies[rawName.replace(/\\(['"`\\])/g, '$1')] = policy;
+      }
+    }
+  } catch (err) {
+    // Callers fall back to USUALLY_PASSES for unknown tests, so a failure here
+    // weakens blocking; surface it instead of swallowing it silently.
+    console.warn(
+      `Warning: could not extract eval policies: ${err instanceof Error ? err.message : err}`,
+    );
+  }
+  return policies;
+}
+
 // Find all report.json files recursively
 function findReports(dir) {
  const reports = [];
@@ -131,8 +153,6 @@ function fetchHistoricalData() {
        if (runReports.length > 0) {
          const stats = getStats(runReports);

-          // --- Infrastructure Failure Check ---
-          // If the overall pass rate for this run is 0%, ignore it as a "poisoned" baseline.
          let totalPassed = 0;
          let totalTests = 0;
          Object.values(stats).forEach((modelStats) => {
@@ -164,6 +184,7 @@ function fetchHistoricalData() {
 function generateMarkdown(currentStatsByModel, history) {
  const reversedHistory = [...history].reverse();
  const models = Object.keys(currentStatsByModel).sort();
+  const policies = getTestPolicies();

  const getConsolidatedBaseline = (model) => {
    const consolidated = {};
@@ -202,40 +223,46 @@ function generateMarkdown(currentStatsByModel, history) {
  if (isPrComment) {
    console.log('### 🤖 Model Steering Impact Report\n');

-    let overallRegression = false;
+    let blockerRegression = false;
    for (const model of models) {
      const currentStats = currentStatsByModel[model];
      const baselineStats = getConsolidatedBaseline(model);
      for (const [name, curr] of Object.entries(currentStats)) {
+        const policy = policies[name] || 'USUALLY_PASSES';
+        const currRate = (curr.passed / curr.total) * 100;
        const base = baselineStats ? baselineStats[name] : null;
-        if (base) {
-          const delta =
-            (curr.passed / curr.total) * 100 - (base.passed / base.total) * 100;
-          if (delta < -15) overallRegression = true;
+        const baseRate = base ? (base.passed / base.total) * 100 : null;
+
+        if (policy === 'ALWAYS_PASSES' && currRate < 100) {
+          blockerRegression = true;
+        } else if (
+          policy === 'USUALLY_PASSES' &&
+          baseRate !== null &&
+          baseRate > 90 &&
+          currRate < 60
+        ) {
+          blockerRegression = true; // Significant drop in a highly stable test
        }
      }
    }

-    if (overallRegression) {
-      console.log('**Status: ⚠️ Investigation Recommended**\n');
+    if (blockerRegression) {
+      console.log('**Status: 🔴 Regression Detected (Blocking)**\n');
      console.log(
-        'This PR modifies core prompt or tool logic and has introduced significant regressions in behavioral stability. Please review the delta below.\n',
+        'This PR has introduced regressions in stable behavioral evaluations. These must be resolved before merging.\n',
      );
    } else {
      console.log('**Status: ✅ Stable**\n');
      console.log(
-        'This PR modifies core prompt or tool logic. Behavioral evaluations remain stable compared to the `main` baseline.\n',
+        'Behavioral evaluations remain stable compared to the `main` baseline.\n',
      );
    }

    console.log(
-      `> **Note:** The baseline is an average of the last ${history.length} healthy nightly runs on \`main\` (ignoring infrastructure failures).\n`,
+      `> **Note:** Baseline is averaged from the last ${history.length} healthy nightly runs on \`main\`.\n`,
    );
  } else {
    console.log('### Evals Nightly Summary\n');
-    console.log(
-      'See [evals/README.md](https://github.com/google-gemini/gemini-cli/tree/main/evals) for more details.\n',
-    );
  }

  for (const model of models) {
@@ -255,24 +282,48 @@ function generateMarkdown(currentStatsByModel, history) {
    let stableCount = 0;

    for (const name of Array.from(allTestNames).sort()) {
+      const policy = policies[name] || 'USUALLY_PASSES';
      const searchUrl = `https://github.com/search?q=repo%3Agoogle-gemini%2Fgemini-cli%20%22${encodeURIComponent(name)}%22&type=code`;
      const curr = currentStats[name];
      const base = baselineStats ? baselineStats[name] : null;

      const currRate = curr ? (curr.passed / curr.total) * 100 : null;
      const baseRate = base ? (base.passed / base.total) * 100 : null;
-
      const delta =
        currRate !== null && baseRate !== null ? currRate - baseRate : null;
-      const isInteresting =
-        currRate === null || baseRate === null || Math.abs(delta) >= 15;
+
+      // Smart Noise Filtering
+      let status = '⚪ Stable';
+      let isInteresting = false;
+
+      if (policy === 'ALWAYS_PASSES') {
+        if (currRate !== null && currRate < 100) {
+          status = '🔴 Regression';
+          isInteresting = true;
+        }
+      } else {
+        // USUALLY_PASSES: Only interesting if drop is > 30% OR it's a new failure
+        if (delta !== null && delta < -30) {
+          status = '🔴 Regression';
+          isInteresting = true;
+        } else if (delta !== null && delta > 30) {
+          status = '🟢 Improved';
+          isInteresting = true;
+        } else if (baseRate !== null && baseRate > 80 && currRate === 0) {
+          status = '🔴 Regression';
+          isInteresting = true;
+        }
+      }
+
+      // Always show new or missing tests
+      if (currRate === null || baseRate === null) isInteresting = true;

      if (isPrComment && !isInteresting) {
        stableCount++;
        continue;
      }

-      let row = `| [${name}](${searchUrl}) |`;
+      let row = `| [${name}](${searchUrl}) | ${policy === 'ALWAYS_PASSES' ? '🔒' : '🎲'} |`;

      if (!isPrComment) {
        for (const item of reversedHistory) {
@@ -280,18 +331,10 @@ function generateMarkdown(currentStatsByModel, history) {
          row += ` ${stat ? ((stat.passed / stat.total) * 100).toFixed(0) + '%' : '-'} |`;
        }
      } else if (baselinePassRate !== null) {
-        row += ` ${formatPassRate(baseRate)} (${base?.total || 0}n) |`;
+        row += ` ${formatPassRate(baseRate)} |`;
      }

-      row += ` ${formatPassRate(currRate)} (${curr?.total || 0}n) |`;
-
-      if (delta !== null) {
-        if (delta > 10) row += ` 🟢 +${delta.toFixed(0)}% |`;
-        else if (delta < -15) row += ` 🔴 ${delta.toFixed(0)}% |`;
-        else row += ' ⚪ Stable |';
-      } else {
-        row += ' - |';
-      }
+      row += ` ${formatPassRate(currRate)} | ${status} |`;
      rows.push(row);
    }

@@ -304,19 +347,15 @@ function generateMarkdown(currentStatsByModel, history) {
      console.log(
        `**Pass Rate: ${formatPassRate(currentPassRate)}** vs. ${formatPassRate(baselinePassRate)} Baseline${deltaStr}\n`,
      );
-    } else if (!isPrComment) {
-      console.log(`**Total Pass Rate: ${formatPassRate(currentPassRate)}**\n`);
    }

    if (isPrComment && rows.length === 0) {
-      console.log(
-        '✅ No interesting behavioral shifts detected for this model.\n',
-      );
+      console.log('✅ All behavioral evaluations are stable.\n');
      continue;
    }

-    let header = '| Test Name |';
-    let separator = '| :--- |';
+    let header = `| Test Name | Policy |`;
+    let separator = `| :--- | :---: |`;

    if (!isPrComment) {
      for (const item of reversedHistory) {
@@ -324,7 +363,7 @@ function generateMarkdown(currentStatsByModel, history) {
        separator += ' :---: |';
      }
    } else if (baselinePassRate !== null) {
-      header += ' Baseline (Avg) |';
+      header += ' Baseline |';
      separator += ' :---: |';
    }

@@ -337,7 +376,7 @@ function generateMarkdown(currentStatsByModel, history) {

    if (isPrComment && stableCount > 0) {
      console.log(
-        `\n> **Note:** ${stableCount} stable tests were hidden from this report to reduce noise.\n`,
+        `\n> **Note:** ${stableCount} stable tests were hidden from this report.\n`,
      );
    }
    console.log('\n');
@@ -345,7 +384,10 @@ function generateMarkdown(currentStatsByModel, history) {

  if (isPrComment) {
    console.log(
-      '---\n💡 To investigate regressions locally, run: `gemini /fix-behavioral-eval`',
+      '---\n💡 **Policy Key:** 🔒 `ALWAYS_PASSES` (PR Blocker) | 🎲 `USUALLY_PASSES` (Informational)\n',
+    );
+    console.log(
+      '💡 To investigate regressions locally, run: `gemini /fix-behavioral-eval`',
    );
  }
 }