feat(evals): add automated usage metrics reporting for behavioral evals

2026-05-29 05:02:35 -07:00 · 2026-04-09 16:39:55 -07:00
parent a7a091360e
commit 770db15ca5
10 changed files with 404 additions and 5 deletions
@@ -106,14 +106,77 @@ function main() {
    markdown += '#### 3. Manual Fix\n';
    markdown +=
      'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n';
-    markdown += '</details>\n';
+    markdown += '</details>\n\n';
+
+    markdown += getUsageSummaryMarkdown();

    process.stdout.write(markdown);
  } else if (passes.length > 0) {
    // Success State
-    process.stdout.write(
-      `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n`,
-    );
+    let markdown = `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n\n`;
+    markdown += getUsageSummaryMarkdown();
+    process.stdout.write(markdown);
+  }
+}
+
+/**
+ * Builds a Markdown table of summed usage metrics from the JSONL log, or '' if the log is missing, blank, or unreadable.
+ */
+function getUsageSummaryMarkdown() {
+  const usageLogPath =
+    process.env['GEMINI_EVAL_USAGE_LOG'] ||
+    path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');
+  // The usage log is optional. In the PR workflow, metrics may be gathered in
+  // several passes and the evaluation loop may wipe its temp file, so we prefer
+  // GEMINI_EVAL_USAGE_LOG and otherwise fall back to the default path above
+  // (presumably the nightly report layout). Nothing guarantees that
+  // `run_eval_regression` copies metrics there, so a missing file is just skipped.
+  if (!fs.existsSync(usageLogPath)) {
+    return '';
+  }
+
+  try {
+    const lines = fs
+      .readFileSync(usageLogPath, 'utf-8')
+      .trim()
+      .split('\n')
+      .filter(Boolean);
+    if (lines.length === 0) return '';
+    const entries = lines
+      .map((line) => {
+        try {
+          return JSON.parse(line);
+        } catch {
+          return null;
+        }
+      })
+      .filter(Boolean);
+
+    let totalTurns = 0;
+    let totalInput = 0;
+    let totalOutput = 0;
+    let totalCached = 0;
+    let grandTotal = 0;
+
+    for (const entry of entries) {
+      totalTurns += entry.turns || 0;
+      totalInput += entry.input || 0;
+      totalOutput += entry.output || 0;
+      totalCached += entry.cached || 0;
+      grandTotal += entry.total || 0;
+    }
+
+    let markdown = '#### 📊 Usage Summary\n';
+    markdown += `| Metric | Total |\n`;
+    markdown += `| :--- | :--- |\n`;
+    markdown += `| **Turns** | ${totalTurns} |\n`;
+    markdown += `| **Input Tokens** | ${totalInput.toLocaleString()} |\n`;
+    markdown += `| **Output Tokens** | ${totalOutput.toLocaleString()} |\n`;
+    markdown += `| **Cached Tokens** | ${totalCached.toLocaleString()} |\n`;
+    markdown += `| **Total Tokens** | ${grandTotal.toLocaleString()} |\n`;
+    return markdown;
+  } catch {
+    return '';
   }
 }

@@ -15,6 +15,9 @@

 import { execSync } from 'node:child_process';
 import fs from 'node:fs';
+import path from 'node:path';
+import os from 'node:os';
+import { randomUUID } from 'node:crypto';

 /**
 * Main execution logic.
@@ -26,6 +29,14 @@ async function main() {
  let combinedReport = '';
  let hasRegression = false;

+  const usageLogPath = path.join(
+    os.tmpdir(),
+    `gemini-usage-regression-${randomUUID()}.jsonl`,
+  );
+  if (fs.existsSync(usageLogPath)) {
+    fs.unlinkSync(usageLogPath);
+  }
+
  console.log(
    `🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
  );
@@ -50,16 +61,30 @@ async function main() {
      }

      // 2. Run Frugal Regression Check
-      console.log(`🧪 Running regression check for ${model}...`);
+      console.log(`\n🚀 Executing regression tests for ${model}...`);
+      const tmpUsageLog = path.join(
+        os.tmpdir(),
+        `gemini-usage-tmp-${model}-${randomUUID()}.jsonl`,
+      );
+      const env = { ...process.env, GEMINI_EVAL_USAGE_LOG: tmpUsageLog };
+
      execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, {
        stdio: 'inherit',
+        env,
      });

+      if (fs.existsSync(tmpUsageLog)) {
+        fs.appendFileSync(usageLogPath, fs.readFileSync(tmpUsageLog));
+        fs.unlinkSync(tmpUsageLog);
+      }
+
      // 3. Generate Report
      console.log(`📊 Generating report for ${model}...`);
+      const reportEnv = { ...process.env, GEMINI_EVAL_USAGE_LOG: usageLogPath };
      const report = execSync(`node scripts/compare_evals.js "${model}"`, {
        encoding: 'utf-8',
        stdio: ['inherit', 'pipe', 'inherit'],
+        env: reportEnv,
      }).trim();

      if (report) {
@@ -98,6 +123,10 @@ async function main() {
    console.log('\n✅ All evaluations passed successfully (or were cleared).');
  }

+  if (fs.existsSync(usageLogPath)) {
+    fs.unlinkSync(usageLogPath);
+  }
+
  process.exit(0);
 }