feat(evals): add automated usage metrics reporting for behavioral evals

This commit is contained in:
Michael Bleigh
2026-04-09 16:39:55 -07:00
parent a7a091360e
commit 770db15ca5
10 changed files with 404 additions and 5 deletions
+11
View File
@@ -14,6 +14,7 @@ import {
prepareWorkspace,
type BaseEvalCase,
EVAL_MODEL,
logUsageMetrics,
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
@@ -59,6 +60,7 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const logFile = path.join(logDir, `${sanitizedName}.log`);
let isSuccess = false;
try {
await rig.initialize();
@@ -89,7 +91,16 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
const output = rig.getStaticOutput();
await evalCase.assert(rig, output);
isSuccess = true;
} catch (e) {
isSuccess = false;
throw e;
} finally {
const metrics = rig.getUsageMetrics();
if (metrics.turns > 0) {
logUsageMetrics(evalCase.name, metrics, isSuccess);
}
const output = rig.getStaticOutput();
if (output) {
await fs.promises.writeFile(logFile, output);
+133
View File
@@ -0,0 +1,133 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { randomUUID } from 'node:crypto';
// Absolute path of this run's JSONL usage log; assigned once in setup().
let usageLogPath: string;

/**
 * Vitest global setup: allocates a unique temp JSONL file for usage metrics
 * and publishes its path to test workers via GEMINI_EVAL_USAGE_LOG.
 */
export function setup() {
  usageLogPath = path.join(os.tmpdir(), `gemini-usage-${randomUUID()}.jsonl`);
  process.env['GEMINI_EVAL_USAGE_LOG'] = usageLogPath;
}
/**
 * Vitest global teardown: prints the aggregated usage report from the JSONL
 * log written during the run, then deletes the temp log file.
 */
export function teardown() {
  // Nothing to report when setup never ran or no metrics were written.
  if (!usageLogPath || !fs.existsSync(usageLogPath)) {
    return;
  }
  try {
    generateAndPrintReport(usageLogPath);
  } catch (e) {
    console.error('Failed to generate usage report', e);
  } finally {
    try {
      fs.unlinkSync(usageLogPath);
    } catch {
      // Ignore cleanup errors
    }
  }
}
/**
 * Formats a number with human-readable suffixes (K, M).
 */
function formatNumber(num: number): string {
  // Render one decimal place, dropping a trailing ".0".
  const trimmed = (value: number): string =>
    value.toFixed(1).replace(/\.0$/, '');
  if (num >= 1000000) {
    return `${trimmed(num / 1000000)}M`;
  }
  if (num >= 1000) {
    return `${trimmed(num / 1000)}K`;
  }
  return num.toString();
}

/**
 * Reads the JSONL usage log, aggregates token usage and turn counts per
 * test/model combination, and prints a per-test report plus suite-wide
 * totals to stdout. Malformed lines are skipped silently.
 */
function generateAndPrintReport(logPath: string) {
  const rawLines = fs
    .readFileSync(logPath, 'utf-8')
    .trim()
    .split('\n')
    .filter(Boolean);
  if (rawLines.length === 0) return;

  // Drop lines that are not valid JSON (e.g. partially written records).
  const entries = rawLines.flatMap((line) => {
    try {
      return [JSON.parse(line)];
    } catch {
      return [];
    }
  });

  interface RowStats {
    turns: number;
    input: number;
    output: number;
    cached: number;
    total: number;
    passed: boolean;
  }
  const numericFields = [
    'turns',
    'input',
    'output',
    'cached',
    'total',
  ] as const;
  const stats: Record<string, RowStats> = {};
  const totals = { turns: 0, input: 0, output: 0, cached: 0, total: 0 };

  for (const entry of entries) {
    const key = `${entry.testName} (${entry.model})`;
    const row = (stats[key] ??= {
      turns: 0,
      input: 0,
      output: 0,
      cached: 0,
      total: 0,
      passed: true,
    });
    for (const field of numericFields) {
      const value = entry[field] || 0;
      row[field] += value;
      totals[field] += value;
    }
    // If any attempt failed for this test/model combo in this run, mark as failed
    if (entry.passed === false) {
      row.passed = false;
    }
  }

  console.log('\n📊 Behavioral Eval Usage Report');
  console.log('===============================');
  for (const key of Object.keys(stats).sort()) {
    const s = stats[key];
    console.log(`${s.passed ? '✅' : '❌'} ${key}`);
    console.log(
      `  > turns: ${s.turns}, input: ${formatNumber(s.input)}, output: ${formatNumber(s.output)}, cached: ${formatNumber(s.cached)}, total: ${formatNumber(s.total)}`,
    );
  }
  console.log('\n📈 Suite Totals');
  console.log('--------------');
  console.log(`Total Turns: ${totals.turns}`);
  console.log(`Input Tokens: ${formatNumber(totals.input)}`);
  console.log(`Output Tokens: ${formatNumber(totals.output)}`);
  console.log(`Cached Tokens: ${formatNumber(totals.cached)}`);
  console.log(`Total Tokens: ${formatNumber(totals.total)}`);
  console.log('');
}
+43
View File
@@ -185,7 +185,15 @@ export async function internalEvalTest(evalCase: EvalCase) {
await evalCase.assert(rig, result);
isSuccess = true;
} catch (e) {
isSuccess = false;
throw e;
} finally {
const metrics = rig.getUsageMetrics();
if (metrics.turns > 0) {
logUsageMetrics(evalCase.name, metrics, isSuccess);
}
if (isSuccess) {
await fs.promises.unlink(activityLogFile).catch((err) => {
if (err.code !== 'ENOENT') throw err;
@@ -260,6 +268,41 @@ function logReliabilityEvent(
}
}
/**
 * Log usage metrics for individual eval runs.
 *
 * Appends a single JSON line per run to the usage log. The log path comes
 * from GEMINI_EVAL_USAGE_LOG when set, otherwise it defaults to
 * evals/logs/usage-metrics.jsonl under the current working directory.
 * Logging failures are reported to stderr but never fail the eval itself.
 */
export function logUsageMetrics(
  testName: string,
  metrics: {
    turns: number;
    input: number;
    output: number;
    cached: number;
    total: number;
  },
  passed: boolean,
) {
  const record = {
    timestamp: new Date().toISOString(),
    testName,
    model: EVAL_MODEL,
    passed,
    ...metrics,
  };
  try {
    const logPath =
      process.env['GEMINI_EVAL_USAGE_LOG'] ||
      path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');
    fs.mkdirSync(path.dirname(logPath), { recursive: true });
    fs.appendFileSync(logPath, `${JSON.stringify(record)}\n`);
  } catch (logError) {
    console.error('Failed to write usage log:', logError);
  }
}
/**
* Helper to setup test files and git repository.
*
+1
View File
@@ -16,6 +16,7 @@ export default defineConfig({
},
test: {
testTimeout: 300000, // 5 minutes
globalSetup: [path.resolve(__dirname, 'global-setup.ts')],
reporters: ['default', 'json'],
outputFile: {
json: 'evals/logs/report.json',
+12
View File
@@ -36,6 +36,10 @@ import {
CoreToolCallStatus,
IntegrityDataStatus,
} from '@google/gemini-cli-core';
import {
getUsageMetrics,
type UsageMetrics,
} from '@google/gemini-cli-test-utils';
import {
type MockShellCommand,
MockShellExecutionService,
@@ -477,6 +481,14 @@ export class AppRig {
return this.testDir;
}
/**
 * Scans the session recordings and extracts aggregate token usage and turn counts.
 * This provides an empirical measure of the agent's efficiency and cost.
 *
 * Delegates to the shared `getUsageMetrics` helper from
 * `@google/gemini-cli-test-utils`, pointed at this rig's test directory;
 * the helper scans the `.gemini/tmp` session files beneath it.
 */
getUsageMetrics(): UsageMetrics {
// Thin wrapper: all scanning/aggregation lives in the shared helper.
return getUsageMetrics(this.testDir);
}
/** Returns the currently pending confirmation entries as an array. */
getPendingConfirmations() {
  const pending = this.pendingConfirmations.values();
  return [...pending];
}
+1
View File
@@ -13,3 +13,4 @@ export * from './mock-utils.js';
export * from './test-mcp-server.js';
export * from './test-rig.js';
export * from './env-setup.js';
export * from './usage-metrics-utils.js';
+18
View File
@@ -17,6 +17,7 @@ import * as pty from '@lydell/node-pty';
import stripAnsi from 'strip-ansi';
import * as os from 'node:os';
import type { TestMcpConfig } from './test-mcp-server.js';
import { getUsageMetrics, type UsageMetrics } from './usage-metrics-utils.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const BUNDLE_PATH = join(__dirname, '..', '..', '..', 'bundle/gemini.js');
@@ -1516,6 +1517,23 @@ export class TestRig {
return run;
}
/**
 * Scans the session recordings and extracts aggregate token usage and turn counts.
 * This provides an empirical measure of the agent's efficiency and cost.
 */
getUsageMetrics(): UsageMetrics {
  if (this.homeDir) {
    return getUsageMetrics(this.homeDir);
  }
  // No home directory means there are no recordings to scan; report zeros.
  return {
    turns: 0,
    input: 0,
    output: 0,
    cached: 0,
    total: 0,
  };
}
readHookLogs() {
const parsedLogs = this._readAndParseTelemetryLog();
const logs: {
@@ -0,0 +1,88 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import fs from 'node:fs';
import { join } from 'node:path';
/** Aggregate token usage and turn counts extracted from session recordings. */
export interface UsageMetrics {
  /** Count of model ("gemini") messages found across all session files. */
  turns: number;
  /** Sum of per-message input token counts. */
  input: number;
  /** Sum of per-message output token counts. */
  output: number;
  /** Sum of per-message cached token counts. */
  cached: number;
  /** Sum of per-message total token counts. */
  total: number;
}

/**
 * Scans the session recordings and extracts aggregate token usage and turn counts.
 * This provides an empirical measure of the agent's efficiency and cost.
 *
 * @param tempDir The directory containing the .gemini/tmp folder with recordings.
 */
export function getUsageMetrics(tempDir: string): UsageMetrics {
  const metrics: UsageMetrics = {
    turns: 0,
    input: 0,
    output: 0,
    cached: 0,
    total: 0,
  };
  const root = join(tempDir, '.gemini', 'tmp');
  if (!fs.existsSync(root)) {
    return metrics;
  }
  // Iterative walk of the tmp tree: every directory named 'chats' is handed
  // off for session aggregation; all other directories are descended into.
  const pending = [root];
  while (pending.length > 0) {
    const dir = pending.pop() as string;
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      if (!entry.isDirectory()) {
        continue;
      }
      const childPath = join(dir, entry.name);
      if (entry.name === 'chats') {
        processChatsDir(childPath, metrics);
      } else {
        pending.push(childPath);
      }
    }
  }
  return metrics;
}

/**
 * Accumulates usage from every `session-*.json` file in a chats directory,
 * recursing into subdirectories (subagent sessions). Unreadable or corrupt
 * files are skipped silently.
 */
function processChatsDir(chatsDir: string, metrics: UsageMetrics) {
  for (const entry of fs.readdirSync(chatsDir, { withFileTypes: true })) {
    const entryPath = join(chatsDir, entry.name);
    if (entry.isDirectory()) {
      // Handle subagent sessions stored in subdirectories
      processChatsDir(entryPath, metrics);
      continue;
    }
    const isSessionFile =
      entry.name.startsWith('session-') && entry.name.endsWith('.json');
    if (!isSessionFile) {
      continue;
    }
    try {
      const data = JSON.parse(fs.readFileSync(entryPath, 'utf8'));
      if (Array.isArray(data.messages)) {
        for (const msg of data.messages) {
          if (msg.type !== 'gemini') {
            continue;
          }
          metrics.turns++;
          const tokens = msg.tokens;
          if (tokens) {
            metrics.input += tokens.input || 0;
            metrics.output += tokens.output || 0;
            metrics.cached += tokens.cached || 0;
            metrics.total += tokens.total || 0;
          }
        }
      }
    } catch {
      // Ignore parse errors for partially written or corrupted files
    }
  }
}
+67 -4
View File
@@ -106,14 +106,77 @@ function main() {
markdown += '#### 3. Manual Fix\n';
markdown +=
'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n';
markdown += '</details>\n';
markdown += '</details>\n\n';
markdown += getUsageSummaryMarkdown();
process.stdout.write(markdown);
} else if (passes.length > 0) {
// Success State
process.stdout.write(
`✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n`,
);
let markdown = `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n\n`;
markdown += getUsageSummaryMarkdown();
process.stdout.write(markdown);
}
}
/**
 * Generates a Markdown summary of usage metrics if available.
 *
 * Reads the JSONL usage log (GEMINI_EVAL_USAGE_LOG when set, otherwise the
 * default evals/logs/usage-metrics.jsonl) and renders suite-wide totals as a
 * Markdown table. Returns '' when no log exists or it cannot be read.
 */
function getUsageSummaryMarkdown() {
  const usageLogPath =
    process.env['GEMINI_EVAL_USAGE_LOG'] ||
    path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');
  // The PR workflow may gather metrics in multiple passes, and the temp file
  // can be wiped between runs — so a missing log is not an error, just skip.
  if (!fs.existsSync(usageLogPath)) {
    return '';
  }
  try {
    const rawLines = fs
      .readFileSync(usageLogPath, 'utf-8')
      .trim()
      .split('\n')
      .filter(Boolean);
    if (rawLines.length === 0) return '';
    // Drop lines that are not valid JSON (partially written records).
    const entries = rawLines.flatMap((line) => {
      try {
        return [JSON.parse(line)];
      } catch {
        return [];
      }
    });
    const totals = { turns: 0, input: 0, output: 0, cached: 0, total: 0 };
    for (const entry of entries) {
      totals.turns += entry.turns || 0;
      totals.input += entry.input || 0;
      totals.output += entry.output || 0;
      totals.cached += entry.cached || 0;
      totals.total += entry.total || 0;
    }
    return [
      '#### 📊 Usage Summary',
      '| Metric | Total |',
      '| :--- | :--- |',
      `| **Turns** | ${totals.turns} |`,
      `| **Input Tokens** | ${totals.input.toLocaleString()} |`,
      `| **Output Tokens** | ${totals.output.toLocaleString()} |`,
      `| **Cached Tokens** | ${totals.cached.toLocaleString()} |`,
      `| **Total Tokens** | ${totals.total.toLocaleString()} |`,
      '',
    ].join('\n');
  } catch {
    return '';
  }
}
+30 -1
View File
@@ -15,6 +15,9 @@
import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { randomUUID } from 'node:crypto';
/**
* Main execution logic.
@@ -26,6 +29,14 @@ async function main() {
let combinedReport = '';
let hasRegression = false;
const usageLogPath = path.join(
os.tmpdir(),
`gemini-usage-regression-${randomUUID()}.jsonl`,
);
if (fs.existsSync(usageLogPath)) {
fs.unlinkSync(usageLogPath);
}
console.log(
`🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
);
@@ -50,16 +61,30 @@ async function main() {
}
// 2. Run Frugal Regression Check
console.log(`🧪 Running regression check for ${model}...`);
console.log(`\n🚀 Executing regression tests for ${model}...`);
const tmpUsageLog = path.join(
os.tmpdir(),
`gemini-usage-tmp-${model}-${randomUUID()}.jsonl`,
);
const env = { ...process.env, GEMINI_EVAL_USAGE_LOG: tmpUsageLog };
execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, {
stdio: 'inherit',
env,
});
if (fs.existsSync(tmpUsageLog)) {
fs.appendFileSync(usageLogPath, fs.readFileSync(tmpUsageLog));
fs.unlinkSync(tmpUsageLog);
}
// 3. Generate Report
console.log(`📊 Generating report for ${model}...`);
const reportEnv = { ...process.env, GEMINI_EVAL_USAGE_LOG: usageLogPath };
const report = execSync(`node scripts/compare_evals.js "${model}"`, {
encoding: 'utf-8',
stdio: ['inherit', 'pipe', 'inherit'],
env: reportEnv,
}).trim();
if (report) {
@@ -98,6 +123,10 @@ async function main() {
console.log('\n✅ All evaluations passed successfully (or were cleared).');
}
if (fs.existsSync(usageLogPath)) {
fs.unlinkSync(usageLogPath);
}
process.exit(0);
}