feat(evals): add automated usage metrics reporting for behavioral evals

This commit is contained in:
Michael Bleigh
2026-04-09 16:39:55 -07:00
parent a7a091360e
commit 770db15ca5
10 changed files with 404 additions and 5 deletions
+11
View File
@@ -14,6 +14,7 @@ import {
prepareWorkspace,
type BaseEvalCase,
EVAL_MODEL,
logUsageMetrics,
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
@@ -59,6 +60,7 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const logFile = path.join(logDir, `${sanitizedName}.log`);
let isSuccess = false;
try {
await rig.initialize();
@@ -89,7 +91,16 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
const output = rig.getStaticOutput();
await evalCase.assert(rig, output);
isSuccess = true;
} catch (e) {
isSuccess = false;
throw e;
} finally {
const metrics = rig.getUsageMetrics();
if (metrics.turns > 0) {
logUsageMetrics(evalCase.name, metrics, isSuccess);
}
const output = rig.getStaticOutput();
if (output) {
await fs.promises.writeFile(logFile, output);
+133
View File
@@ -0,0 +1,133 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { randomUUID } from 'node:crypto';
// Absolute path of this run's JSONL usage log; assigned once in setup().
let usageLogPath: string;

/**
 * Vitest global setup: allocates a unique temp JSONL file for usage metrics
 * and publishes its path to test workers via GEMINI_EVAL_USAGE_LOG.
 */
export function setup() {
  usageLogPath = path.join(os.tmpdir(), `gemini-usage-${randomUUID()}.jsonl`);
  process.env['GEMINI_EVAL_USAGE_LOG'] = usageLogPath;
}
/**
 * Vitest global teardown: prints the aggregated usage report from the JSONL
 * log written during the run, then deletes the temp log file.
 */
export function teardown() {
  // Nothing to report when setup never ran or no metrics were written.
  if (!usageLogPath || !fs.existsSync(usageLogPath)) {
    return;
  }
  try {
    generateAndPrintReport(usageLogPath);
  } catch (e) {
    console.error('Failed to generate usage report', e);
  } finally {
    try {
      fs.unlinkSync(usageLogPath);
    } catch {
      // Ignore cleanup errors
    }
  }
}
/**
 * Formats a number with human-readable suffixes (K, M).
 */
function formatNumber(num: number): string {
  // Render one decimal place, dropping a trailing ".0".
  const trimmed = (value: number): string =>
    value.toFixed(1).replace(/\.0$/, '');
  if (num >= 1000000) {
    return `${trimmed(num / 1000000)}M`;
  }
  if (num >= 1000) {
    return `${trimmed(num / 1000)}K`;
  }
  return num.toString();
}

/**
 * Reads the JSONL usage log, aggregates token usage and turn counts per
 * test/model combination, and prints a per-test report plus suite-wide
 * totals to stdout. Malformed lines are skipped silently.
 */
function generateAndPrintReport(logPath: string) {
  const rawLines = fs
    .readFileSync(logPath, 'utf-8')
    .trim()
    .split('\n')
    .filter(Boolean);
  if (rawLines.length === 0) return;

  // Drop lines that are not valid JSON (e.g. partially written records).
  const entries = rawLines.flatMap((line) => {
    try {
      return [JSON.parse(line)];
    } catch {
      return [];
    }
  });

  interface RowStats {
    turns: number;
    input: number;
    output: number;
    cached: number;
    total: number;
    passed: boolean;
  }
  const numericFields = [
    'turns',
    'input',
    'output',
    'cached',
    'total',
  ] as const;
  const stats: Record<string, RowStats> = {};
  const totals = { turns: 0, input: 0, output: 0, cached: 0, total: 0 };

  for (const entry of entries) {
    const key = `${entry.testName} (${entry.model})`;
    const row = (stats[key] ??= {
      turns: 0,
      input: 0,
      output: 0,
      cached: 0,
      total: 0,
      passed: true,
    });
    for (const field of numericFields) {
      const value = entry[field] || 0;
      row[field] += value;
      totals[field] += value;
    }
    // If any attempt failed for this test/model combo in this run, mark as failed
    if (entry.passed === false) {
      row.passed = false;
    }
  }

  console.log('\n📊 Behavioral Eval Usage Report');
  console.log('===============================');
  for (const key of Object.keys(stats).sort()) {
    const s = stats[key];
    console.log(`${s.passed ? '✅' : '❌'} ${key}`);
    console.log(
      `  > turns: ${s.turns}, input: ${formatNumber(s.input)}, output: ${formatNumber(s.output)}, cached: ${formatNumber(s.cached)}, total: ${formatNumber(s.total)}`,
    );
  }
  console.log('\n📈 Suite Totals');
  console.log('--------------');
  console.log(`Total Turns: ${totals.turns}`);
  console.log(`Input Tokens: ${formatNumber(totals.input)}`);
  console.log(`Output Tokens: ${formatNumber(totals.output)}`);
  console.log(`Cached Tokens: ${formatNumber(totals.cached)}`);
  console.log(`Total Tokens: ${formatNumber(totals.total)}`);
  console.log('');
}
+43
View File
@@ -185,7 +185,15 @@ export async function internalEvalTest(evalCase: EvalCase) {
await evalCase.assert(rig, result);
isSuccess = true;
} catch (e) {
isSuccess = false;
throw e;
} finally {
const metrics = rig.getUsageMetrics();
if (metrics.turns > 0) {
logUsageMetrics(evalCase.name, metrics, isSuccess);
}
if (isSuccess) {
await fs.promises.unlink(activityLogFile).catch((err) => {
if (err.code !== 'ENOENT') throw err;
@@ -260,6 +268,41 @@ function logReliabilityEvent(
}
}
/**
 * Log usage metrics for individual eval runs.
 *
 * Appends a single JSON line per run to the usage log. The log path comes
 * from GEMINI_EVAL_USAGE_LOG when set, otherwise it defaults to
 * evals/logs/usage-metrics.jsonl under the current working directory.
 * Logging failures are reported to stderr but never fail the eval itself.
 */
export function logUsageMetrics(
  testName: string,
  metrics: {
    turns: number;
    input: number;
    output: number;
    cached: number;
    total: number;
  },
  passed: boolean,
) {
  const record = {
    timestamp: new Date().toISOString(),
    testName,
    model: EVAL_MODEL,
    passed,
    ...metrics,
  };
  try {
    const logPath =
      process.env['GEMINI_EVAL_USAGE_LOG'] ||
      path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');
    fs.mkdirSync(path.dirname(logPath), { recursive: true });
    fs.appendFileSync(logPath, `${JSON.stringify(record)}\n`);
  } catch (logError) {
    console.error('Failed to write usage log:', logError);
  }
}
/**
* Helper to setup test files and git repository.
*
+1
View File
@@ -16,6 +16,7 @@ export default defineConfig({
},
test: {
testTimeout: 300000, // 5 minutes
globalSetup: [path.resolve(__dirname, 'global-setup.ts')],
reporters: ['default', 'json'],
outputFile: {
json: 'evals/logs/report.json',
+12
View File
@@ -36,6 +36,10 @@ import {
CoreToolCallStatus,
IntegrityDataStatus,
} from '@google/gemini-cli-core';
import {
getUsageMetrics,
type UsageMetrics,
} from '@google/gemini-cli-test-utils';
import {
type MockShellCommand,
MockShellExecutionService,
@@ -477,6 +481,14 @@ export class AppRig {
return this.testDir;
}
/**
 * Scans the session recordings and extracts aggregate token usage and turn counts.
 * This provides an empirical measure of the agent's efficiency and cost.
 *
 * Delegates to the shared `getUsageMetrics` helper from
 * `@google/gemini-cli-test-utils`, pointed at this rig's test directory;
 * the helper scans the `.gemini/tmp` session files beneath it.
 */
getUsageMetrics(): UsageMetrics {
// Thin wrapper: all scanning/aggregation lives in the shared helper.
return getUsageMetrics(this.testDir);
}
/** Returns the currently pending confirmation entries as an array. */
getPendingConfirmations() {
  const pending = this.pendingConfirmations.values();
  return [...pending];
}
+1
View File
@@ -13,3 +13,4 @@ export * from './mock-utils.js';
export * from './test-mcp-server.js';
export * from './test-rig.js';
export * from './env-setup.js';
export * from './usage-metrics-utils.js';
+18
View File
@@ -17,6 +17,7 @@ import * as pty from '@lydell/node-pty';
import stripAnsi from 'strip-ansi';
import * as os from 'node:os';
import type { TestMcpConfig } from './test-mcp-server.js';
import { getUsageMetrics, type UsageMetrics } from './usage-metrics-utils.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
const BUNDLE_PATH = join(__dirname, '..', '..', '..', 'bundle/gemini.js');
@@ -1516,6 +1517,23 @@ export class TestRig {
return run;
}
/**
 * Scans the session recordings and extracts aggregate token usage and turn counts.
 * This provides an empirical measure of the agent's efficiency and cost.
 */
getUsageMetrics(): UsageMetrics {
  if (this.homeDir) {
    return getUsageMetrics(this.homeDir);
  }
  // No home directory means there are no recordings to scan; report zeros.
  return {
    turns: 0,
    input: 0,
    output: 0,
    cached: 0,
    total: 0,
  };
}
readHookLogs() {
const parsedLogs = this._readAndParseTelemetryLog();
const logs: {
@@ -0,0 +1,88 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import fs from 'node:fs';
import { join } from 'node:path';
/** Aggregate token usage and turn counts extracted from session recordings. */
export interface UsageMetrics {
  /** Count of model ("gemini") messages found across all session files. */
  turns: number;
  /** Sum of per-message input token counts. */
  input: number;
  /** Sum of per-message output token counts. */
  output: number;
  /** Sum of per-message cached token counts. */
  cached: number;
  /** Sum of per-message total token counts. */
  total: number;
}

/**
 * Scans the session recordings and extracts aggregate token usage and turn counts.
 * This provides an empirical measure of the agent's efficiency and cost.
 *
 * @param tempDir The directory containing the .gemini/tmp folder with recordings.
 */
export function getUsageMetrics(tempDir: string): UsageMetrics {
  const metrics: UsageMetrics = {
    turns: 0,
    input: 0,
    output: 0,
    cached: 0,
    total: 0,
  };
  const root = join(tempDir, '.gemini', 'tmp');
  if (!fs.existsSync(root)) {
    return metrics;
  }
  // Iterative walk of the tmp tree: every directory named 'chats' is handed
  // off for session aggregation; all other directories are descended into.
  const pending = [root];
  while (pending.length > 0) {
    const dir = pending.pop() as string;
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      if (!entry.isDirectory()) {
        continue;
      }
      const childPath = join(dir, entry.name);
      if (entry.name === 'chats') {
        processChatsDir(childPath, metrics);
      } else {
        pending.push(childPath);
      }
    }
  }
  return metrics;
}

/**
 * Accumulates usage from every `session-*.json` file in a chats directory,
 * recursing into subdirectories (subagent sessions). Unreadable or corrupt
 * files are skipped silently.
 */
function processChatsDir(chatsDir: string, metrics: UsageMetrics) {
  for (const entry of fs.readdirSync(chatsDir, { withFileTypes: true })) {
    const entryPath = join(chatsDir, entry.name);
    if (entry.isDirectory()) {
      // Handle subagent sessions stored in subdirectories
      processChatsDir(entryPath, metrics);
      continue;
    }
    const isSessionFile =
      entry.name.startsWith('session-') && entry.name.endsWith('.json');
    if (!isSessionFile) {
      continue;
    }
    try {
      const data = JSON.parse(fs.readFileSync(entryPath, 'utf8'));
      if (Array.isArray(data.messages)) {
        for (const msg of data.messages) {
          if (msg.type !== 'gemini') {
            continue;
          }
          metrics.turns++;
          const tokens = msg.tokens;
          if (tokens) {
            metrics.input += tokens.input || 0;
            metrics.output += tokens.output || 0;
            metrics.cached += tokens.cached || 0;
            metrics.total += tokens.total || 0;
          }
        }
      }
    } catch {
      // Ignore parse errors for partially written or corrupted files
    }
  }
}
+67 -4
View File
@@ -106,14 +106,77 @@ function main() {
markdown += '#### 3. Manual Fix\n';
markdown +=
'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n';
markdown += '</details>\n';
markdown += '</details>\n\n';
markdown += getUsageSummaryMarkdown();
process.stdout.write(markdown);
} else if (passes.length > 0) {
// Success State
process.stdout.write(
`✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n`,
);
let markdown = `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n\n`;
markdown += getUsageSummaryMarkdown();
process.stdout.write(markdown);
}
}
/**
 * Generates a Markdown summary of usage metrics if available.
 *
 * Reads the JSONL usage log (GEMINI_EVAL_USAGE_LOG when set, otherwise the
 * default evals/logs/usage-metrics.jsonl) and renders suite-wide totals as a
 * Markdown table. Returns '' when no log exists or it cannot be read.
 */
function getUsageSummaryMarkdown() {
  const usageLogPath =
    process.env['GEMINI_EVAL_USAGE_LOG'] ||
    path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');
  // The PR workflow may gather metrics in multiple passes, and the temp file
  // can be wiped between runs — so a missing log is not an error, just skip.
  if (!fs.existsSync(usageLogPath)) {
    return '';
  }
  try {
    const rawLines = fs
      .readFileSync(usageLogPath, 'utf-8')
      .trim()
      .split('\n')
      .filter(Boolean);
    if (rawLines.length === 0) return '';
    // Drop lines that are not valid JSON (partially written records).
    const entries = rawLines.flatMap((line) => {
      try {
        return [JSON.parse(line)];
      } catch {
        return [];
      }
    });
    const totals = { turns: 0, input: 0, output: 0, cached: 0, total: 0 };
    for (const entry of entries) {
      totals.turns += entry.turns || 0;
      totals.input += entry.input || 0;
      totals.output += entry.output || 0;
      totals.cached += entry.cached || 0;
      totals.total += entry.total || 0;
    }
    return [
      '#### 📊 Usage Summary',
      '| Metric | Total |',
      '| :--- | :--- |',
      `| **Turns** | ${totals.turns} |`,
      `| **Input Tokens** | ${totals.input.toLocaleString()} |`,
      `| **Output Tokens** | ${totals.output.toLocaleString()} |`,
      `| **Cached Tokens** | ${totals.cached.toLocaleString()} |`,
      `| **Total Tokens** | ${totals.total.toLocaleString()} |`,
      '',
    ].join('\n');
  } catch {
    return '';
  }
}
+30 -1
View File
@@ -15,6 +15,9 @@
import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { randomUUID } from 'node:crypto';
/**
* Main execution logic.
@@ -26,6 +29,14 @@ async function main() {
let combinedReport = '';
let hasRegression = false;
const usageLogPath = path.join(
os.tmpdir(),
`gemini-usage-regression-${randomUUID()}.jsonl`,
);
if (fs.existsSync(usageLogPath)) {
fs.unlinkSync(usageLogPath);
}
console.log(
`🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
);
@@ -50,16 +61,30 @@ async function main() {
}
// 2. Run Frugal Regression Check
console.log(`🧪 Running regression check for ${model}...`);
console.log(`\n🚀 Executing regression tests for ${model}...`);
const tmpUsageLog = path.join(
os.tmpdir(),
`gemini-usage-tmp-${model}-${randomUUID()}.jsonl`,
);
const env = { ...process.env, GEMINI_EVAL_USAGE_LOG: tmpUsageLog };
execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, {
stdio: 'inherit',
env,
});
if (fs.existsSync(tmpUsageLog)) {
fs.appendFileSync(usageLogPath, fs.readFileSync(tmpUsageLog));
fs.unlinkSync(tmpUsageLog);
}
// 3. Generate Report
console.log(`📊 Generating report for ${model}...`);
const reportEnv = { ...process.env, GEMINI_EVAL_USAGE_LOG: usageLogPath };
const report = execSync(`node scripts/compare_evals.js "${model}"`, {
encoding: 'utf-8',
stdio: ['inherit', 'pipe', 'inherit'],
env: reportEnv,
}).trim();
if (report) {
@@ -98,6 +123,10 @@ async function main() {
console.log('\n✅ All evaluations passed successfully (or were cleared).');
}
if (fs.existsSync(usageLogPath)) {
fs.unlinkSync(usageLogPath);
}
process.exit(0);
}