mirror of https://github.com/google-gemini/gemini-cli.git

feat(evals): add automated usage metrics reporting for behavioral evals
@@ -14,6 +14,7 @@ import {
  prepareWorkspace,
  type BaseEvalCase,
  EVAL_MODEL,
  logUsageMetrics,
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
@@ -59,6 +60,7 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {

    const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
    const logFile = path.join(logDir, `${sanitizedName}.log`);
    let isSuccess = false;

    try {
      await rig.initialize();
@@ -89,7 +91,16 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
      // Run assertion. Interaction-heavy tests can do their own waiting/steering here.
      const output = rig.getStaticOutput();
      await evalCase.assert(rig, output);
      isSuccess = true;
    } catch (e) {
      isSuccess = false;
      throw e;
    } finally {
      const metrics = rig.getUsageMetrics();
      if (metrics.turns > 0) {
        logUsageMetrics(evalCase.name, metrics, isSuccess);
      }

      const output = rig.getStaticOutput();
      if (output) {
        await fs.promises.writeFile(logFile, output);

@@ -0,0 +1,133 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { randomUUID } from 'node:crypto';

let usageLogPath: string;

export function setup() {
  const tmpDir = os.tmpdir();
  usageLogPath = path.join(tmpDir, `gemini-usage-${randomUUID()}.jsonl`);
  process.env['GEMINI_EVAL_USAGE_LOG'] = usageLogPath;
}

export function teardown() {
  if (usageLogPath && fs.existsSync(usageLogPath)) {
    try {
      generateAndPrintReport(usageLogPath);
    } catch (e) {
      console.error('Failed to generate usage report', e);
    } finally {
      try {
        fs.unlinkSync(usageLogPath);
      } catch {
        // Ignore cleanup errors
      }
    }
  }
}

/**
 * Formats a number with human-readable suffixes (K, M).
 */
function formatNumber(num: number): string {
  if (num >= 1000000) {
    return (num / 1000000).toFixed(1).replace(/\.0$/, '') + 'M';
  }
  if (num >= 1000) {
    return (num / 1000).toFixed(1).replace(/\.0$/, '') + 'K';
  }
  return num.toString();
}

function generateAndPrintReport(logPath: string) {
  const lines = fs
    .readFileSync(logPath, 'utf-8')
    .trim()
    .split('\n')
    .filter(Boolean);
  if (lines.length === 0) return;
  const entries = lines
    .map((line) => {
      try {
        return JSON.parse(line);
      } catch {
        return null;
      }
    })
    .filter(Boolean);

  const stats: Record<
    string,
    {
      turns: number;
      input: number;
      output: number;
      cached: number;
      total: number;
      passed: boolean;
    }
  > = {};
  let totalTurns = 0;
  let totalInput = 0;
  let totalOutput = 0;
  let totalCached = 0;
  let grandTotal = 0;

  for (const entry of entries) {
    const key = `${entry.testName} (${entry.model})`;
    if (!stats[key]) {
      stats[key] = {
        turns: 0,
        input: 0,
        output: 0,
        cached: 0,
        total: 0,
        passed: true,
      };
    }
    stats[key].turns += entry.turns || 0;
    stats[key].input += entry.input || 0;
    stats[key].output += entry.output || 0;
    stats[key].cached += entry.cached || 0;
    stats[key].total += entry.total || 0;
    // If any attempt failed for this test/model combo in this run, mark as failed
    if (entry.passed === false) {
      stats[key].passed = false;
    }

    totalTurns += entry.turns || 0;
    totalInput += entry.input || 0;
    totalOutput += entry.output || 0;
    totalCached += entry.cached || 0;
    grandTotal += entry.total || 0;
  }

  console.log('\n📊 Behavioral Eval Usage Report');
  console.log('===============================');

  const sortedKeys = Object.keys(stats).sort();
  for (const key of sortedKeys) {
    const s = stats[key];
    const status = s.passed ? '✅' : '❌';
    console.log(`${status} ${key}`);
    console.log(
      `  > turns: ${s.turns}, input: ${formatNumber(s.input)}, output: ${formatNumber(s.output)}, cached: ${formatNumber(s.cached)}, total: ${formatNumber(s.total)}`,
    );
  }

  console.log('\n📈 Suite Totals');
  console.log('--------------');
  console.log(`Total Turns: ${totalTurns}`);
  console.log(`Input Tokens: ${formatNumber(totalInput)}`);
  console.log(`Output Tokens: ${formatNumber(totalOutput)}`);
  console.log(`Cached Tokens: ${formatNumber(totalCached)}`);
  console.log(`Total Tokens: ${formatNumber(grandTotal)}`);
  console.log('');
}
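
Note on the log format: each line of the JSONL file is one flat record, written by `logUsageMetrics` (see the test-helper hunk below) and read back here by `generateAndPrintReport`, which groups entries by `testName`/`model`. A minimal sketch of the entry shape, derived from the fields the code above reads; the sample values are hypothetical:

```ts
// Shape of one usage-log entry (field names taken from the report code
// above and from logUsageMetrics below; sample values are hypothetical).
interface UsageLogEntry {
  timestamp: string; // ISO 8601, from new Date().toISOString()
  testName: string;
  model: string; // EVAL_MODEL at write time
  passed: boolean;
  turns: number;
  input: number; // input tokens
  output: number; // output tokens
  cached: number; // cached tokens
  total: number; // total tokens
}

// One line of the .jsonl file might look like (hypothetical values):
// {"timestamp":"2026-01-01T00:00:00.000Z","testName":"my-eval","model":"<model>","passed":true,"turns":3,"input":12000,"output":800,"cached":4000,"total":16800}
```
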
@@ -185,7 +185,15 @@ export async function internalEvalTest(evalCase: EvalCase) {

    await evalCase.assert(rig, result);
    isSuccess = true;
  } catch (e) {
    isSuccess = false;
    throw e;
  } finally {
    const metrics = rig.getUsageMetrics();
    if (metrics.turns > 0) {
      logUsageMetrics(evalCase.name, metrics, isSuccess);
    }

    if (isSuccess) {
      await fs.promises.unlink(activityLogFile).catch((err) => {
        if (err.code !== 'ENOENT') throw err;
@@ -260,6 +268,41 @@ function logReliabilityEvent(
  }
}

/**
 * Log usage metrics for individual eval runs.
 */
export function logUsageMetrics(
  testName: string,
  metrics: {
    turns: number;
    input: number;
    output: number;
    cached: number;
    total: number;
  },
  passed: boolean,
) {
  const usageLog = {
    timestamp: new Date().toISOString(),
    testName,
    model: EVAL_MODEL,
    passed,
    ...metrics,
  };

  try {
    const logPath =
      process.env['GEMINI_EVAL_USAGE_LOG'] ||
      path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');

    const logDir = path.dirname(logPath);
    fs.mkdirSync(logDir, { recursive: true });
    fs.appendFileSync(logPath, JSON.stringify(usageLog) + '\n');
  } catch (logError) {
    console.error('Failed to write usage log:', logError);
  }
}

/**
 * Helper to setup test files and git repository.
 *
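
As a usage sketch (not part of the diff): with the vitest global setup exporting `GEMINI_EVAL_USAGE_LOG`, a single call appends one JSON line to that per-run temp file; without the variable it falls back to `evals/logs/usage-metrics.jsonl`. All values below are hypothetical:

```ts
// Hypothetical standalone call to the helper above. GEMINI_EVAL_USAGE_LOG
// is the same variable the global setup exports, so this appends one JSON
// line to the per-run temp file (or to evals/logs/usage-metrics.jsonl).
logUsageMetrics(
  'writes-a-file', // testName (hypothetical)
  { turns: 2, input: 5000, output: 300, cached: 1000, total: 6300 },
  true, // passed
);
```
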
@@ -16,6 +16,7 @@ export default defineConfig({
  },
  test: {
    testTimeout: 300000, // 5 minutes
    globalSetup: [path.resolve(__dirname, 'global-setup.ts')],
    reporters: ['default', 'json'],
    outputFile: {
      json: 'evals/logs/report.json',

@@ -36,6 +36,10 @@ import {
  CoreToolCallStatus,
  IntegrityDataStatus,
} from '@google/gemini-cli-core';
import {
  getUsageMetrics,
  type UsageMetrics,
} from '@google/gemini-cli-test-utils';
import {
  type MockShellCommand,
  MockShellExecutionService,
@@ -477,6 +481,14 @@ export class AppRig {
    return this.testDir;
  }

  /**
   * Scans the session recordings and extracts aggregate token usage and turn counts.
   * This provides an empirical measure of the agent's efficiency and cost.
   */
  getUsageMetrics(): UsageMetrics {
    return getUsageMetrics(this.testDir);
  }

  getPendingConfirmations() {
    return Array.from(this.pendingConfirmations.values());
  }

@@ -13,3 +13,4 @@ export * from './mock-utils.js';
export * from './test-mcp-server.js';
export * from './test-rig.js';
export * from './env-setup.js';
export * from './usage-metrics-utils.js';

@@ -17,6 +17,7 @@ import * as pty from '@lydell/node-pty';
import stripAnsi from 'strip-ansi';
import * as os from 'node:os';
import type { TestMcpConfig } from './test-mcp-server.js';
import { getUsageMetrics, type UsageMetrics } from './usage-metrics-utils.js';

const __dirname = dirname(fileURLToPath(import.meta.url));
const BUNDLE_PATH = join(__dirname, '..', '..', '..', 'bundle/gemini.js');
@@ -1516,6 +1517,23 @@ export class TestRig {
    return run;
  }

  /**
   * Scans the session recordings and extracts aggregate token usage and turn counts.
   * This provides an empirical measure of the agent's efficiency and cost.
   */
  getUsageMetrics(): UsageMetrics {
    if (!this.homeDir) {
      return {
        turns: 0,
        input: 0,
        output: 0,
        cached: 0,
        total: 0,
      };
    }
    return getUsageMetrics(this.homeDir);
  }

  readHookLogs() {
    const parsedLogs = this._readAndParseTelemetryLog();
    const logs: {

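
Since both `AppRig` and `TestRig` now expose `getUsageMetrics()`, an eval's `assert` callback could also enforce an efficiency budget directly, rather than only reporting usage afterwards. A hedged sketch, with hypothetical budget numbers:

```ts
// Hypothetical budget check inside an eval's assert callback: fail the
// eval if the agent needed more turns or tokens than the budget allows.
const usage = rig.getUsageMetrics();
if (usage.turns > 5) {
  throw new Error(`Too many turns: ${usage.turns}`);
}
if (usage.total > 200_000) {
  throw new Error(`Token budget exceeded: ${usage.total} tokens`);
}
```
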
@@ -0,0 +1,88 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import fs from 'node:fs';
import { join } from 'node:path';

export interface UsageMetrics {
  turns: number;
  input: number;
  output: number;
  cached: number;
  total: number;
}

/**
 * Scans the session recordings and extracts aggregate token usage and turn counts.
 * This provides an empirical measure of the agent's efficiency and cost.
 *
 * @param tempDir The directory containing the .gemini/tmp folder with recordings.
 */
export function getUsageMetrics(tempDir: string): UsageMetrics {
  const geminiTmpDir = join(tempDir, '.gemini', 'tmp');

  const metrics: UsageMetrics = {
    turns: 0,
    input: 0,
    output: 0,
    cached: 0,
    total: 0,
  };

  if (!fs.existsSync(geminiTmpDir)) {
    return metrics;
  }

  const processDir = (dir: string) => {
    const entries = fs.readdirSync(dir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = join(dir, entry.name);
      if (entry.isDirectory()) {
        if (entry.name === 'chats') {
          processChatsDir(fullPath, metrics);
        } else {
          processDir(fullPath);
        }
      }
    }
  };

  processDir(geminiTmpDir);
  return metrics;
}

function processChatsDir(chatsDir: string, metrics: UsageMetrics) {
  const entries = fs.readdirSync(chatsDir, { withFileTypes: true });
  for (const entry of entries) {
    const fullPath = join(chatsDir, entry.name);
    if (entry.isDirectory()) {
      // Handle subagent sessions stored in subdirectories
      processChatsDir(fullPath, metrics);
    } else if (
      entry.name.endsWith('.json') &&
      entry.name.startsWith('session-')
    ) {
      try {
        const data = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
        if (data.messages && Array.isArray(data.messages)) {
          for (const msg of data.messages) {
            if (msg.type === 'gemini') {
              metrics.turns++;
              if (msg.tokens) {
                metrics.input += msg.tokens.input || 0;
                metrics.output += msg.tokens.output || 0;
                metrics.cached += msg.tokens.cached || 0;
                metrics.total += msg.tokens.total || 0;
              }
            }
          }
        }
      } catch {
        // Ignore parse errors for partially written or corrupted files
      }
    }
  }
}
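
The scanner above depends on only a small slice of the session-recording format: a top-level `messages` array whose entries of `type: 'gemini'` each count as one turn and may carry a `tokens` object. A minimal example of a `session-*.json` body it would aggregate (all values hypothetical):

```ts
// Minimal session recording that getUsageMetrics would count as one
// turn with the given token usage (all values hypothetical).
const exampleSession = {
  messages: [
    { type: 'user', content: 'do the thing' }, // ignored: not 'gemini'
    {
      type: 'gemini',
      tokens: { input: 1200, output: 80, cached: 400, total: 1680 },
    },
  ],
};
```
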
@@ -106,14 +106,77 @@ function main() {
    markdown += '#### 3. Manual Fix\n';
    markdown +=
      'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n';
    markdown += '</details>\n';
    markdown += '</details>\n\n';

    markdown += getUsageSummaryMarkdown();

    process.stdout.write(markdown);
  } else if (passes.length > 0) {
    // Success State
    process.stdout.write(
      `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n`,
    );
    let markdown = `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n\n`;
    markdown += getUsageSummaryMarkdown();
    process.stdout.write(markdown);
  }
}

/**
 * Generates a Markdown summary of usage metrics if available.
 */
function getUsageSummaryMarkdown() {
  const usageLogPath =
    process.env['GEMINI_EVAL_USAGE_LOG'] ||
    path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');
  // In the PR workflow, the metrics might be gathered in multiple passes, or we
  // might need to fall back to the nightly report structure. Since the PR
  // evaluation runs in a loop that might wipe the temp file, we rely on
  // `run_eval_regression` copying the metrics to `evals/logs/usage-metrics.jsonl`
  // (if that is implemented), and otherwise simply skip the summary when the
  // file is missing.
  if (!fs.existsSync(usageLogPath)) {
    return '';
  }

  try {
    const lines = fs
      .readFileSync(usageLogPath, 'utf-8')
      .trim()
      .split('\n')
      .filter(Boolean);
    if (lines.length === 0) return '';
    const entries = lines
      .map((line) => {
        try {
          return JSON.parse(line);
        } catch {
          return null;
        }
      })
      .filter(Boolean);

    let totalTurns = 0;
    let totalInput = 0;
    let totalOutput = 0;
    let totalCached = 0;
    let grandTotal = 0;

    for (const entry of entries) {
      totalTurns += entry.turns || 0;
      totalInput += entry.input || 0;
      totalOutput += entry.output || 0;
      totalCached += entry.cached || 0;
      grandTotal += entry.total || 0;
    }

    let markdown = '#### 📊 Usage Summary\n';
    markdown += `| Metric | Total |\n`;
    markdown += `| :--- | :--- |\n`;
    markdown += `| **Turns** | ${totalTurns} |\n`;
    markdown += `| **Input Tokens** | ${totalInput.toLocaleString()} |\n`;
    markdown += `| **Output Tokens** | ${totalOutput.toLocaleString()} |\n`;
    markdown += `| **Cached Tokens** | ${totalCached.toLocaleString()} |\n`;
    markdown += `| **Total Tokens** | ${grandTotal.toLocaleString()} |\n`;
    return markdown;
  } catch {
    return '';
  }
}

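
For orientation, the summary this function emits renders roughly as follows in the PR comment (numbers hypothetical, shown here exactly as the format strings above would produce them):

```
#### 📊 Usage Summary
| Metric | Total |
| :--- | :--- |
| **Turns** | 12 |
| **Input Tokens** | 1,240,000 |
| **Output Tokens** | 86,000 |
| **Cached Tokens** | 310,000 |
| **Total Tokens** | 1,636,000 |
```
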
@@ -15,6 +15,9 @@

import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { randomUUID } from 'node:crypto';

/**
 * Main execution logic.
@@ -26,6 +29,14 @@ async function main() {
  let combinedReport = '';
  let hasRegression = false;

  const usageLogPath = path.join(
    os.tmpdir(),
    `gemini-usage-regression-${randomUUID()}.jsonl`,
  );
  if (fs.existsSync(usageLogPath)) {
    fs.unlinkSync(usageLogPath);
  }

  console.log(
    `🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
  );
@@ -50,16 +61,30 @@ async function main() {
    }

    // 2. Run Frugal Regression Check
    console.log(`🧪 Running regression check for ${model}...`);
    console.log(`\n🚀 Executing regression tests for ${model}...`);
    const tmpUsageLog = path.join(
      os.tmpdir(),
      `gemini-usage-tmp-${model}-${randomUUID()}.jsonl`,
    );
    const env = { ...process.env, GEMINI_EVAL_USAGE_LOG: tmpUsageLog };

    execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, {
      stdio: 'inherit',
      env,
    });

    if (fs.existsSync(tmpUsageLog)) {
      fs.appendFileSync(usageLogPath, fs.readFileSync(tmpUsageLog));
      fs.unlinkSync(tmpUsageLog);
    }

    // 3. Generate Report
    console.log(`📊 Generating report for ${model}...`);
    const reportEnv = { ...process.env, GEMINI_EVAL_USAGE_LOG: usageLogPath };
    const report = execSync(`node scripts/compare_evals.js "${model}"`, {
      encoding: 'utf-8',
      stdio: ['inherit', 'pipe', 'inherit'],
      env: reportEnv,
    }).trim();

    if (report) {
@@ -98,6 +123,10 @@ async function main() {
    console.log('\n✅ All evaluations passed successfully (or were cleared).');
  }

  if (fs.existsSync(usageLogPath)) {
    fs.unlinkSync(usageLogPath);
  }

  process.exit(0);
}

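
Taken together, the usage-log plumbing in this commit works as follows: the vitest global setup points each eval process at a fresh temp JSONL file via GEMINI_EVAL_USAGE_LOG; `logUsageMetrics` appends one record per eval run; the orchestration script above gives each model's regression pass its own temp log, merges those into a run-level log, points `compare_evals.js` at the merged file for the PR summary table, and deletes the log on exit; and the global teardown prints the per-test console report for local runs.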