/**
 * Absolute path of the JSONL usage log for the current eval run.
 * Undefined until `setup()` has executed.
 */
let usageLogPath: string | undefined;

/** Numeric counters aggregated from usage log entries. */
interface UsageTotals {
  turns: number;
  input: number;
  output: number;
  cached: number;
  total: number;
}

/** Aggregated counters for one `testName (model)` pair plus its pass state. */
interface TestUsage extends UsageTotals {
  passed: boolean;
}

const USAGE_FIELDS = ['turns', 'input', 'output', 'cached', 'total'] as const;

/**
 * Global setup hook: allocates a unique usage log file in the OS temp
 * directory and publishes its path through `GEMINI_EVAL_USAGE_LOG`.
 */
export function setup(): void {
  usageLogPath = path.join(os.tmpdir(), `gemini-usage-${randomUUID()}.jsonl`);
  process.env['GEMINI_EVAL_USAGE_LOG'] = usageLogPath;
}

/**
 * Global teardown hook: prints the aggregated usage report (when a log was
 * written) and always removes the temporary log file afterwards.
 * Report failures are logged, never thrown, so they cannot fail the run.
 */
export function teardown(): void {
  const logPath = usageLogPath;
  if (!logPath || !fs.existsSync(logPath)) {
    return;
  }
  try {
    generateAndPrintReport(logPath);
  } catch (e) {
    console.error('Failed to generate usage report', e);
  } finally {
    try {
      fs.unlinkSync(logPath);
    } catch {
      // Ignore cleanup errors
    }
  }
}

/**
 * Formats a number with human-readable suffixes (K, M), keeping at most one
 * decimal place and dropping a trailing ".0".
 *
 * @param num The value to format.
 * @returns e.g. `999`, `1.5K`, `2M`.
 */
function formatNumber(num: number): string {
  const scaled = (value: number, suffix: string): string =>
    value.toFixed(1).replace(/\.0$/, '') + suffix;

  if (num >= 1000000) {
    return scaled(num / 1000000, 'M');
  }
  if (num >= 1000) {
    // Rounding to one decimal can carry out of the K range (999,999 would
    // otherwise render as "1000K"); promote such values to M instead.
    if (Number((num / 1000).toFixed(1)) >= 1000) {
      return scaled(num / 1000000, 'M');
    }
    return scaled(num / 1000, 'K');
  }
  return num.toString();
}

/** Returns `value` when it is a finite number, otherwise 0. */
function toCount(value: unknown): number {
  return typeof value === 'number' && Number.isFinite(value) ? value : 0;
}

/**
 * Parses JSONL text into plain-object entries. Blank lines, lines that are
 * not valid JSON, and JSON values that are not objects are skipped.
 */
function parseUsageEntries(raw: string): Array<Record<string, unknown>> {
  const entries: Array<Record<string, unknown>> = [];
  for (const line of raw.split('\n')) {
    if (line.trim() === '') continue;
    try {
      const parsed: unknown = JSON.parse(line);
      if (
        typeof parsed === 'object' &&
        parsed !== null &&
        !Array.isArray(parsed)
      ) {
        entries.push(parsed as Record<string, unknown>);
      }
    } catch {
      // Skip partially written or corrupted lines.
    }
  }
  return entries;
}

/**
 * Reads the usage log at `logPath`, aggregates entries per
 * `testName (model)` pair and prints a per-test breakdown followed by suite
 * totals to stdout. Prints nothing when the log holds no usable entries.
 *
 * @param logPath Path of the JSONL usage log to summarize.
 */
function generateAndPrintReport(logPath: string): void {
  const entries = parseUsageEntries(fs.readFileSync(logPath, 'utf-8'));
  if (entries.length === 0) return;

  const perTest = new Map<string, TestUsage>();
  const suite: UsageTotals = {
    turns: 0,
    input: 0,
    output: 0,
    cached: 0,
    total: 0,
  };

  for (const entry of entries) {
    const key = `${String(entry['testName'])} (${String(entry['model'])})`;
    let usage = perTest.get(key);
    if (!usage) {
      usage = { turns: 0, input: 0, output: 0, cached: 0, total: 0, passed: true };
      perTest.set(key, usage);
    }
    for (const field of USAGE_FIELDS) {
      const count = toCount(entry[field]);
      usage[field] += count;
      suite[field] += count;
    }
    // If any attempt failed for this test/model combo in this run, mark as failed
    if (entry['passed'] === false) {
      usage.passed = false;
    }
  }

  console.log('\n📊 Behavioral Eval Usage Report');
  console.log('===============================');

  for (const key of [...perTest.keys()].sort()) {
    const s = perTest.get(key);
    if (!s) continue;
    const status = s.passed ? '✅' : '❌';
    console.log(`${status} ${key}`);
    console.log(
      `   > turns: ${s.turns}, input: ${formatNumber(s.input)}, output: ${formatNumber(s.output)}, cached: ${formatNumber(s.cached)}, total: ${formatNumber(s.total)}`,
    );
  }

  console.log('\n📈 Suite Totals');
  console.log('--------------');
  console.log(`Total Turns: ${suite.turns}`);
  console.log(`Input Tokens: ${formatNumber(suite.input)}`);
  console.log(`Output Tokens: ${formatNumber(suite.output)}`);
  console.log(`Cached Tokens: ${formatNumber(suite.cached)}`);
  console.log(`Total Tokens: ${formatNumber(suite.total)}`);
  console.log('');
}
/**
 * Appends a single usage record for one eval run to the usage log.
 *
 * The record is written as one JSON line containing a timestamp, the test
 * name, the model under evaluation, the pass/fail flag and the supplied
 * counters. The destination is `GEMINI_EVAL_USAGE_LOG` when set, otherwise
 * `evals/logs/usage-metrics.jsonl` under the current working directory.
 * Write failures are reported on stderr and never thrown.
 *
 * @param testName Name of the eval case the metrics belong to.
 * @param metrics Turn and token counters collected for the run.
 * @param passed Whether the eval run succeeded.
 */
export function logUsageMetrics(
  testName: string,
  metrics: {
    turns: number;
    input: number;
    output: number;
    cached: number;
    total: number;
  },
  passed: boolean,
) {
  const record = {
    timestamp: new Date().toISOString(),
    testName,
    model: EVAL_MODEL,
    passed,
    ...metrics,
  };

  try {
    const destination =
      process.env['GEMINI_EVAL_USAGE_LOG'] ||
      path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');

    // Make sure the parent directory exists before appending.
    fs.mkdirSync(path.dirname(destination), { recursive: true });
    fs.appendFileSync(destination, JSON.stringify(record) + '\n');
  } catch (logError) {
    console.error('Failed to write usage log:', logError);
  }
}
+ */ + getUsageMetrics(): UsageMetrics { + return getUsageMetrics(this.testDir); + } + getPendingConfirmations() { return Array.from(this.pendingConfirmations.values()); } diff --git a/packages/test-utils/src/index.ts b/packages/test-utils/src/index.ts index e851e7ab8d..2d194692ba 100644 --- a/packages/test-utils/src/index.ts +++ b/packages/test-utils/src/index.ts @@ -13,3 +13,4 @@ export * from './mock-utils.js'; export * from './test-mcp-server.js'; export * from './test-rig.js'; export * from './env-setup.js'; +export * from './usage-metrics-utils.js'; diff --git a/packages/test-utils/src/test-rig.ts b/packages/test-utils/src/test-rig.ts index 734c1b9546..142ec35aa5 100644 --- a/packages/test-utils/src/test-rig.ts +++ b/packages/test-utils/src/test-rig.ts @@ -17,6 +17,7 @@ import * as pty from '@lydell/node-pty'; import stripAnsi from 'strip-ansi'; import * as os from 'node:os'; import type { TestMcpConfig } from './test-mcp-server.js'; +import { getUsageMetrics, type UsageMetrics } from './usage-metrics-utils.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); const BUNDLE_PATH = join(__dirname, '..', '..', '..', 'bundle/gemini.js'); @@ -1516,6 +1517,23 @@ export class TestRig { return run; } + /** + * Scans the session recordings and extracts aggregate token usage and turn counts. + * This provides an empirical measure of the agent's efficiency and cost. 
+ */ + getUsageMetrics(): UsageMetrics { + if (!this.homeDir) { + return { + turns: 0, + input: 0, + output: 0, + cached: 0, + total: 0, + }; + } + return getUsageMetrics(this.homeDir); + } + readHookLogs() { const parsedLogs = this._readAndParseTelemetryLog(); const logs: { diff --git a/packages/test-utils/src/usage-metrics-utils.ts b/packages/test-utils/src/usage-metrics-utils.ts new file mode 100644 index 0000000000..cbedf396c4 --- /dev/null +++ b/packages/test-utils/src/usage-metrics-utils.ts @@ -0,0 +1,88 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import fs from 'node:fs'; +import { join } from 'node:path'; + +export interface UsageMetrics { + turns: number; + input: number; + output: number; + cached: number; + total: number; +} + +/** + * Scans the session recordings and extracts aggregate token usage and turn counts. + * This provides an empirical measure of the agent's efficiency and cost. + * + * @param tempDir The directory containing the .gemini/tmp folder with recordings. 
/**
 * Scans the session recordings and extracts aggregate token usage and turn counts.
 *
 * Every directory named `chats` found anywhere below `<tempDir>/.gemini/tmp`
 * is scanned (including its subdirectories) for `session-*.json` files.
 *
 * @param tempDir The directory containing the .gemini/tmp folder with recordings.
 * @returns Aggregated metrics; all zeros when no recordings directory exists.
 */
export function getUsageMetrics(tempDir: string): UsageMetrics {
  const metrics: UsageMetrics = {
    turns: 0,
    input: 0,
    output: 0,
    cached: 0,
    total: 0,
  };

  const geminiTmpDir = join(tempDir, '.gemini', 'tmp');
  if (!fs.existsSync(geminiTmpDir)) {
    return metrics;
  }

  const processDir = (dir: string): void => {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      if (!entry.isDirectory()) continue;
      const fullPath = join(dir, entry.name);
      if (entry.name === 'chats') {
        processChatsDir(fullPath, metrics);
      } else {
        processDir(fullPath);
      }
    }
  };

  processDir(geminiTmpDir);
  return metrics;
}

/** Returns `value` when it is a finite number, otherwise 0. */
function tokenCount(value: unknown): number {
  return typeof value === 'number' && Number.isFinite(value) ? value : 0;
}

/**
 * Adds the usage recorded in every `session-*.json` file under `chatsDir`
 * (recursively) to `metrics`.
 *
 * Each message whose `type` is `'gemini'` counts as one turn; its `tokens`
 * counters are added when present. Unreadable or unparsable files are
 * skipped. Malformed message entries and non-numeric counters are ignored
 * individually, so one bad entry neither discards the rest of its file nor
 * turns a numeric total into a concatenated string.
 *
 * @param chatsDir Directory holding session recordings.
 * @param metrics Accumulator that is updated in place.
 */
function processChatsDir(chatsDir: string, metrics: UsageMetrics): void {
  for (const entry of fs.readdirSync(chatsDir, { withFileTypes: true })) {
    const fullPath = join(chatsDir, entry.name);
    if (entry.isDirectory()) {
      // Handle subagent sessions stored in subdirectories
      processChatsDir(fullPath, metrics);
      continue;
    }
    if (!entry.name.startsWith('session-') || !entry.name.endsWith('.json')) {
      continue;
    }

    let data: unknown;
    try {
      data = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
    } catch {
      // Ignore parse errors for partially written or corrupted files
      continue;
    }
    if (typeof data !== 'object' || data === null) continue;

    const messages = (data as { messages?: unknown }).messages;
    if (!Array.isArray(messages)) continue;

    for (const msg of messages) {
      if (typeof msg !== 'object' || msg === null || msg.type !== 'gemini') {
        continue;
      }
      metrics.turns++;
      const tokens = msg.tokens;
      if (typeof tokens === 'object' && tokens !== null) {
        metrics.input += tokenCount(tokens.input);
        metrics.output += tokenCount(tokens.output);
        metrics.cached += tokenCount(tokens.cached);
        metrics.total += tokenCount(tokens.total);
      }
    }
  }
}
/**
 * Generates a Markdown summary of usage metrics if available.
 *
 * Reads the JSONL usage log named by `GEMINI_EVAL_USAGE_LOG` (falling back to
 * `evals/logs/usage-metrics.jsonl` under the current working directory), sums
 * the counters of every entry and renders them as a Markdown table.
 *
 * Numbers are formatted with the fixed `en-US` locale so the generated report
 * does not depend on the locale of the machine that runs the script.
 *
 * @returns The Markdown table, or an empty string when the log is missing,
 *   unreadable, or contains no usable entries.
 */
function getUsageSummaryMarkdown() {
  const usageLogPath =
    process.env['GEMINI_EVAL_USAGE_LOG'] ||
    path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');

  // The usage log is optional: skip the summary when it was not produced.
  if (!fs.existsSync(usageLogPath)) {
    return '';
  }

  const toCount = (value) =>
    typeof value === 'number' && Number.isFinite(value) ? value : 0;
  const formatCount = (value) => value.toLocaleString('en-US');

  try {
    const entries = [];
    for (const line of fs.readFileSync(usageLogPath, 'utf-8').split('\n')) {
      if (line.trim() === '') continue;
      try {
        const parsed = JSON.parse(line);
        if (typeof parsed === 'object' && parsed !== null) {
          entries.push(parsed);
        }
      } catch {
        // Skip partially written or corrupted lines.
      }
    }
    if (entries.length === 0) return '';

    const totals = { turns: 0, input: 0, output: 0, cached: 0, total: 0 };
    for (const entry of entries) {
      totals.turns += toCount(entry.turns);
      totals.input += toCount(entry.input);
      totals.output += toCount(entry.output);
      totals.cached += toCount(entry.cached);
      totals.total += toCount(entry.total);
    }

    return [
      '#### 📊 Usage Summary',
      '| Metric | Total |',
      '| :--- | :--- |',
      `| **Turns** | ${totals.turns} |`,
      `| **Input Tokens** | ${formatCount(totals.input)} |`,
      `| **Output Tokens** | ${formatCount(totals.output)} |`,
      `| **Cached Tokens** | ${formatCount(totals.cached)} |`,
      `| **Total Tokens** | ${formatCount(totals.total)} |`,
      '',
    ].join('\n');
  } catch {
    return '';
  }
}
@@ -26,6 +29,14 @@ async function main() { let combinedReport = ''; let hasRegression = false; + const usageLogPath = path.join( + os.tmpdir(), + `gemini-usage-regression-${randomUUID()}.jsonl`, + ); + if (fs.existsSync(usageLogPath)) { + fs.unlinkSync(usageLogPath); + } + console.log( `๐Ÿš€ Starting evaluation orchestration for models: ${models.join(', ')}`, ); @@ -50,16 +61,30 @@ async function main() { } // 2. Run Frugal Regression Check - console.log(`๐Ÿงช Running regression check for ${model}...`); + console.log(`\n๐Ÿš€ Executing regression tests for ${model}...`); + const tmpUsageLog = path.join( + os.tmpdir(), + `gemini-usage-tmp-${model}-${randomUUID()}.jsonl`, + ); + const env = { ...process.env, GEMINI_EVAL_USAGE_LOG: tmpUsageLog }; + execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, { stdio: 'inherit', + env, }); + if (fs.existsSync(tmpUsageLog)) { + fs.appendFileSync(usageLogPath, fs.readFileSync(tmpUsageLog)); + fs.unlinkSync(tmpUsageLog); + } + // 3. Generate Report console.log(`๐Ÿ“Š Generating report for ${model}...`); + const reportEnv = { ...process.env, GEMINI_EVAL_USAGE_LOG: usageLogPath }; const report = execSync(`node scripts/compare_evals.js "${model}"`, { encoding: 'utf-8', stdio: ['inherit', 'pipe', 'inherit'], + env: reportEnv, }).trim(); if (report) { @@ -98,6 +123,10 @@ async function main() { console.log('\nโœ… All evaluations passed successfully (or were cleared).'); } + if (fs.existsSync(usageLogPath)) { + fs.unlinkSync(usageLogPath); + } + process.exit(0); }