feat(evals): add automated usage metrics reporting for behavioral evals

This commit is contained in:
Michael Bleigh
2026-04-09 16:39:55 -07:00
parent a7a091360e
commit 770db15ca5
10 changed files with 404 additions and 5 deletions
+67 -4
View File
@@ -106,14 +106,77 @@ function main() {
markdown += '#### 3. Manual Fix\n';
markdown +=
'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n';
markdown += '</details>\n';
markdown += '</details>\n\n';
markdown += getUsageSummaryMarkdown();
process.stdout.write(markdown);
} else if (passes.length > 0) {
// Success State
process.stdout.write(
`✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n`,
);
let markdown = `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n\n`;
markdown += getUsageSummaryMarkdown();
process.stdout.write(markdown);
}
}
/**
 * Generates a Markdown summary of usage metrics if available.
 *
 * Reads a JSON Lines usage log (one JSON object per line) and sums the
 * `turns`, `input`, `output`, `cached` and `total` fields across all records.
 * The log location is taken from the `GEMINI_EVAL_USAGE_LOG` environment
 * variable, falling back to `evals/logs/usage-metrics.jsonl` under the
 * current working directory.
 *
 * This is best-effort reporting: it never throws. Malformed lines and
 * non-object records are skipped, and fields that are not finite numbers
 * contribute nothing to the totals.
 *
 * @returns {string} A Markdown table of the aggregated totals, or an empty
 *   string when the log is missing, unreadable, or has no usable records.
 */
function getUsageSummaryMarkdown() {
  const usageLogPath =
    process.env['GEMINI_EVAL_USAGE_LOG'] ||
    path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');

  // A missing log is an expected state (no metrics were collected for this
  // run), so the summary section is simply omitted without any warning.
  if (!fs.existsSync(usageLogPath)) {
    return '';
  }

  let rawLog;
  try {
    rawLog = fs.readFileSync(usageLogPath, 'utf-8');
  } catch (err) {
    // Stay best-effort, but leave a trace on stderr so a broken log is
    // diagnosable; stdout is reserved for the generated report.
    process.stderr.write(
      `Warning: could not read usage log ${usageLogPath}: ${err.message}\n`,
    );
    return '';
  }

  const entries = [];
  for (const line of rawLog.split('\n')) {
    if (line.trim() === '') continue;
    try {
      const parsed = JSON.parse(line);
      // Only plain JSON objects can carry the metric fields; scalars, arrays
      // and `null` are valid JSON but are not usage records.
      if (
        parsed !== null &&
        typeof parsed === 'object' &&
        !Array.isArray(parsed)
      ) {
        entries.push(parsed);
      }
    } catch {
      // Tolerate a partially written or corrupted line; keep the rest.
    }
  }

  // Without a single usable record there is nothing meaningful to report;
  // an all-zero table would be misleading.
  if (entries.length === 0) return '';

  // Accept only finite numbers so that a stray string value cannot turn the
  // running total into a string through `+=` concatenation.
  const asCount = (value) =>
    typeof value === 'number' && Number.isFinite(value) ? value : 0;

  let totalTurns = 0;
  let totalInput = 0;
  let totalOutput = 0;
  let totalCached = 0;
  let grandTotal = 0;
  for (const entry of entries) {
    totalTurns += asCount(entry.turns);
    totalInput += asCount(entry.input);
    totalOutput += asCount(entry.output);
    totalCached += asCount(entry.cached);
    grandTotal += asCount(entry.total);
  }

  // Pin the locale so the report renders identically on every machine.
  const formatCount = (value) => value.toLocaleString('en-US');

  let markdown = '#### 📊 Usage Summary\n';
  markdown += `| Metric | Total |\n`;
  markdown += `| :--- | :--- |\n`;
  markdown += `| **Turns** | ${totalTurns} |\n`;
  markdown += `| **Input Tokens** | ${formatCount(totalInput)} |\n`;
  markdown += `| **Output Tokens** | ${formatCount(totalOutput)} |\n`;
  markdown += `| **Cached Tokens** | ${formatCount(totalCached)} |\n`;
  markdown += `| **Total Tokens** | ${formatCount(grandTotal)} |\n`;
  return markdown;
}
+30 -1
View File
@@ -15,6 +15,9 @@
import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import { randomUUID } from 'node:crypto';
/**
* Main execution logic.
@@ -26,6 +29,14 @@ async function main() {
let combinedReport = '';
let hasRegression = false;
const usageLogPath = path.join(
os.tmpdir(),
`gemini-usage-regression-${randomUUID()}.jsonl`,
);
if (fs.existsSync(usageLogPath)) {
fs.unlinkSync(usageLogPath);
}
console.log(
`🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
);
@@ -50,16 +61,30 @@ async function main() {
}
// 2. Run Frugal Regression Check
console.log(`🧪 Running regression check for ${model}...`);
console.log(`\n🚀 Executing regression tests for ${model}...`);
const tmpUsageLog = path.join(
os.tmpdir(),
`gemini-usage-tmp-${model}-${randomUUID()}.jsonl`,
);
const env = { ...process.env, GEMINI_EVAL_USAGE_LOG: tmpUsageLog };
execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, {
stdio: 'inherit',
env,
});
if (fs.existsSync(tmpUsageLog)) {
fs.appendFileSync(usageLogPath, fs.readFileSync(tmpUsageLog));
fs.unlinkSync(tmpUsageLog);
}
// 3. Generate Report
console.log(`📊 Generating report for ${model}...`);
const reportEnv = { ...process.env, GEMINI_EVAL_USAGE_LOG: usageLogPath };
const report = execSync(`node scripts/compare_evals.js "${model}"`, {
encoding: 'utf-8',
stdio: ['inherit', 'pipe', 'inherit'],
env: reportEnv,
}).trim();
if (report) {
@@ -98,6 +123,10 @@ async function main() {
console.log('\n✅ All evaluations passed successfully (or were cleared).');
}
if (fs.existsSync(usageLogPath)) {
fs.unlinkSync(usageLogPath);
}
process.exit(0);
}