mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-29 05:02:35 -07:00
feat(evals): add automated usage metrics reporting for behavioral evals
This commit is contained in:
@@ -106,14 +106,77 @@ function main() {
|
||||
markdown += '#### 3. Manual Fix\n';
|
||||
markdown +=
|
||||
'See the [Fixing Guide](https://github.com/google-gemini/gemini-cli/blob/main/evals/README.md#fixing-evaluations) for detailed troubleshooting steps.\n';
|
||||
markdown += '</details>\n';
|
||||
markdown += '</details>\n\n';
|
||||
|
||||
markdown += getUsageSummaryMarkdown();
|
||||
|
||||
process.stdout.write(markdown);
|
||||
} else if (passes.length > 0) {
|
||||
// Success State
|
||||
process.stdout.write(
|
||||
`✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n`,
|
||||
);
|
||||
let markdown = `✅ **${passes.length}** tests passed successfully on **${targetModel}**.\n\n`;
|
||||
markdown += getUsageSummaryMarkdown();
|
||||
process.stdout.write(markdown);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Generates a Markdown summary of usage metrics if available.
 *
 * Reads a JSONL usage log (one JSON object per line) from the path given by
 * the `GEMINI_EVAL_USAGE_LOG` environment variable, falling back to
 * `evals/logs/usage-metrics.jsonl` under the current working directory, and
 * sums the `turns`, `input`, `output`, `cached` and `total` fields of every
 * entry into a single Markdown table.
 *
 * The summary is strictly best-effort: a missing, unreadable, empty, or
 * entirely unparsable log yields an empty string so the surrounding report is
 * still produced.
 *
 * @returns {string} The Markdown table, or `''` when no usable metrics exist.
 */
function getUsageSummaryMarkdown() {
  const usageLogPath =
    process.env['GEMINI_EVAL_USAGE_LOG'] ||
    path.resolve(process.cwd(), 'evals/logs/usage-metrics.jsonl');

  // The usage log is optional; when it is absent the summary is skipped.
  if (!fs.existsSync(usageLogPath)) {
    return '';
  }

  // Accept finite numbers (and numeric strings); anything else counts as 0 so
  // a malformed field can never turn `+=` into string concatenation or NaN.
  const toCount = (value) => {
    const candidate =
      typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
    return typeof candidate === 'number' && Number.isFinite(candidate)
      ? candidate
      : 0;
  };

  // Pin the locale so the report renders identically on every machine.
  const formatCount = (value) => value.toLocaleString('en-US');

  try {
    const entries = fs
      .readFileSync(usageLogPath, 'utf-8')
      .split(/\r?\n/)
      .filter((line) => line.trim() !== '')
      .map((line) => {
        try {
          return JSON.parse(line);
        } catch {
          // Skip corrupt lines instead of discarding the whole log.
          return null;
        }
      })
      .filter(
        (entry) =>
          entry !== null && typeof entry === 'object' && !Array.isArray(entry),
      );

    // Nothing usable was recorded: emit no table rather than a row of zeros.
    if (entries.length === 0) {
      return '';
    }

    let totalTurns = 0;
    let totalInput = 0;
    let totalOutput = 0;
    let totalCached = 0;
    let grandTotal = 0;

    for (const entry of entries) {
      totalTurns += toCount(entry.turns);
      totalInput += toCount(entry.input);
      totalOutput += toCount(entry.output);
      totalCached += toCount(entry.cached);
      grandTotal += toCount(entry.total);
    }

    let markdown = '#### 📊 Usage Summary\n';
    markdown += `| Metric | Total |\n`;
    markdown += `| :--- | :--- |\n`;
    markdown += `| **Turns** | ${totalTurns} |\n`;
    markdown += `| **Input Tokens** | ${formatCount(totalInput)} |\n`;
    markdown += `| **Output Tokens** | ${formatCount(totalOutput)} |\n`;
    markdown += `| **Cached Tokens** | ${formatCount(totalCached)} |\n`;
    markdown += `| **Total Tokens** | ${formatCount(grandTotal)} |\n`;
    return markdown;
  } catch {
    // Deliberate best-effort: an unreadable log must not break the report.
    return '';
  }
}
|
||||
|
||||
|
||||
@@ -15,6 +15,9 @@
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import os from 'node:os';
|
||||
import { randomUUID } from 'node:crypto';
|
||||
|
||||
/**
|
||||
* Main execution logic.
|
||||
@@ -26,6 +29,14 @@ async function main() {
|
||||
let combinedReport = '';
|
||||
let hasRegression = false;
|
||||
|
||||
const usageLogPath = path.join(
|
||||
os.tmpdir(),
|
||||
`gemini-usage-regression-${randomUUID()}.jsonl`,
|
||||
);
|
||||
if (fs.existsSync(usageLogPath)) {
|
||||
fs.unlinkSync(usageLogPath);
|
||||
}
|
||||
|
||||
console.log(
|
||||
`🚀 Starting evaluation orchestration for models: ${models.join(', ')}`,
|
||||
);
|
||||
@@ -50,16 +61,30 @@ async function main() {
|
||||
}
|
||||
|
||||
// 2. Run Frugal Regression Check
|
||||
console.log(`🧪 Running regression check for ${model}...`);
|
||||
console.log(`\n🚀 Executing regression tests for ${model}...`);
|
||||
const tmpUsageLog = path.join(
|
||||
os.tmpdir(),
|
||||
`gemini-usage-tmp-${model}-${randomUUID()}.jsonl`,
|
||||
);
|
||||
const env = { ...process.env, GEMINI_EVAL_USAGE_LOG: tmpUsageLog };
|
||||
|
||||
execSync(`node scripts/run_regression_check.js "${model}" "${output}"`, {
|
||||
stdio: 'inherit',
|
||||
env,
|
||||
});
|
||||
|
||||
if (fs.existsSync(tmpUsageLog)) {
|
||||
fs.appendFileSync(usageLogPath, fs.readFileSync(tmpUsageLog));
|
||||
fs.unlinkSync(tmpUsageLog);
|
||||
}
|
||||
|
||||
// 3. Generate Report
|
||||
console.log(`📊 Generating report for ${model}...`);
|
||||
const reportEnv = { ...process.env, GEMINI_EVAL_USAGE_LOG: usageLogPath };
|
||||
const report = execSync(`node scripts/compare_evals.js "${model}"`, {
|
||||
encoding: 'utf-8',
|
||||
stdio: ['inherit', 'pipe', 'inherit'],
|
||||
env: reportEnv,
|
||||
}).trim();
|
||||
|
||||
if (report) {
|
||||
@@ -98,6 +123,10 @@ async function main() {
|
||||
console.log('\n✅ All evaluations passed successfully (or were cleared).');
|
||||
}
|
||||
|
||||
if (fs.existsSync(usageLogPath)) {
|
||||
fs.unlinkSync(usageLogPath);
|
||||
}
|
||||
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user