mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-17 17:41:24 -07:00
Add script for running nightly evals locally.
This commit is contained in:
@@ -114,6 +114,27 @@ npm run test:all_evals
|
||||
This command sets the `RUN_EVALS` environment variable to `1`, which enables the
|
||||
`USUALLY_PASSES` tests.
|
||||
|
||||
### All Evals (All Models)
|
||||
|
||||
To run the full evaluation suite across all supported models and generate a
|
||||
local markdown report (mirroring the nightly CI workflow):
|
||||
|
||||
```bash
|
||||
npm run test:all_evals_all_models
|
||||
```
|
||||
|
||||
This script will:
|
||||
1. Build the project.
|
||||
2. Run `test:all_evals` for each model in the nightly rotation.
|
||||
3. Collect logs and aggregate them using `scripts/aggregate_evals.js`.
|
||||
4. Generate a `local_evals_summary.md` file with the results.
|
||||
|
||||
You can also filter by test name and specify the number of attempts:
|
||||
|
||||
```bash
|
||||
npm run test:all_evals_all_models -- "my-test-pattern" --attempts 3
|
||||
```
|
||||
|
||||
## Reporting
|
||||
|
||||
Results for evaluations are available on GitHub Actions:
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
"test:scripts": "vitest run --config ./scripts/tests/vitest.config.ts",
|
||||
"test:always_passing_evals": "vitest run --config evals/vitest.config.ts",
|
||||
"test:all_evals": "cross-env RUN_EVALS=1 vitest run --config evals/vitest.config.ts",
|
||||
"test:all_evals_all_models": "node scripts/run_local_evals.js",
|
||||
"test:e2e": "cross-env VERBOSE=true KEEP_OUTPUT=true npm run test:integration:sandbox:none",
|
||||
"test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman",
|
||||
"test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests",
|
||||
|
||||
@@ -12,6 +12,7 @@ import { execSync } from 'node:child_process';
|
||||
import os from 'node:os';
|
||||
|
||||
const artifactsDir = process.argv[2] || '.';
|
||||
const outputFile = process.argv[3];
|
||||
const MAX_HISTORY = 10;
|
||||
|
||||
// Find all report.json files recursively
|
||||
@@ -145,10 +146,8 @@ function fetchHistoricalData() {
|
||||
}
|
||||
|
||||
function generateMarkdown(currentStatsByModel, history) {
|
||||
console.log('### Evals Nightly Summary\n');
|
||||
console.log(
|
||||
'See [evals/README.md](https://github.com/google-gemini/gemini-cli/tree/main/evals) for more details.\n',
|
||||
);
|
||||
let md = '### Evals Nightly Summary\n\n';
|
||||
md += 'See [evals/README.md](https://github.com/google-gemini/gemini-cli/tree/main/evals) for more details.\n\n';
|
||||
|
||||
// Reverse history to show oldest first
|
||||
const reversedHistory = [...history].reverse();
|
||||
@@ -171,8 +170,8 @@ function generateMarkdown(currentStatsByModel, history) {
|
||||
? ((totalStats.passed / totalStats.total) * 100).toFixed(1) + '%'
|
||||
: 'N/A';
|
||||
|
||||
console.log(`#### Model: ${model}`);
|
||||
console.log(`**Total Pass Rate: ${totalPassRate}**\n`);
|
||||
md += `#### Model: ${model}\n`;
|
||||
md += `**Total Pass Rate: ${totalPassRate}**\n\n`;
|
||||
|
||||
// Header
|
||||
let header = '| Test Name |';
|
||||
@@ -187,8 +186,8 @@ function generateMarkdown(currentStatsByModel, history) {
|
||||
header += ' Current |';
|
||||
separator += ' :---: |';
|
||||
|
||||
console.log(header);
|
||||
console.log(separator);
|
||||
md += header + '\n';
|
||||
md += separator + '\n';
|
||||
|
||||
// Collect all test names for this model
|
||||
const allTestNames = new Set(Object.keys(currentStats));
|
||||
@@ -224,23 +223,28 @@ function generateMarkdown(currentStatsByModel, history) {
|
||||
row += ' - |';
|
||||
}
|
||||
|
||||
console.log(row);
|
||||
md += row + '\n';
|
||||
}
|
||||
console.log('\n');
|
||||
md += '\n';
|
||||
}
|
||||
return md;
|
||||
}
|
||||
|
||||
// --- Main ---
|
||||
|
||||
const currentReports = findReports(artifactsDir);
|
||||
if (currentReports.length === 0) {
|
||||
console.log('No reports found.');
|
||||
// We don't exit here because we might still want to see history if available,
|
||||
// but practically if current has no reports, something is wrong.
|
||||
// Sticking to original behavior roughly, but maybe we can continue.
|
||||
console.error('No reports found.');
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const currentStats = getStats(currentReports);
|
||||
const history = fetchHistoricalData();
|
||||
generateMarkdown(currentStats, history);
|
||||
const markdown = generateMarkdown(currentStats, history);
|
||||
|
||||
if (outputFile) {
|
||||
fs.writeFileSync(outputFile, markdown);
|
||||
console.log(`Summary written to ${outputFile}`);
|
||||
} else {
|
||||
console.log(markdown);
|
||||
}
|
||||
|
||||
141
scripts/run_local_evals.js
Executable file
141
scripts/run_local_evals.js
Executable file
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { execSync } from 'node:child_process';
|
||||
|
||||
/**
 * Models evaluated by the local run, in execution order. The eval suite is
 * run once per attempt for every entry in this list.
 */
const models = [
  'gemini-3-pro-preview',
  'gemini-3-flash-preview',
  'gemini-2.5-pro',
  'gemini-2.5-flash',
  'gemini-2.5-flash-lite',
];

// Both locations are resolved against the current working directory.
/** Directory that receives one copy of the eval logs per model/attempt. */
const artifactsDir = path.resolve(process.cwd(), 'artifacts');
/** Directory whose contents are copied into the artifacts after each run. */
const logsDir = path.resolve(process.cwd(), 'evals/logs');
|
||||
|
||||
/**
 * Parses the command-line arguments accepted by this script.
 *
 * Recognised arguments:
 * - `--attempts <n>`: number of runs per model; must be an integer >= 1.
 * - any argument not starting with `-`: the test pattern (the last one wins).
 * Other dash-prefixed arguments are ignored.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{ testPattern: string, attempts: number }} Parsed options;
 *   `testPattern` defaults to `''` and `attempts` defaults to `1`.
 * @throws {Error} If `--attempts` has no value or its value is not a positive
 *   integer.
 */
function parseCliArgs(argv) {
  let testPattern = '';
  let attempts = 1;

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === '--attempts') {
      const rawValue = argv[i + 1];
      const parsed = Number.parseInt(rawValue, 10);
      // Without this check a missing/invalid value becomes NaN (or <= 0) and
      // the eval loop would silently run zero times.
      if (!Number.isInteger(parsed) || parsed < 1) {
        throw new Error(
          `--attempts expects a positive integer, got: ${String(rawValue)}`,
        );
      }
      attempts = parsed;
      i++; // Skip the value that was just consumed.
    } else if (!arg.startsWith('-')) {
      testPattern = arg;
    }
  }

  return { testPattern, attempts };
}

// Parse arguments
const args = process.argv.slice(2);
let testPattern = '';
let attempts = 1;

try {
  ({ testPattern, attempts } = parseCliArgs(args));
} catch (e) {
  console.error(`Error: ${e.message}`);
  process.exit(1);
}
|
||||
|
||||
/**
 * Stops the script with exit code 1 when the GEMINI_API_KEY environment
 * variable is missing or empty.
 */
function requireApiKey() {
  const { GEMINI_API_KEY: apiKey } = process.env;
  if (apiKey) {
    return;
  }
  console.error('Error: GEMINI_API_KEY environment variable is not set.');
  process.exit(1);
}

/**
 * Deletes any existing artifacts directory (announcing that it does so) and
 * creates a fresh, empty one.
 */
function resetArtifactsDir() {
  const hasStaleArtifacts = fs.existsSync(artifactsDir);
  if (hasStaleArtifacts) {
    console.log(`Cleaning artifacts directory: ${artifactsDir}`);
    fs.rmSync(artifactsDir, { recursive: true, force: true });
  }
  fs.mkdirSync(artifactsDir);
}

/**
 * Runs `npm run build` with inherited stdio and exits with code 1 if the
 * command fails.
 */
function buildProject() {
  console.log('Building project...');
  try {
    execSync('npm run build', { stdio: 'inherit' });
  } catch {
    console.error('Build failed.');
    process.exit(1);
  }
}

// Preflight steps, in the order they must happen before any eval is run.
requireApiKey();
resetArtifactsDir();
buildProject();

console.log(`\nStarting evals with ${attempts} attempt(s) per model.`);
|
||||
|
||||
/**
 * Builds the shell command used to run the eval suite.
 *
 * A pattern that ends in `.ts`/`.js` or contains a `/` is forwarded as a
 * positional argument; any other non-empty pattern is forwarded through `-t`
 * so that it filters by test name. An empty pattern adds nothing.
 *
 * NOTE(review): the pattern is interpolated into a shell string inside double
 * quotes, so a pattern containing `"` or shell metacharacters is not escaped.
 *
 * @param {string} pattern - Test pattern from the command line (may be '').
 * @returns {string} The command line to execute.
 */
function buildEvalCommand(pattern) {
  const baseCommand = 'npm run test:all_evals';
  if (!pattern) {
    return baseCommand;
  }
  const looksLikePath =
    ['.ts', '.js'].some((suffix) => pattern.endsWith(suffix)) ||
    pattern.includes('/');
  return looksLikePath
    ? `${baseCommand} -- "${pattern}"`
    : `${baseCommand} -- -t "${pattern}"`;
}

/**
 * Runs the eval suite once for a single model and stores the resulting logs
 * under `<artifactsDir>/eval-logs-<model>-<attempt>`.
 *
 * A failing eval command is reported but does not stop the script, so the
 * logs of failed runs are still collected.
 *
 * @param {string} model - Value exported as GEMINI_MODEL for the run.
 * @param {number} attempt - 1-based attempt number for this model.
 */
function runEvalAttempt(model, attempt) {
  const rule = '--------------------------------------------------';
  console.log(`\n${rule}`);
  console.log(`Running evals for ${model} (Attempt ${attempt}/${attempts})`);
  console.log(`${rule}\n`);

  // Start from an empty logs directory so only this run's logs get copied.
  if (fs.existsSync(logsDir)) {
    fs.rmSync(logsDir, { recursive: true, force: true });
  }
  fs.mkdirSync(logsDir, { recursive: true });

  const runEnv = {
    ...process.env,
    GEMINI_MODEL: model,
    RUN_EVALS: 'true',
  };

  try {
    execSync(buildEvalCommand(testPattern), { stdio: 'inherit', env: runEnv });
  } catch {
    console.log(
      `\nEvals for ${model} (Attempt ${attempt}) finished with failures.`,
    );
  }

  // Artifact directory format: eval-logs-<model>-<attempt>
  const artifactPath = path.join(artifactsDir, `eval-logs-${model}-${attempt}`);

  // The logs directory may have been removed while the evals were running.
  if (!fs.existsSync(logsDir)) {
    console.error(`Warning: No logs found in ${logsDir}`);
    return;
  }
  console.log(`Copying logs to ${artifactPath}`);
  fs.cpSync(logsDir, artifactPath, { recursive: true });
}

for (const model of models) {
  for (let attempt = 1; attempt <= attempts; attempt++) {
    runEvalAttempt(model, attempt);
  }
}
|
||||
|
||||
console.log('\n--------------------------------------------------');
console.log('Aggregating results...');
console.log('--------------------------------------------------\n');

// Aggregate every collected artifact into a single markdown summary by
// running scripts/aggregate_evals.js with the artifacts directory and the
// output file as its two arguments, then print the summary as a preview.
try {
  const summaryFile = 'local_evals_summary.md';
  execSync(`node scripts/aggregate_evals.js "${artifactsDir}" "${summaryFile}"`, {
    stdio: 'inherit',
    env: {
      ...process.env,
    },
  });

  console.log(`\nSummary written to ${summaryFile}`);
  console.log('\nPreview:\n');
  console.log(fs.readFileSync(summaryFile, 'utf-8'));
} catch (e) {
  console.error('Aggregation failed:', e);
  // Report the failure through the exit status (as the build step does)
  // instead of finishing with a success code and no summary.
  process.exitCode = 1;
}
|
||||
Reference in New Issue
Block a user