diff --git a/evals/README.md b/evals/README.md index eb3cf2be70..9f6c1115f9 100644 --- a/evals/README.md +++ b/evals/README.md @@ -114,6 +114,27 @@ npm run test:all_evals This command sets the `RUN_EVALS` environment variable to `1`, which enables the `USUALLY_PASSES` tests. +### All Evals (All Models) + +To run the full evaluation suite across all supported models and generate a +local markdown report (mirroring the nightly CI workflow): + +```bash +npm run test:all_evals_all_models +``` + +This script will: +1. Build the project. +2. Run `test:all_evals` for each model in the nightly rotation. +3. Collect logs and aggregate them using `scripts/aggregate_evals.js`. +4. Generate a `local_evals_summary.md` file with the results. + +You can also filter by test name and specify the number of attempts: + +```bash +npm run test:all_evals_all_models -- "my-test-pattern" --attempts 3 +``` + ## Reporting Results for evaluations are available on GitHub Actions: diff --git a/package.json b/package.json index 820ae04826..b6e90d1d3f 100644 --- a/package.json +++ b/package.json @@ -43,6 +43,7 @@ "test:scripts": "vitest run --config ./scripts/tests/vitest.config.ts", "test:always_passing_evals": "vitest run --config evals/vitest.config.ts", "test:all_evals": "cross-env RUN_EVALS=1 vitest run --config evals/vitest.config.ts", + "test:all_evals_all_models": "node scripts/run_local_evals.js", "test:e2e": "cross-env VERBOSE=true KEEP_OUTPUT=true npm run test:integration:sandbox:none", "test:integration:all": "npm run test:integration:sandbox:none && npm run test:integration:sandbox:docker && npm run test:integration:sandbox:podman", "test:integration:sandbox:none": "cross-env GEMINI_SANDBOX=false vitest run --root ./integration-tests", diff --git a/scripts/aggregate_evals.js b/scripts/aggregate_evals.js index d14596d487..42407a18b6 100644 --- a/scripts/aggregate_evals.js +++ b/scripts/aggregate_evals.js @@ -12,6 +12,7 @@ import { execSync } from 'node:child_process'; import os from 
'node:os'; const artifactsDir = process.argv[2] || '.'; +const outputFile = process.argv[3]; const MAX_HISTORY = 10; // Find all report.json files recursively @@ -145,10 +146,8 @@ function fetchHistoricalData() { } function generateMarkdown(currentStatsByModel, history) { - console.log('### Evals Nightly Summary\n'); - console.log( - 'See [evals/README.md](https://github.com/google-gemini/gemini-cli/tree/main/evals) for more details.\n', - ); + let md = '### Evals Nightly Summary\n\n'; + md += 'See [evals/README.md](https://github.com/google-gemini/gemini-cli/tree/main/evals) for more details.\n\n'; // Reverse history to show oldest first const reversedHistory = [...history].reverse(); @@ -171,8 +170,8 @@ function generateMarkdown(currentStatsByModel, history) { ? ((totalStats.passed / totalStats.total) * 100).toFixed(1) + '%' : 'N/A'; - console.log(`#### Model: ${model}`); - console.log(`**Total Pass Rate: ${totalPassRate}**\n`); + md += `#### Model: ${model}\n`; + md += `**Total Pass Rate: ${totalPassRate}**\n\n`; // Header let header = '| Test Name |'; @@ -187,8 +186,8 @@ function generateMarkdown(currentStatsByModel, history) { header += ' Current |'; separator += ' :---: |'; - console.log(header); - console.log(separator); + md += header + '\n'; + md += separator + '\n'; // Collect all test names for this model const allTestNames = new Set(Object.keys(currentStats)); @@ -224,23 +223,28 @@ function generateMarkdown(currentStatsByModel, history) { row += ' - |'; } - console.log(row); + md += row + '\n'; } - console.log('\n'); + md += '\n'; } + return md; } // --- Main --- const currentReports = findReports(artifactsDir); if (currentReports.length === 0) { - console.log('No reports found.'); - // We don't exit here because we might still want to see history if available, - // but practically if current has no reports, something is wrong. - // Sticking to original behavior roughly, but maybe we can continue. 
+  console.error('No reports found.');
   process.exit(0);
 }
 
 const currentStats = getStats(currentReports);
 const history = fetchHistoricalData();
-generateMarkdown(currentStats, history);
+const markdown = generateMarkdown(currentStats, history);
+
+if (outputFile) {
+  fs.writeFileSync(outputFile, markdown);
+  console.log(`Summary written to ${outputFile}`);
+} else {
+  console.log(markdown);
+}
diff --git a/scripts/run_local_evals.js b/scripts/run_local_evals.js
new file mode 100755
index 0000000000..bb10b9ca6e
--- /dev/null
+++ b/scripts/run_local_evals.js
@@ -0,0 +1,166 @@
+#!/usr/bin/env node
+
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Runs the eval suite locally against every model in the list below,
+ * copies each run's logs under artifacts/, and aggregates them into a
+ * markdown summary with scripts/aggregate_evals.js.
+ *
+ * Usage: node scripts/run_local_evals.js [test-pattern] [--attempts N]
+ */
+
+import fs from 'node:fs';
+import path from 'node:path';
+import { execFileSync, execSync } from 'node:child_process';
+
+const models = [
+  'gemini-3-pro-preview',
+  'gemini-3-flash-preview',
+  'gemini-2.5-pro',
+  'gemini-2.5-flash',
+  'gemini-2.5-flash-lite',
+];
+
+const artifactsDir = path.resolve('artifacts');
+const logsDir = path.resolve('evals/logs');
+const summaryFile = 'local_evals_summary.md';
+
+// Parse arguments
+const args = process.argv.slice(2);
+let testPattern = '';
+let attempts = 1;
+
+for (let i = 0; i < args.length; i++) {
+  if (args[i] === '--attempts') {
+    const rawAttempts = args[i + 1];
+    attempts = Number(rawAttempts);
+    // Reject a missing or malformed count up front: NaN or a value below 1
+    // would make the attempt loop below run zero evals without any error.
+    if (!Number.isInteger(attempts) || attempts < 1) {
+      console.error(
+        `Error: --attempts expects a positive integer, got "${rawAttempts}".`,
+      );
+      process.exit(1);
+    }
+    i++;
+  } else if (!args[i].startsWith('-')) {
+    testPattern = args[i];
+  } else {
+    // Surface mistyped flags (e.g. `--attempt 3`) instead of dropping them
+    // silently and treating their value as the test pattern.
+    console.warn(`Warning: ignoring unknown option "${args[i]}".`);
+  }
+}
+
+// Ensure GEMINI_API_KEY is set
+if (!process.env.GEMINI_API_KEY) {
+  console.error('Error: GEMINI_API_KEY environment variable is not set.');
+  process.exit(1);
+}
+
+// Build the eval command once; it is identical for every model and attempt.
+let evalCmd = 'npm run test:all_evals';
+if (testPattern) {
+  // Patterns that look like a path select test files; anything else is passed
+  // to the test runner's -t option to filter by test name.
+  const isFilePattern =
+    testPattern.endsWith('.ts') ||
+    testPattern.endsWith('.js') ||
+    testPattern.includes('/');
+  evalCmd += isFilePattern
+    ? ` -- "${testPattern}"`
+    : ` -- -t "${testPattern}"`;
+}
+
+// Prepare artifacts directory
+if (fs.existsSync(artifactsDir)) {
+  console.log(`Cleaning artifacts directory: ${artifactsDir}`);
+  fs.rmSync(artifactsDir, { recursive: true, force: true });
+}
+fs.mkdirSync(artifactsDir);
+
+// Build project
+console.log('Building project...');
+try {
+  execSync('npm run build', { stdio: 'inherit' });
+} catch {
+  console.error('Build failed.');
+  process.exit(1);
+}
+
+console.log(`\nStarting evals with ${attempts} attempt(s) per model.`);
+
+for (const model of models) {
+  for (let attempt = 1; attempt <= attempts; attempt++) {
+    console.log('\n--------------------------------------------------');
+    console.log(`Running evals for ${model} (Attempt ${attempt}/${attempts})`);
+    console.log('--------------------------------------------------\n');
+
+    // Clean logs directory for this run (force tolerates a missing directory)
+    fs.rmSync(logsDir, { recursive: true, force: true });
+    fs.mkdirSync(logsDir, { recursive: true });
+
+    try {
+      // Run evals
+      execSync(evalCmd, {
+        stdio: 'inherit',
+        env: {
+          ...process.env,
+          GEMINI_MODEL: model,
+          RUN_EVALS: 'true',
+        },
+      });
+    } catch {
+      // A non-zero exit is reported and tolerated so that the remaining
+      // models and attempts still run and this run's logs are still collected.
+      console.log(
+        `\nEvals for ${model} (Attempt ${attempt}) finished with failures.`,
+      );
+    }
+
+    // Copy this run's logs to artifacts/eval-logs-<model>-<attempt>
+    const artifactName = `eval-logs-${model}-${attempt}`;
+    const artifactPath = path.join(artifactsDir, artifactName);
+
+    if (fs.existsSync(logsDir)) {
+      console.log(`Copying logs to ${artifactPath}`);
+      fs.cpSync(logsDir, artifactPath, { recursive: true });
+    } else {
+      console.error(`Warning: No logs found in ${logsDir}`);
+    }
+  }
+}
+
+console.log('\n--------------------------------------------------');
+console.log('Aggregating results...');
+console.log('--------------------------------------------------\n');
+
+try {
+  // Remove any summary left over from a previous run: the aggregator exits
+  // with status 0 without writing a file when it finds no reports, and a
+  // stale summary must not be previewed as this run's result.
+  fs.rmSync(summaryFile, { force: true });
+
+  // Invoke the aggregator without a shell so the paths need no quoting.
+  execFileSync(
+    process.execPath,
+    ['scripts/aggregate_evals.js', artifactsDir, summaryFile],
+    { stdio: 'inherit' },
+  );
+
+  if (fs.existsSync(summaryFile)) {
+    console.log('\nPreview:\n');
+    console.log(fs.readFileSync(summaryFile, 'utf-8'));
+  } else {
+    console.error(`No summary was generated at ${summaryFile}.`);
+    process.exitCode = 1;
+  }
+} catch (e) {
+  console.error('Aggregation failed:', e);
+  process.exitCode = 1;
+}