/**
 * CLI entry point: collects the eval inventory for a repository root and
 * prints the formatted report to stdout.
 *
 * The root is taken from the argument following `--root` when present,
 * otherwise from the current working directory. When no eval files are
 * found, an error is written to stderr and the process exits with code 1.
 */
async function main(): Promise<void> {
  const rootFlagIndex = process.argv.indexOf('--root');
  const rootArg =
    rootFlagIndex === -1 ? undefined : process.argv[rootFlagIndex + 1];

  // `--root` with nothing after it still falls back to the current
  // directory, but say so: otherwise the report silently describes a
  // different tree than the one the user meant to inspect.
  if (rootFlagIndex !== -1 && !rootArg) {
    console.error(
      'Warning: --root was given without a path; using the current directory.',
    );
  }

  // Truthiness (not `??`) on purpose: an empty-string argument is treated
  // the same as a missing one.
  const repoRoot = rootArg || process.cwd();

  const result = await collectInventory(repoRoot);

  if (result.totalFiles === 0) {
    console.error('No eval files found under evals/.');
    process.exit(1);
  }

  console.log(formatInventoryReport(result));
}

// Any rejection from main() is reported and turned into a non-zero exit.
main().catch((error: unknown) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
counts for a directory with no eval files', async () => { + const result = await collectInventory(import.meta.dirname); + + expect(result.totalFiles).toBe(0); + expect(result.totalCases).toBe(0); + expect(result.files).toEqual([]); + expect(result.cases).toEqual([]); + }); + }); + + describe('formatInventoryReport', () => { + it('includes summary line with correct counts', () => { + const result: InventoryResult = { + totalFiles: 2, + totalCases: 3, + files: [], + cases: [ + makeCaseRecord({ policy: 'ALWAYS_PASSES', name: 'case-1' }), + makeCaseRecord({ policy: 'USUALLY_PASSES', name: 'case-2' }), + makeCaseRecord({ policy: 'USUALLY_PASSES', name: 'case-3' }), + ], + diagnostics: [], + }; + + const report = formatInventoryReport(result); + + expect(report).toContain('2 files · 3 cases · 0 diagnostics'); + }); + + it('groups cases by policy', () => { + const result: InventoryResult = { + totalFiles: 1, + totalCases: 2, + files: [], + cases: [ + makeCaseRecord({ + policy: 'ALWAYS_PASSES', + name: 'stable test', + }), + makeCaseRecord({ + policy: 'USUALLY_PASSES', + name: 'flaky test', + }), + ], + diagnostics: [], + }; + + const report = formatInventoryReport(result); + + expect(report).toContain('By Policy'); + expect(report).toContain('ALWAYS_PASSES (1 cases)'); + expect(report).toContain('USUALLY_PASSES (1 cases)'); + expect(report).toContain('• stable test'); + expect(report).toContain('• flaky test'); + }); + + it('groups cases by suite name', () => { + const result: InventoryResult = { + totalFiles: 1, + totalCases: 2, + files: [], + cases: [ + makeCaseRecord({ suiteName: 'default', name: 'suite-test' }), + makeCaseRecord({ name: 'no-suite-test' }), + ], + diagnostics: [], + }; + + const report = formatInventoryReport(result); + + expect(report).toContain('By Suite'); + expect(report).toContain('default (1 cases)'); + expect(report).toContain('(no suite) (1 cases)'); + }); + + it('shows diagnostics section when diagnostics exist', () => { + const result: 
// Partially mock `fs`: every real export is kept, and only `appendFileSync`
// is replaced with a mock function. The real module is awaited before it is
// spread so the mock is built from its exports rather than from a pending
// promise.
vi.mock('fs', async () => ({
  ...(await vi.importActual<typeof import('fs')>('fs')),
  appendFileSync: vi.fn(),
}));
/**
 * Discovers all eval files (`*.eval.ts` / `*.eval.tsx`) under
 * `<repoRoot>/evals` and runs the static analyzer on each, returning the
 * aggregated results.
 *
 * Files are processed in sorted path order, so the order of `files`,
 * `cases` and `diagnostics` is stable between runs.
 *
 * @param repoRoot Repository root that contains the `evals` directory.
 * @returns Per-file analyses plus the flattened cases and diagnostics,
 *   with `totalFiles` / `totalCases` mirroring the array lengths.
 */
export async function collectInventory(
  repoRoot: string,
): Promise<InventoryResult> {
  const evalsDir = path.join(repoRoot, 'evals');
  const pattern = '**/*.eval.{ts,tsx}';

  const evalFiles = await glob(pattern, {
    cwd: evalsDir,
    absolute: true,
    nodir: true,
  });

  evalFiles.sort();

  // The reads are independent of each other, so issue them together.
  // Promise.all keeps the results in input order, which preserves the
  // sorted file order established above.
  const files: EvalFileAnalysis[] = await Promise.all(
    evalFiles.map(async (filePath) => {
      const sourceText = await fs.promises.readFile(filePath, 'utf-8');
      return analyzeEvalSource(sourceText, { filePath, repoRoot });
    }),
  );

  const allCases: EvalCaseRecord[] = files.flatMap(
    (analysis) => analysis.cases,
  );
  const allDiagnostics: EvalAnalysisDiagnostic[] = files.flatMap(
    (analysis) => analysis.diagnostics,
  );

  return {
    totalFiles: files.length,
    totalCases: allCases.length,
    files,
    cases: allCases,
    diagnostics: allDiagnostics,
  };
}
/**
 * Formats an InventoryResult into a human-readable report string.
 *
 * The report contains, in order:
 *  - a title and a one-line summary of file, case and diagnostic counts;
 *  - "By Policy": cases grouped by policy, then by relative file path.
 *    Known policies come first in a fixed order; any policy value that is
 *    not in that list is appended afterwards (sorted) so that every case
 *    counted in the summary also appears in this section;
 *  - "By Suite": cases grouped by suite name, sorted alphabetically, with
 *    cases that have no suite listed last under "(no suite)";
 *  - "Diagnostics": only emitted when at least one diagnostic exists.
 *
 * @param result Aggregated inventory to render.
 * @returns The report as a single newline-joined string.
 */
export function formatInventoryReport(result: InventoryResult): string {
  const lines: string[] = [];

  lines.push('Eval Inventory');
  lines.push('══════════════');
  lines.push('');
  lines.push(
    `${result.totalFiles} files · ${result.totalCases} cases · ${result.diagnostics.length} diagnostics`,
  );
  lines.push('');

  // --- By Policy ---
  lines.push('By Policy');
  lines.push('─────────');

  const byPolicy = groupBy(result.cases, (c) => c.policy);
  const policyOrder: EvalPolicy[] = [
    'ALWAYS_PASSES',
    'USUALLY_PASSES',
    'USUALLY_FAILS',
    'unknown',
  ];

  // Policies present in the data but missing from the fixed order above
  // would otherwise vanish from this section while still being counted in
  // the summary line. List them after the known ones.
  const knownPolicies = new Set<string>(policyOrder);
  const unlistedPolicies = [...byPolicy.keys()]
    .filter((policy) => !knownPolicies.has(policy))
    .sort();

  for (const policy of [...policyOrder, ...unlistedPolicies]) {
    const cases = byPolicy.get(policy);
    if (!cases || cases.length === 0) {
      continue;
    }

    lines.push(`${policy} (${cases.length} cases)`);

    const byFile = groupBy(cases, (c) => c.relativePath);
    for (const [filePath, fileCases] of byFile) {
      lines.push(` ${filePath}`);
      for (const evalCase of fileCases) {
        lines.push(` • ${evalCase.name} [${evalCase.helperName}]`);
      }
    }
    lines.push('');
  }

  // --- By Suite ---
  lines.push('By Suite');
  lines.push('────────');

  const bySuite = groupBy(result.cases, (c) => c.suiteName ?? '(no suite)');
  // Sort the entries (not just the keys) so no lookup/non-null assertion is
  // needed afterwards. The "(no suite)" bucket always sorts last.
  const suiteEntries = [...bySuite.entries()].sort(([a], [b]) => {
    if (a === b) return 0;
    if (a === '(no suite)') return 1;
    if (b === '(no suite)') return -1;
    return a.localeCompare(b, 'en');
  });

  for (const [suite, cases] of suiteEntries) {
    lines.push(`${suite} (${cases.length} cases)`);

    for (const evalCase of cases) {
      lines.push(
        ` • ${evalCase.name} [${evalCase.relativePath}] (${evalCase.policy})`,
      );
    }
    lines.push('');
  }

  // --- Diagnostics ---
  if (result.diagnostics.length > 0) {
    // Map absolute file paths to their repo-relative form for display.
    const filePaths = new Map<string, string>();
    for (const f of result.files) {
      filePaths.set(f.filePath, f.relativePath);
    }

    lines.push('Diagnostics');
    lines.push('───────────');
    for (const diagnostic of result.diagnostics) {
      // Fall back to the diagnostic's own path when the file is not part
      // of `result.files`.
      const displayPath =
        diagnostic.filePath === ''
          ? diagnostic.filePath
          : (filePaths.get(diagnostic.filePath) ?? diagnostic.filePath);
      lines.push(
        `⚠ ${displayPath}:${diagnostic.location.line}:${diagnostic.location.column} — ${diagnostic.message}`,
      );
    }
    lines.push('');
  }

  return lines.join('\n');
}

/**
 * Groups items by a string key, preserving first-seen key order and the
 * original item order within each group.
 *
 * @param items Items to group.
 * @param keyFn Computes the group key for an item.
 * @returns A Map from key to the items that produced it.
 */
function groupBy<T>(
  items: readonly T[],
  keyFn: (item: T) => string,
): Map<string, T[]> {
  const groups = new Map<string, T[]>();
  for (const item of items) {
    const key = keyFn(item);
    const group = groups.get(key);
    if (group) {
      group.push(item);
    } else {
      groups.set(key, [item]);
    }
  }
  return groups;
}