From e68234a573cfe565c0d7e1f1cd0a913dc4d90a32 Mon Sep 17 00:00:00 2001 From: Christian Gunderman Date: Wed, 8 Apr 2026 10:35:33 -0700 Subject: [PATCH] feat(evals): extract component-level eval infrastructure --- evals/app-test-helper.ts | 95 ++++++++++++----------- evals/component-test-helper.ts | 133 +++++++++++++++++++++++++++++++++ evals/test-helper.ts | 111 ++++++++++++++++----------- evals/tsconfig.json | 10 +++ evals/vitest.config.ts | 7 +- 5 files changed, 264 insertions(+), 92 deletions(-) create mode 100644 evals/component-test-helper.ts create mode 100644 evals/tsconfig.json diff --git a/evals/app-test-helper.ts b/evals/app-test-helper.ts index 8ea842aa38..ce54adaaf4 100644 --- a/evals/app-test-helper.ts +++ b/evals/app-test-helper.ts @@ -10,10 +10,13 @@ import { runEval, prepareLogDir, symlinkNodeModules, + withEvalRetries, + prepareWorkspace, + BaseEvalCase, + EVAL_MODEL, } from './test-helper.js'; import fs from 'node:fs'; import path from 'node:path'; -import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core'; /** * Config overrides for evals, with tool-restriction fields explicitly @@ -32,12 +35,9 @@ interface EvalConfigOverrides { [key: string]: unknown; } -export interface AppEvalCase { - name: string; +export interface AppEvalCase extends BaseEvalCase { configOverrides?: EvalConfigOverrides; prompt: string; - timeout?: number; - files?: Record; setup?: (rig: AppRig) => Promise; assert: (rig: AppRig, output: string) => Promise; } @@ -48,55 +48,54 @@ export interface AppEvalCase { */ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) { const fn = async () => { - const rig = new AppRig({ - configOverrides: { - model: DEFAULT_GEMINI_MODEL, - ...evalCase.configOverrides, - }, - }); + await withEvalRetries(evalCase.name, async () => { + const rig = new AppRig({ + configOverrides: { + model: EVAL_MODEL, + ...evalCase.configOverrides, + }, + }); - const { logDir, sanitizedName } = await prepareLogDir(evalCase.name); - const 
logFile = path.join(logDir, `${sanitizedName}.log`); + const { logDir, sanitizedName } = await prepareLogDir(evalCase.name); + const logFile = path.join(logDir, `${sanitizedName}.log`); - try { - await rig.initialize(); + try { + await rig.initialize(); - const testDir = rig.getTestDir(); - symlinkNodeModules(testDir); + const testDir = rig.getTestDir(); + symlinkNodeModules(testDir); - // Setup initial files - if (evalCase.files) { - for (const [filePath, content] of Object.entries(evalCase.files)) { - const fullPath = path.join(testDir, filePath); - fs.mkdirSync(path.dirname(fullPath), { recursive: true }); - fs.writeFileSync(fullPath, content); + // Setup initial files + if (evalCase.files) { + // Note: AppRig does not use a separate homeDir, so we use testDir twice + await prepareWorkspace(testDir, testDir, evalCase.files); } + + // Run custom setup if provided (e.g. for breakpoints) + if (evalCase.setup) { + await evalCase.setup(rig); + } + + // Render the app! + await rig.render(); + + // Wait for initial ready state + await rig.waitForIdle(); + + // Send the initial prompt + await rig.sendMessage(evalCase.prompt); + + // Run assertion. Interaction-heavy tests can do their own waiting/steering here. + const output = rig.getStaticOutput(); + await evalCase.assert(rig, output); + } finally { + const output = rig.getStaticOutput(); + if (output) { + await fs.promises.writeFile(logFile, output); + } + await rig.unmount(); } - - // Run custom setup if provided (e.g. for breakpoints) - if (evalCase.setup) { - await evalCase.setup(rig); - } - - // Render the app! - await rig.render(); - - // Wait for initial ready state - await rig.waitForIdle(); - - // Send the initial prompt - await rig.sendMessage(evalCase.prompt); - - // Run assertion. Interaction-heavy tests can do their own waiting/steering here. 
- const output = rig.getStaticOutput(); - await evalCase.assert(rig, output); - } finally { - const output = rig.getStaticOutput(); - if (output) { - await fs.promises.writeFile(logFile, output); - } - await rig.unmount(); - } + }); }; runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000); diff --git a/evals/component-test-helper.ts b/evals/component-test-helper.ts new file mode 100644 index 0000000000..9e5c9aa567 --- /dev/null +++ b/evals/component-test-helper.ts @@ -0,0 +1,133 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { + type EvalPolicy, + runEval, + prepareLogDir, + withEvalRetries, + prepareWorkspace, + type BaseEvalCase, +} from './test-helper.js'; +import fs from 'node:fs'; +import path from 'node:path'; +import os from 'node:os'; +import { randomUUID } from 'node:crypto'; +import { + Config, + type ConfigParameters, + AuthType, + ApprovalMode, + createPolicyEngineConfig, + ExtensionLoader, + IntegrityDataStatus, + makeFakeConfig, +} from '@google/gemini-cli-core'; +import { createMockSettings } from '../packages/cli/src/test-utils/settings.js'; + +// A minimal mock ExtensionManager to bypass integrity checks +class MockExtensionManager extends ExtensionLoader { + getExtensions = () => []; + setRequestConsent = () => {}; + setRequestSetting = () => {}; + integrityManager = { + verifyExtensionIntegrity: async () => IntegrityDataStatus.VERIFIED, + storeExtensionIntegrity: async () => undefined, + }; +} + +export interface ComponentEvalCase extends BaseEvalCase { + configOverrides?: Partial; + setup?: (config: Config) => Promise; + assert: (config: Config) => Promise; +} + +export class ComponentRig { + public config: Config | undefined; + public testDir: string; + public sessionId: string; + + constructor( + private options: { configOverrides?: Partial } = {}, + ) { + const uniqueId = randomUUID(); + this.testDir = fs.mkdtempSync( + path.join(os.tmpdir(), 
`gemini-component-rig-${uniqueId.slice(0, 8)}-`), + ); + this.sessionId = `test-session-${uniqueId}`; + } + + async initialize() { + const settings = createMockSettings(); + const policyEngineConfig = await createPolicyEngineConfig( + settings.merged, + ApprovalMode.DEFAULT, + ); + + const configParams: ConfigParameters = { + sessionId: this.sessionId, + targetDir: this.testDir, + cwd: this.testDir, + debugMode: false, + model: 'test-model', + interactive: false, + approvalMode: ApprovalMode.DEFAULT, + policyEngineConfig, + enableEventDrivenScheduler: false, // Don't need scheduler for direct component tests + extensionLoader: new MockExtensionManager() as any, + useAlternateBuffer: false, + ...this.options.configOverrides, + }; + + this.config = makeFakeConfig(configParams); + await this.config.initialize(); + + // Refresh auth using USE_GEMINI to initialize the real BaseLlmClient + await this.config.refreshAuth(AuthType.USE_GEMINI); + } + + async cleanup() { + fs.rmSync(this.testDir, { recursive: true, force: true }); + } +} + +/** + * A helper for running behavioral evaluations directly against backend components. + * It provides a fully initialized Config with real API access, bypassing the UI. + */ +export function componentEvalTest( + policy: EvalPolicy, + evalCase: ComponentEvalCase, +) { + const fn = async () => { + await withEvalRetries(evalCase.name, async () => { + const rig = new ComponentRig({ + configOverrides: evalCase.configOverrides, + }); + + const { logDir, sanitizedName } = await prepareLogDir(evalCase.name); + const logFile = path.join(logDir, `${sanitizedName}-component.log`); + + try { + await rig.initialize(); + + if (evalCase.files) { + await prepareWorkspace(rig.testDir, rig.testDir, evalCase.files); + } + + if (evalCase.setup) { + await evalCase.setup(rig.config!); + } + + await evalCase.assert(rig.config!); + } finally { + await rig.cleanup(); + } + }); + }; + + runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 
60000) + 10000); +} diff --git a/evals/test-helper.ts b/evals/test-helper.ts index 2bf9188eee..0a7c5c4dd7 100644 --- a/evals/test-helper.ts +++ b/evals/test-helper.ts @@ -16,10 +16,18 @@ import { Storage, getProjectHash, SESSION_FILE_PREFIX, + PREVIEW_GEMINI_FLASH_MODEL, } from '@google/gemini-cli-core'; export * from '@google/gemini-cli-test-utils'; +/** + * The default model used for all evaluations. + * Can be overridden by setting the GEMINI_MODEL environment variable. + */ +export const EVAL_MODEL = + process.env.GEMINI_MODEL || PREVIEW_GEMINI_FLASH_MODEL; + // Indicates the consistency expectation for this test. // - ALWAYS_PASSES - Means that the test is expected to pass 100% of the time. These // These tests are typically trivial and test basic functionality with unambiguous @@ -47,11 +55,47 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) { ); } -export async function internalEvalTest(evalCase: EvalCase) { +export async function withEvalRetries( + name: string, + attemptFn: (attempt: number) => Promise, +) { const maxRetries = 3; let attempt = 0; while (attempt <= maxRetries) { + try { + await attemptFn(attempt); + return; // Success! Exit the retry loop. + } catch (error: unknown) { + const errorMessage = + error instanceof Error ? error.message : String(error); + const errorCode = getApiErrorCode(errorMessage); + + if (errorCode) { + const status = attempt < maxRetries ? 'RETRY' : 'SKIP'; + logReliabilityEvent(name, attempt, status, errorCode, errorMessage); + + if (attempt < maxRetries) { + attempt++; + console.warn( + `[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`, + ); + continue; // Retry + } + + console.warn( + `[Eval] '${name}' failed after ${maxRetries} retries due to persistent API errors. 
Skipping failure to avoid blocking PR.`, + ); + return; // Gracefully exit without failing the test + } + + throw error; // Real failure + } + } +} + +export async function internalEvalTest(evalCase: EvalCase) { + await withEvalRetries(evalCase.name, async () => { const rig = new TestRig(); const { logDir, sanitizedName } = await prepareLogDir(evalCase.name); const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`); @@ -59,14 +103,21 @@ export async function internalEvalTest(evalCase: EvalCase) { let isSuccess = false; try { - rig.setup(evalCase.name, evalCase.params); + const setupOptions = { + ...evalCase.params, + settings: { + model: { name: EVAL_MODEL }, + ...evalCase.params?.settings, + }, + }; + rig.setup(evalCase.name, setupOptions); if (evalCase.setup) { await evalCase.setup(rig); } if (evalCase.files) { - await setupTestFiles(rig, evalCase.files); + await prepareWorkspace(rig.testDir!, rig.homeDir!, evalCase.files); } symlinkNodeModules(rig.testDir || ''); @@ -139,37 +190,6 @@ export async function internalEvalTest(evalCase: EvalCase) { await evalCase.assert(rig, result); isSuccess = true; - return; // Success! Exit the retry loop. - } catch (error: unknown) { - const errorMessage = - error instanceof Error ? error.message : String(error); - const errorCode = getApiErrorCode(errorMessage); - - if (errorCode) { - const status = attempt < maxRetries ? 'RETRY' : 'SKIP'; - logReliabilityEvent( - evalCase.name, - attempt, - status, - errorCode, - errorMessage, - ); - - if (attempt < maxRetries) { - attempt++; - console.warn( - `[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`, - ); - continue; // Retry - } - - console.warn( - `[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. 
Skipping failure to avoid blocking PR.`, - ); - return; // Gracefully exit without failing the test - } - - throw error; // Real failure } finally { if (isSuccess) { await fs.promises.unlink(activityLogFile).catch((err) => { @@ -188,7 +208,7 @@ export async function internalEvalTest(evalCase: EvalCase) { ); await rig.cleanup(); } - } + }); } function getApiErrorCode(message: string): '500' | '503' | undefined { @@ -252,9 +272,13 @@ function logReliabilityEvent( * intentionally uses synchronous filesystem and child_process operations * for simplicity and to ensure sequential environment preparation. */ -async function setupTestFiles(rig: TestRig, files: Record) { +export async function prepareWorkspace( + testDir: string, + homeDir: string, + files: Record, +) { const acknowledgedAgents: Record> = {}; - const projectRoot = fs.realpathSync(rig.testDir!); + const projectRoot = fs.realpathSync(testDir); for (const [filePath, content] of Object.entries(files)) { if (filePath.includes('..') || path.isAbsolute(filePath)) { @@ -290,7 +314,7 @@ async function setupTestFiles(rig: TestRig, files: Record) { if (Object.keys(acknowledgedAgents).length > 0) { const ackPath = path.join( - rig.homeDir!, + homeDir, '.gemini', 'acknowledgments', 'agents.json', @@ -299,7 +323,7 @@ async function setupTestFiles(rig: TestRig, files: Record) { fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2)); } - const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const }; + const execOptions = { cwd: testDir, stdio: 'ignore' as const }; execSync('git init --initial-branch=main', execOptions); execSync('git config user.email "test@example.com"', execOptions); execSync('git config user.name "Test User"', execOptions); @@ -366,15 +390,18 @@ interface ForbiddenToolSettings { }; } -export interface EvalCase { +export interface BaseEvalCase { name: string; + timeout?: number; + files?: Record; +} + +export interface EvalCase extends BaseEvalCase { params?: { settings?: 
ForbiddenToolSettings & Record; [key: string]: unknown; }; prompt: string; - timeout?: number; - files?: Record; setup?: (rig: TestRig) => Promise | void; /** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */ messages?: Record[]; diff --git a/evals/tsconfig.json b/evals/tsconfig.json new file mode 100644 index 0000000000..edc9007206 --- /dev/null +++ b/evals/tsconfig.json @@ -0,0 +1,10 @@ +{ + "extends": "../tsconfig.json", + "compilerOptions": { + "jsx": "react-jsx", + "lib": ["DOM", "DOM.Iterable", "ES2023"], + "types": ["node", "vitest/globals"] + }, + "include": ["**/*.ts", "**/*.tsx"], + "exclude": ["node_modules", "logs"] +} diff --git a/evals/vitest.config.ts b/evals/vitest.config.ts index 50733a999c..ee397c17b5 100644 --- a/evals/vitest.config.ts +++ b/evals/vitest.config.ts @@ -15,7 +15,7 @@ export default defineConfig({ conditions: ['test'], }, test: { - testTimeout: 300000, // 5 minutes + testTimeout: 600000, // 10 minutes reporters: ['default', 'json'], outputFile: { json: 'evals/logs/report.json', @@ -24,7 +24,10 @@ export default defineConfig({ environment: 'node', globals: true, alias: { - react: path.resolve(__dirname, '../node_modules/react'), + '@google/gemini-cli-core': path.resolve( + __dirname, + '../packages/core/index.ts', + ), }, setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')], server: {