mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-19 00:02:51 -07:00
feat(evals): extract component-level eval infrastructure
This commit is contained in:
+47
-48
@@ -10,10 +10,13 @@ import {
|
||||
runEval,
|
||||
prepareLogDir,
|
||||
symlinkNodeModules,
|
||||
withEvalRetries,
|
||||
prepareWorkspace,
|
||||
BaseEvalCase,
|
||||
EVAL_MODEL,
|
||||
} from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
|
||||
|
||||
/**
|
||||
* Config overrides for evals, with tool-restriction fields explicitly
|
||||
@@ -32,12 +35,9 @@ interface EvalConfigOverrides {
|
||||
[key: string]: unknown;
|
||||
}
|
||||
|
||||
export interface AppEvalCase {
|
||||
name: string;
|
||||
export interface AppEvalCase extends BaseEvalCase {
|
||||
configOverrides?: EvalConfigOverrides;
|
||||
prompt: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
setup?: (rig: AppRig) => Promise<void>;
|
||||
assert: (rig: AppRig, output: string) => Promise<void>;
|
||||
}
|
||||
@@ -48,55 +48,54 @@ export interface AppEvalCase {
|
||||
*/
|
||||
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
const fn = async () => {
|
||||
const rig = new AppRig({
|
||||
configOverrides: {
|
||||
model: DEFAULT_GEMINI_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
},
|
||||
});
|
||||
await withEvalRetries(evalCase.name, async () => {
|
||||
const rig = new AppRig({
|
||||
configOverrides: {
|
||||
model: EVAL_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
},
|
||||
});
|
||||
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
const logFile = path.join(logDir, `${sanitizedName}.log`);
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
const logFile = path.join(logDir, `${sanitizedName}.log`);
|
||||
|
||||
try {
|
||||
await rig.initialize();
|
||||
try {
|
||||
await rig.initialize();
|
||||
|
||||
const testDir = rig.getTestDir();
|
||||
symlinkNodeModules(testDir);
|
||||
const testDir = rig.getTestDir();
|
||||
symlinkNodeModules(testDir);
|
||||
|
||||
// Setup initial files
|
||||
if (evalCase.files) {
|
||||
for (const [filePath, content] of Object.entries(evalCase.files)) {
|
||||
const fullPath = path.join(testDir, filePath);
|
||||
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
|
||||
fs.writeFileSync(fullPath, content);
|
||||
// Setup initial files
|
||||
if (evalCase.files) {
|
||||
// Note: AppRig does not use a separate homeDir, so we use testDir twice
|
||||
await prepareWorkspace(testDir, testDir, evalCase.files);
|
||||
}
|
||||
|
||||
// Run custom setup if provided (e.g. for breakpoints)
|
||||
if (evalCase.setup) {
|
||||
await evalCase.setup(rig);
|
||||
}
|
||||
|
||||
// Render the app!
|
||||
await rig.render();
|
||||
|
||||
// Wait for initial ready state
|
||||
await rig.waitForIdle();
|
||||
|
||||
// Send the initial prompt
|
||||
await rig.sendMessage(evalCase.prompt);
|
||||
|
||||
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
|
||||
const output = rig.getStaticOutput();
|
||||
await evalCase.assert(rig, output);
|
||||
} finally {
|
||||
const output = rig.getStaticOutput();
|
||||
if (output) {
|
||||
await fs.promises.writeFile(logFile, output);
|
||||
}
|
||||
await rig.unmount();
|
||||
}
|
||||
|
||||
// Run custom setup if provided (e.g. for breakpoints)
|
||||
if (evalCase.setup) {
|
||||
await evalCase.setup(rig);
|
||||
}
|
||||
|
||||
// Render the app!
|
||||
await rig.render();
|
||||
|
||||
// Wait for initial ready state
|
||||
await rig.waitForIdle();
|
||||
|
||||
// Send the initial prompt
|
||||
await rig.sendMessage(evalCase.prompt);
|
||||
|
||||
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
|
||||
const output = rig.getStaticOutput();
|
||||
await evalCase.assert(rig, output);
|
||||
} finally {
|
||||
const output = rig.getStaticOutput();
|
||||
if (output) {
|
||||
await fs.promises.writeFile(logFile, output);
|
||||
}
|
||||
await rig.unmount();
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
|
||||
|
||||
@@ -0,0 +1,133 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import {
|
||||
type EvalPolicy,
|
||||
runEval,
|
||||
prepareLogDir,
|
||||
withEvalRetries,
|
||||
prepareWorkspace,
|
||||
type BaseEvalCase,
|
||||
} from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import os from 'node:os';
|
||||
import { randomUUID } from 'node:crypto';
|
||||
import {
|
||||
Config,
|
||||
type ConfigParameters,
|
||||
AuthType,
|
||||
ApprovalMode,
|
||||
createPolicyEngineConfig,
|
||||
ExtensionLoader,
|
||||
IntegrityDataStatus,
|
||||
makeFakeConfig,
|
||||
} from '@google/gemini-cli-core';
|
||||
import { createMockSettings } from '../packages/cli/src/test-utils/settings.js';
|
||||
|
||||
// A minimal mock ExtensionManager to bypass integrity checks
|
||||
class MockExtensionManager extends ExtensionLoader {
|
||||
getExtensions = () => [];
|
||||
setRequestConsent = () => {};
|
||||
setRequestSetting = () => {};
|
||||
integrityManager = {
|
||||
verifyExtensionIntegrity: async () => IntegrityDataStatus.VERIFIED,
|
||||
storeExtensionIntegrity: async () => undefined,
|
||||
};
|
||||
}
|
||||
|
||||
export interface ComponentEvalCase extends BaseEvalCase {
|
||||
configOverrides?: Partial<ConfigParameters>;
|
||||
setup?: (config: Config) => Promise<void>;
|
||||
assert: (config: Config) => Promise<void>;
|
||||
}
|
||||
|
||||
export class ComponentRig {
|
||||
public config: Config | undefined;
|
||||
public testDir: string;
|
||||
public sessionId: string;
|
||||
|
||||
constructor(
|
||||
private options: { configOverrides?: Partial<ConfigParameters> } = {},
|
||||
) {
|
||||
const uniqueId = randomUUID();
|
||||
this.testDir = fs.mkdtempSync(
|
||||
path.join(os.tmpdir(), `gemini-component-rig-${uniqueId.slice(0, 8)}-`),
|
||||
);
|
||||
this.sessionId = `test-session-${uniqueId}`;
|
||||
}
|
||||
|
||||
async initialize() {
|
||||
const settings = createMockSettings();
|
||||
const policyEngineConfig = await createPolicyEngineConfig(
|
||||
settings.merged,
|
||||
ApprovalMode.DEFAULT,
|
||||
);
|
||||
|
||||
const configParams: ConfigParameters = {
|
||||
sessionId: this.sessionId,
|
||||
targetDir: this.testDir,
|
||||
cwd: this.testDir,
|
||||
debugMode: false,
|
||||
model: 'test-model',
|
||||
interactive: false,
|
||||
approvalMode: ApprovalMode.DEFAULT,
|
||||
policyEngineConfig,
|
||||
enableEventDrivenScheduler: false, // Don't need scheduler for direct component tests
|
||||
extensionLoader: new MockExtensionManager() as any,
|
||||
useAlternateBuffer: false,
|
||||
...this.options.configOverrides,
|
||||
};
|
||||
|
||||
this.config = makeFakeConfig(configParams);
|
||||
await this.config.initialize();
|
||||
|
||||
// Refresh auth using USE_GEMINI to initialize the real BaseLlmClient
|
||||
await this.config.refreshAuth(AuthType.USE_GEMINI);
|
||||
}
|
||||
|
||||
async cleanup() {
|
||||
fs.rmSync(this.testDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A helper for running behavioral evaluations directly against backend components.
|
||||
* It provides a fully initialized Config with real API access, bypassing the UI.
|
||||
*/
|
||||
export function componentEvalTest(
|
||||
policy: EvalPolicy,
|
||||
evalCase: ComponentEvalCase,
|
||||
) {
|
||||
const fn = async () => {
|
||||
await withEvalRetries(evalCase.name, async () => {
|
||||
const rig = new ComponentRig({
|
||||
configOverrides: evalCase.configOverrides,
|
||||
});
|
||||
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
const logFile = path.join(logDir, `${sanitizedName}-component.log`);
|
||||
|
||||
try {
|
||||
await rig.initialize();
|
||||
|
||||
if (evalCase.files) {
|
||||
await prepareWorkspace(rig.testDir, rig.testDir, evalCase.files);
|
||||
}
|
||||
|
||||
if (evalCase.setup) {
|
||||
await evalCase.setup(rig.config!);
|
||||
}
|
||||
|
||||
await evalCase.assert(rig.config!);
|
||||
} finally {
|
||||
await rig.cleanup();
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
|
||||
}
|
||||
+69
-42
@@ -16,10 +16,18 @@ import {
|
||||
Storage,
|
||||
getProjectHash,
|
||||
SESSION_FILE_PREFIX,
|
||||
PREVIEW_GEMINI_FLASH_MODEL,
|
||||
} from '@google/gemini-cli-core';
|
||||
|
||||
export * from '@google/gemini-cli-test-utils';
|
||||
|
||||
/**
|
||||
* The default model used for all evaluations.
|
||||
* Can be overridden by setting the GEMINI_MODEL environment variable.
|
||||
*/
|
||||
export const EVAL_MODEL =
|
||||
process.env.GEMINI_MODEL || PREVIEW_GEMINI_FLASH_MODEL;
|
||||
|
||||
// Indicates the consistency expectation for this test.
|
||||
// - ALWAYS_PASSES - Means that the test is expected to pass 100% of the time. These
|
||||
// These tests are typically trivial and test basic functionality with unambiguous
|
||||
@@ -47,11 +55,47 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
|
||||
);
|
||||
}
|
||||
|
||||
export async function internalEvalTest(evalCase: EvalCase) {
|
||||
export async function withEvalRetries(
|
||||
name: string,
|
||||
attemptFn: (attempt: number) => Promise<void>,
|
||||
) {
|
||||
const maxRetries = 3;
|
||||
let attempt = 0;
|
||||
|
||||
while (attempt <= maxRetries) {
|
||||
try {
|
||||
await attemptFn(attempt);
|
||||
return; // Success! Exit the retry loop.
|
||||
} catch (error: unknown) {
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
const errorCode = getApiErrorCode(errorMessage);
|
||||
|
||||
if (errorCode) {
|
||||
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
|
||||
logReliabilityEvent(name, attempt, status, errorCode, errorMessage);
|
||||
|
||||
if (attempt < maxRetries) {
|
||||
attempt++;
|
||||
console.warn(
|
||||
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
|
||||
);
|
||||
continue; // Retry
|
||||
}
|
||||
|
||||
console.warn(
|
||||
`[Eval] '${name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
|
||||
);
|
||||
return; // Gracefully exit without failing the test
|
||||
}
|
||||
|
||||
throw error; // Real failure
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export async function internalEvalTest(evalCase: EvalCase) {
|
||||
await withEvalRetries(evalCase.name, async () => {
|
||||
const rig = new TestRig();
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
const activityLogFile = path.join(logDir, `${sanitizedName}.jsonl`);
|
||||
@@ -59,14 +103,21 @@ export async function internalEvalTest(evalCase: EvalCase) {
|
||||
let isSuccess = false;
|
||||
|
||||
try {
|
||||
rig.setup(evalCase.name, evalCase.params);
|
||||
const setupOptions = {
|
||||
...evalCase.params,
|
||||
settings: {
|
||||
model: { name: EVAL_MODEL },
|
||||
...evalCase.params?.settings,
|
||||
},
|
||||
};
|
||||
rig.setup(evalCase.name, setupOptions);
|
||||
|
||||
if (evalCase.setup) {
|
||||
await evalCase.setup(rig);
|
||||
}
|
||||
|
||||
if (evalCase.files) {
|
||||
await setupTestFiles(rig, evalCase.files);
|
||||
await prepareWorkspace(rig.testDir!, rig.homeDir!, evalCase.files);
|
||||
}
|
||||
|
||||
symlinkNodeModules(rig.testDir || '');
|
||||
@@ -139,37 +190,6 @@ export async function internalEvalTest(evalCase: EvalCase) {
|
||||
|
||||
await evalCase.assert(rig, result);
|
||||
isSuccess = true;
|
||||
return; // Success! Exit the retry loop.
|
||||
} catch (error: unknown) {
|
||||
const errorMessage =
|
||||
error instanceof Error ? error.message : String(error);
|
||||
const errorCode = getApiErrorCode(errorMessage);
|
||||
|
||||
if (errorCode) {
|
||||
const status = attempt < maxRetries ? 'RETRY' : 'SKIP';
|
||||
logReliabilityEvent(
|
||||
evalCase.name,
|
||||
attempt,
|
||||
status,
|
||||
errorCode,
|
||||
errorMessage,
|
||||
);
|
||||
|
||||
if (attempt < maxRetries) {
|
||||
attempt++;
|
||||
console.warn(
|
||||
`[Eval] Attempt ${attempt} failed with ${errorCode} Error. Retrying...`,
|
||||
);
|
||||
continue; // Retry
|
||||
}
|
||||
|
||||
console.warn(
|
||||
`[Eval] '${evalCase.name}' failed after ${maxRetries} retries due to persistent API errors. Skipping failure to avoid blocking PR.`,
|
||||
);
|
||||
return; // Gracefully exit without failing the test
|
||||
}
|
||||
|
||||
throw error; // Real failure
|
||||
} finally {
|
||||
if (isSuccess) {
|
||||
await fs.promises.unlink(activityLogFile).catch((err) => {
|
||||
@@ -188,7 +208,7 @@ export async function internalEvalTest(evalCase: EvalCase) {
|
||||
);
|
||||
await rig.cleanup();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function getApiErrorCode(message: string): '500' | '503' | undefined {
|
||||
@@ -252,9 +272,13 @@ function logReliabilityEvent(
|
||||
* intentionally uses synchronous filesystem and child_process operations
|
||||
* for simplicity and to ensure sequential environment preparation.
|
||||
*/
|
||||
async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
|
||||
export async function prepareWorkspace(
|
||||
testDir: string,
|
||||
homeDir: string,
|
||||
files: Record<string, string>,
|
||||
) {
|
||||
const acknowledgedAgents: Record<string, Record<string, string>> = {};
|
||||
const projectRoot = fs.realpathSync(rig.testDir!);
|
||||
const projectRoot = fs.realpathSync(testDir);
|
||||
|
||||
for (const [filePath, content] of Object.entries(files)) {
|
||||
if (filePath.includes('..') || path.isAbsolute(filePath)) {
|
||||
@@ -290,7 +314,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
|
||||
|
||||
if (Object.keys(acknowledgedAgents).length > 0) {
|
||||
const ackPath = path.join(
|
||||
rig.homeDir!,
|
||||
homeDir,
|
||||
'.gemini',
|
||||
'acknowledgments',
|
||||
'agents.json',
|
||||
@@ -299,7 +323,7 @@ async function setupTestFiles(rig: TestRig, files: Record<string, string>) {
|
||||
fs.writeFileSync(ackPath, JSON.stringify(acknowledgedAgents, null, 2));
|
||||
}
|
||||
|
||||
const execOptions = { cwd: rig.testDir!, stdio: 'inherit' as const };
|
||||
const execOptions = { cwd: testDir, stdio: 'ignore' as const };
|
||||
execSync('git init --initial-branch=main', execOptions);
|
||||
execSync('git config user.email "test@example.com"', execOptions);
|
||||
execSync('git config user.name "Test User"', execOptions);
|
||||
@@ -366,15 +390,18 @@ interface ForbiddenToolSettings {
|
||||
};
|
||||
}
|
||||
|
||||
export interface EvalCase {
|
||||
export interface BaseEvalCase {
|
||||
name: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
}
|
||||
|
||||
export interface EvalCase extends BaseEvalCase {
|
||||
params?: {
|
||||
settings?: ForbiddenToolSettings & Record<string, unknown>;
|
||||
[key: string]: unknown;
|
||||
};
|
||||
prompt: string;
|
||||
timeout?: number;
|
||||
files?: Record<string, string>;
|
||||
setup?: (rig: TestRig) => Promise<void> | void;
|
||||
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
|
||||
messages?: Record<string, unknown>[];
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"extends": "../tsconfig.json",
|
||||
"compilerOptions": {
|
||||
"jsx": "react-jsx",
|
||||
"lib": ["DOM", "DOM.Iterable", "ES2023"],
|
||||
"types": ["node", "vitest/globals"]
|
||||
},
|
||||
"include": ["**/*.ts", "**/*.tsx"],
|
||||
"exclude": ["node_modules", "logs"]
|
||||
}
|
||||
@@ -15,7 +15,7 @@ export default defineConfig({
|
||||
conditions: ['test'],
|
||||
},
|
||||
test: {
|
||||
testTimeout: 300000, // 5 minutes
|
||||
testTimeout: 600000, // 5 minutes
|
||||
reporters: ['default', 'json'],
|
||||
outputFile: {
|
||||
json: 'evals/logs/report.json',
|
||||
@@ -24,7 +24,10 @@ export default defineConfig({
|
||||
environment: 'node',
|
||||
globals: true,
|
||||
alias: {
|
||||
react: path.resolve(__dirname, '../node_modules/react'),
|
||||
'@google/gemini-cli-core': path.resolve(
|
||||
__dirname,
|
||||
'../packages/core/index.ts',
|
||||
),
|
||||
},
|
||||
setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
|
||||
server: {
|
||||
|
||||
Reference in New Issue
Block a user