Files
gemini-cli/evals/app-test-helper.ts
2026-03-20 02:12:15 +00:00

117 lines
3.1 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { AppRig } from '../packages/cli/src/test-utils/AppRig.js';
import {
type EvalPolicy,
runEval,
prepareLogDir,
symlinkNodeModules,
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
import {
DEFAULT_GEMINI_MODEL,
type ScriptItem,
extractFakeResponses,
extractUserPrompts,
} from '@google/gemini-cli-core';
export interface AppEvalCase {
name: string;
configOverrides?: any;
prompt: string;
timeout?: number;
files?: Record<string, string>;
setup?: (rig: AppRig) => Promise<void>;
assert: (rig: AppRig, output: string) => Promise<void>;
/**
* Optional script to "prime the pump" before the main prompt.
* A sequential array interleaving MockUserTurn (e.g., userText('hello'))
* and FakeResponse (e.g., mockGenerateContentStreamText('hi')).
*/
script?: ScriptItem[];
}
/**
* A helper for running behavioral evaluations using the in-process AppRig.
* This matches the API of evalTest in test-helper.ts as closely as possible.
*/
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const fn = async () => {
const configOverrides = {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
};
let userPrompts: string[] = [];
if (evalCase.script) {
configOverrides.fakeModelConfig = {
responses: extractFakeResponses(evalCase.script),
hybridHandoff: true,
};
// Extract the sequence of user prompts for the Mock User driver
userPrompts = extractUserPrompts(evalCase.script);
}
const rig = new AppRig({
configOverrides,
});
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
const logFile = path.join(logDir, `${sanitizedName}.log`);
try {
await rig.initialize();
const testDir = rig.getTestDir();
symlinkNodeModules(testDir);
// Setup initial files
if (evalCase.files) {
for (const [filePath, content] of Object.entries(evalCase.files)) {
const fullPath = path.join(testDir, filePath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content);
}
}
// Run custom setup if provided (e.g. for breakpoints)
if (evalCase.setup) {
await evalCase.setup(rig);
}
// Render the app!
await rig.render();
// Wait for initial ready state
await rig.waitForIdle();
// Execute priming script if requested
if (userPrompts.length > 0) {
await rig.driveMockUser(userPrompts, evalCase.timeout);
}
// Send the initial prompt
await rig.sendMessage(evalCase.prompt);
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
const output = rig.getStaticOutput();
await evalCase.assert(rig, output);
} finally {
const output = rig.getStaticOutput();
if (output) {
await fs.promises.writeFile(logFile, output);
}
await rig.unmount();
}
};
runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
}