Files
gemini-cli/evals/app-test-helper.ts
2026-02-13 04:10:24 +00:00

77 lines
2.1 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { it } from 'vitest';
import { AppRig } from '../packages/cli/src/test-utils/AppRig.js';
import type { EvalPolicy } from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
export interface AppEvalCase {
name: string;
configOverrides?: any;
prompt: string;
timeout?: number;
files?: Record<string, string>;
setup?: (rig: AppRig) => Promise<void>;
assert: (rig: AppRig, output: string) => Promise<void>;
}
/**
* A helper for running behavioral evaluations using the in-process AppRig.
* This matches the API of evalTest in test-helper.ts as closely as possible.
*/
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const fn = async () => {
const rig = new AppRig({
configOverrides: {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
},
});
try {
await rig.initialize();
// Setup initial files
if (evalCase.files) {
const testDir = rig.getTestDir();
for (const [filePath, content] of Object.entries(evalCase.files)) {
const fullPath = path.join(testDir, filePath);
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content);
}
}
// Run custom setup if provided (e.g. for breakpoints)
if (evalCase.setup) {
await evalCase.setup(rig);
}
// Render the app!
rig.render();
// Wait for initial ready state
await rig.waitForIdle();
// Send the initial prompt
await rig.sendMessage(evalCase.prompt);
// Run assertion. Interaction-heavy tests can do their own waiting/steering here.
await evalCase.assert(rig, rig.getStaticOutput());
} finally {
await rig.unmount();
}
};
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
it.skip(evalCase.name, fn);
} else {
it(evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
}
}