feat(core): experimental in-progress steering hints (1 of 3) (#19008)

This commit is contained in:
joshualitt
2026-02-17 14:59:33 -08:00
committed by GitHub
parent 5e2f5df62c
commit 55c628e967
20 changed files with 1381 additions and 60 deletions

86
evals/app-test-helper.ts Normal file
View File

@@ -0,0 +1,86 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { AppRig } from '../packages/cli/src/test-utils/AppRig.js';
import {
type EvalPolicy,
runEval,
prepareLogDir,
symlinkNodeModules,
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
/**
 * Describes a single behavioral eval that appEvalTest runs against an
 * in-process AppRig.
 */
export interface AppEvalCase {
/** Test name; also used to derive the sanitized log file name. */
name: string;
/**
 * Extra config passed to the AppRig constructor. Spread after the default
 * `model` entry, so a `model` key here overrides DEFAULT_GEMINI_MODEL.
 */
configOverrides?: any;
/** Message sent to the app once it has rendered and reported idle. */
prompt: string;
/**
 * Base timeout for the case. appEvalTest registers the test with this value
 * (60000 when omitted) plus 10000 — presumably milliseconds; confirm against
 * the test runner.
 */
timeout?: number;
/**
 * Files to create before the app is rendered. Keys are paths joined onto the
 * rig's test directory; values are the file contents.
 */
files?: Record<string, string>;
/** Optional hook awaited after the files are written and before rendering. */
setup?: (rig: AppRig) => Promise<void>;
/**
 * Assertion callback. Receives the rig and the static output captured right
 * after the prompt was sent; it may perform further waiting on the rig itself.
 */
assert: (rig: AppRig, output: string) => Promise<void>;
}
/**
 * A helper for running behavioral evaluations using the in-process AppRig.
 * This matches the API of evalTest in test-helper.ts as closely as possible.
 *
 * The eval is registered through runEval under `evalCase.name`. When it runs,
 * it builds a rig, seeds the test directory with `evalCase.files`, runs the
 * optional `setup` hook, renders the app, sends `evalCase.prompt`, and finally
 * hands the rig plus its static output to `evalCase.assert`.
 *
 * @param policy Eval policy forwarded unchanged to runEval.
 * @param evalCase Description of the eval to run.
 */
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
  const fn = async () => {
    const rig = new AppRig({
      configOverrides: {
        // Default model first so the case's overrides can replace it.
        model: DEFAULT_GEMINI_MODEL,
        ...evalCase.configOverrides,
      },
    });

    const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
    const logFile = path.join(logDir, `${sanitizedName}.log`);

    try {
      await rig.initialize();

      const testDir = rig.getTestDir();
      symlinkNodeModules(testDir);

      // Setup initial files
      if (evalCase.files) {
        for (const [filePath, content] of Object.entries(evalCase.files)) {
          const fullPath = path.join(testDir, filePath);
          fs.mkdirSync(path.dirname(fullPath), { recursive: true });
          fs.writeFileSync(fullPath, content);
        }
      }

      // Run custom setup if provided (e.g. for breakpoints)
      if (evalCase.setup) {
        await evalCase.setup(rig);
      }

      // Render the app!
      rig.render();

      // Wait for initial ready state
      await rig.waitForIdle();

      // Send the initial prompt
      await rig.sendMessage(evalCase.prompt);

      // Run assertion. Interaction-heavy tests can do their own waiting/steering here.
      const output = rig.getStaticOutput();
      await evalCase.assert(rig, output);
    } finally {
      // Persist whatever output exists, but never let a failure while reading
      // or writing the log prevent the rig from being unmounted.
      try {
        const output = rig.getStaticOutput();
        if (output) {
          await fs.promises.writeFile(logFile, output);
        }
      } finally {
        await rig.unmount();
      }
    }
  };

  // The registered timeout is the case's timeout (default 60000) plus 10000.
  runEval(policy, evalCase.name, fn, (evalCase.timeout ?? 60000) + 10000);
}

View File

@@ -47,11 +47,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
// Symlink node modules to reduce the amount of time needed to
// bootstrap test projects.
const rootNodeModules = path.join(process.cwd(), 'node_modules');
const testNodeModules = path.join(rig.testDir || '', 'node_modules');
if (fs.existsSync(rootNodeModules) && !fs.existsSync(testNodeModules)) {
fs.symlinkSync(rootNodeModules, testNodeModules, 'dir');
}
symlinkNodeModules(rig.testDir || '');
if (evalCase.files) {
const acknowledgedAgents: Record<string, Record<string, string>> = {};
@@ -159,20 +155,47 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
}
};
runEval(policy, evalCase.name, fn, evalCase.timeout);
}
/**
 * Wraps a test function with the appropriate Vitest 'it' or 'it.skip' based on policy.
 *
 * A 'USUALLY_PASSES' eval is registered as skipped unless the RUN_EVALS
 * environment variable is set to a non-empty value; every other combination
 * is registered as a normal test.
 *
 * @param policy Policy that decides whether the eval may be skipped.
 * @param name Test name passed to Vitest.
 * @param fn Async test body.
 * @param timeout Optional timeout forwarded to `it` (not used when skipping).
 */
export function runEval(
  policy: EvalPolicy,
  name: string,
  fn: () => Promise<void>,
  timeout?: number,
) {
  // Only this function's own parameters are in scope here; the eval case
  // object of the calling helper is not.
  if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
    it.skip(name, fn);
  } else {
    it(name, fn, timeout);
  }
}
/**
 * Ensures the eval log directory exists and derives a file-system-friendly
 * name for the given eval.
 *
 * The directory is `evals/logs` resolved against the current working
 * directory and is created recursively if missing. The sanitized name is the
 * input with every character outside `a-z`, `A-Z`, `0-9` replaced by `_`,
 * then lower-cased.
 *
 * @param name Human-readable eval name.
 * @returns The absolute log directory and the sanitized name.
 */
export async function prepareLogDir(
  name: string,
): Promise<{ logDir: string; sanitizedName: string }> {
  const logDir = path.resolve(process.cwd(), 'evals/logs');
  await fs.promises.mkdir(logDir, { recursive: true });
  const sanitizedName = name.replace(/[^a-z0-9]/gi, '_').toLowerCase();
  return { logDir, sanitizedName };
}
/**
 * Links the working directory's node_modules into a test directory so tests
 * that run tools do not have to bootstrap their own dependencies.
 *
 * Nothing happens when `testDir` is empty, when the working directory has no
 * node_modules, or when the test directory already has a node_modules entry.
 *
 * @param testDir Directory that should receive the `node_modules` symlink.
 */
export function symlinkNodeModules(testDir: string) {
  if (!testDir) {
    return;
  }

  const linkTarget = path.join(process.cwd(), 'node_modules');
  if (!fs.existsSync(linkTarget)) {
    return;
  }

  const linkPath = path.join(testDir, 'node_modules');
  if (fs.existsSync(linkPath)) {
    return;
  }

  fs.symlinkSync(linkTarget, linkPath, 'dir');
}
export interface EvalCase {
name: string;
params?: Record<string, any>;

View File

@@ -5,8 +5,15 @@
*/
import { defineConfig } from 'vitest/config';
import { fileURLToPath } from 'node:url';
import * as path from 'node:path';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
export default defineConfig({
resolve: {
conditions: ['test'],
},
test: {
testTimeout: 300000, // 5 minutes
reporters: ['default', 'json'],
@@ -14,5 +21,16 @@ export default defineConfig({
json: 'evals/logs/report.json',
},
include: ['**/*.eval.ts'],
environment: 'node',
globals: true,
alias: {
react: path.resolve(__dirname, '../node_modules/react'),
},
setupFiles: [path.resolve(__dirname, '../packages/cli/test-setup.ts')],
server: {
deps: {
inline: [/@google\/gemini-cli-core/],
},
},
},
});