feat(cli, core): Support hybrid evals

This commit is contained in:
Your Name
2026-03-19 16:44:32 +00:00
parent 98d1bec99f
commit f178f9c020
20 changed files with 980 additions and 95 deletions

View File

@@ -13,7 +13,12 @@ import {
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
import {
DEFAULT_GEMINI_MODEL,
type ScriptItem,
extractFakeResponses,
extractUserPrompts,
} from '@google/gemini-cli-core';
export interface AppEvalCase {
name: string;
@@ -23,6 +28,12 @@ export interface AppEvalCase {
files?: Record<string, string>;
setup?: (rig: AppRig) => Promise<void>;
assert: (rig: AppRig, output: string) => Promise<void>;
/**
* Optional script to "prime the pump" before the main prompt.
* A sequential array interleaving MockUserTurn (e.g., userText('hello'))
* and FakeResponse (e.g., mockGenerateContentStreamText('hi')).
*/
script?: ScriptItem[];
}
/**
@@ -31,11 +42,25 @@ export interface AppEvalCase {
*/
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const fn = async () => {
const configOverrides = {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
};
let userPrompts: string[] = [];
if (evalCase.script) {
configOverrides.fakeModelConfig = {
responses: extractFakeResponses(evalCase.script),
hybridHandoff: true,
};
// Extract the sequence of user prompts for the Mock User driver
userPrompts = extractUserPrompts(evalCase.script);
}
const rig = new AppRig({
configOverrides: {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
},
configOverrides,
});
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
@@ -62,11 +87,16 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
}
// Render the app!
rig.render();
await rig.render();
// Wait for initial ready state
await rig.waitForIdle();
// Execute priming script if requested
if (userPrompts.length > 0) {
await rig.driveMockUser(userPrompts, evalCase.timeout);
}
// Send the initial prompt
await rig.sendMessage(evalCase.prompt);

View File

@@ -0,0 +1,38 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { appEvalTest } from './app-test-helper.js';
import {
userText,
mockGenerateContentStreamText,
} from '@google/gemini-cli-core';
describe('Hybrid Handoff (Mock User to Live Model)', () => {
  // Scripted priming conversation: mock user turns interleaved with canned
  // model responses, replayed to seed the history before the live handoff.
  const primingScript = [
    userText('Start priming'),
    mockGenerateContentStreamText(
      "Hello! I am a fake response. Let's prime the pump.",
    ),
    userText('Continue priming'),
    mockGenerateContentStreamText(
      'Pump primed successfully. Ready for handoff.',
    ),
  ];

  appEvalTest('ALWAYS_PASSES', {
    name: 'Mock User successfully primes AppRig using a scripted history and hands off to live model',
    timeout: 120000,
    script: primingScript,
    prompt: 'What is 5 * 5? Please answer with just the final number.',
    assert: async (rig) => {
      // Flush any pending breakpoints so the rendered output is final.
      await rig.drainBreakpointsUntilIdle(undefined, 30000);
      const staticOutput = rig.getStaticOutput();
      // The live model answering the arithmetic prompt proves the
      // mock-to-live handoff completed successfully.
      expect(staticOutput).toContain('25');
    },
  });
});