mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-10 13:22:51 -07:00
feat(cli, core): Support hybrid evals.
This commit is contained in:
@@ -13,7 +13,12 @@ import {
|
||||
} from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
|
||||
import {
|
||||
DEFAULT_GEMINI_MODEL,
|
||||
type ScriptItem,
|
||||
extractFakeResponses,
|
||||
extractUserPrompts,
|
||||
} from '@google/gemini-cli-core';
|
||||
|
||||
export interface AppEvalCase {
|
||||
name: string;
|
||||
@@ -23,6 +28,12 @@ export interface AppEvalCase {
|
||||
files?: Record<string, string>;
|
||||
setup?: (rig: AppRig) => Promise<void>;
|
||||
assert: (rig: AppRig, output: string) => Promise<void>;
|
||||
/**
|
||||
* Optional script to "prime the pump" before the main prompt.
|
||||
* A sequential array interleaving MockUserTurn (e.g., userText('hello'))
|
||||
* and FakeResponse (e.g., mockGenerateContentStreamText('hi')).
|
||||
*/
|
||||
script?: ScriptItem[];
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -31,11 +42,25 @@ export interface AppEvalCase {
|
||||
*/
|
||||
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
const fn = async () => {
|
||||
const configOverrides = {
|
||||
model: DEFAULT_GEMINI_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
};
|
||||
|
||||
let userPrompts: string[] = [];
|
||||
|
||||
if (evalCase.script) {
|
||||
configOverrides.fakeModelConfig = {
|
||||
responses: extractFakeResponses(evalCase.script),
|
||||
hybridHandoff: true,
|
||||
};
|
||||
|
||||
// Extract the sequence of user prompts for the Mock User driver
|
||||
userPrompts = extractUserPrompts(evalCase.script);
|
||||
}
|
||||
|
||||
const rig = new AppRig({
|
||||
configOverrides: {
|
||||
model: DEFAULT_GEMINI_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
},
|
||||
configOverrides,
|
||||
});
|
||||
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
@@ -62,11 +87,16 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
}
|
||||
|
||||
// Render the app!
|
||||
rig.render();
|
||||
await rig.render();
|
||||
|
||||
// Wait for initial ready state
|
||||
await rig.waitForIdle();
|
||||
|
||||
// Execute priming script if requested
|
||||
if (userPrompts.length > 0) {
|
||||
await rig.driveMockUser(userPrompts, evalCase.timeout);
|
||||
}
|
||||
|
||||
// Send the initial prompt
|
||||
await rig.sendMessage(evalCase.prompt);
|
||||
|
||||
|
||||
38
evals/hybrid_handoff.eval.ts
Normal file
38
evals/hybrid_handoff.eval.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2026 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, expect } from 'vitest';
|
||||
import { appEvalTest } from './app-test-helper.js';
|
||||
import {
|
||||
userText,
|
||||
mockGenerateContentStreamText,
|
||||
} from '@google/gemini-cli-core';
|
||||
|
||||
describe('Hybrid Handoff (Mock User to Live Model)', () => {
|
||||
appEvalTest('ALWAYS_PASSES', {
|
||||
name: 'Mock User successfully primes AppRig using a scripted history and hands off to live model',
|
||||
timeout: 120000,
|
||||
script: [
|
||||
userText('Start priming'),
|
||||
mockGenerateContentStreamText(
|
||||
"Hello! I am a fake response. Let's prime the pump.",
|
||||
),
|
||||
userText('Continue priming'),
|
||||
mockGenerateContentStreamText(
|
||||
'Pump primed successfully. Ready for handoff.',
|
||||
),
|
||||
],
|
||||
prompt: 'What is 5 * 5? Please answer with just the final number.',
|
||||
assert: async (rig) => {
|
||||
await rig.drainBreakpointsUntilIdle(undefined, 30000);
|
||||
|
||||
const liveOutput = rig.getStaticOutput();
|
||||
|
||||
// Ensure the handoff was successful
|
||||
expect(liveOutput).toContain('25');
|
||||
},
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user