feat(cli, core): Support hybrid evals.

2026-07-11 18:40:57 -07:00 · 2026-03-19 16:44:32 +00:00
parent 98d1bec99f
commit f178f9c020
20 changed files with 980 additions and 95 deletions
@@ -13,7 +13,12 @@ import {
 } from './test-helper.js';
 import fs from 'node:fs';
 import path from 'node:path';
-import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
+import {
+  DEFAULT_GEMINI_MODEL,
+  type ScriptItem,
+  extractFakeResponses,
+  extractUserPrompts,
+} from '@google/gemini-cli-core';

 export interface AppEvalCase {
  name: string;
@@ -23,6 +28,12 @@ export interface AppEvalCase {
  files?: Record<string, string>;
  setup?: (rig: AppRig) => Promise<void>;
  assert: (rig: AppRig, output: string) => Promise<void>;
+  /**
+   * Optional scripted exchange that runs before the main prompt is sent.
+   * A sequential array interleaving MockUserTurn (e.g., userText('hello'))
+   * and FakeResponse (e.g., mockGenerateContentStreamText('hi')).
+   */
+  script?: ScriptItem[];
 }

 /**
@@ -31,11 +42,25 @@ export interface AppEvalCase {
 */
 export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
  const fn = async () => {
+    const configOverrides = {
+      model: DEFAULT_GEMINI_MODEL,
+      ...evalCase.configOverrides,
+    };
+
+    let userPrompts: string[] = [];
+
+    if (evalCase.script) {
+      configOverrides.fakeModelConfig = {
+        responses: extractFakeResponses(evalCase.script),
+        hybridHandoff: true,
+      };
+
+      // Extract the sequence of user prompts for the Mock User driver
+      userPrompts = extractUserPrompts(evalCase.script);
+    }
+
    const rig = new AppRig({
-      configOverrides: {
-        model: DEFAULT_GEMINI_MODEL,
-        ...evalCase.configOverrides,
-      },
+      configOverrides,
    });

    const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
@@ -62,11 +87,16 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
      }

      // Render the app!
-      rig.render();
+      await rig.render();

      // Wait for initial ready state
      await rig.waitForIdle();

+      // Execute priming script if requested
+      if (userPrompts.length > 0) {
+        await rig.driveMockUser(userPrompts, evalCase.timeout);
+      }
+
      // Send the initial prompt
      await rig.sendMessage(evalCase.prompt);

@@ -0,0 +1,38 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect } from 'vitest';
+import { appEvalTest } from './app-test-helper.js';
+import {
+  userText,
+  mockGenerateContentStreamText,
+} from '@google/gemini-cli-core';
+
+describe('Hybrid Handoff (Mock User to Live Model)', () => {
+  appEvalTest('ALWAYS_PASSES', {
+    name: 'Mock User successfully primes AppRig using a scripted history and hands off to live model',
+    timeout: 120000,
+    script: [
+      userText('Start priming'),
+      mockGenerateContentStreamText(
+        "Hello! I am a fake response. Let's prime the pump.",
+      ),
+      userText('Continue priming'),
+      mockGenerateContentStreamText(
+        'Pump primed successfully. Ready for handoff.',
+      ),
+    ],
+    prompt: 'What is 5 * 5? Please answer with just the final number.',
+    assert: async (rig) => {
+      await rig.drainBreakpointsUntilIdle(undefined, 30000);
+
+      const liveOutput = rig.getStaticOutput();
+
+      // No scripted reply contains '25', so it should come from the live model.
+      expect(liveOutput).toContain('25');
+    },
+  });
+});