++

2026-04-23 19:44:30 -07:00 · 2026-03-11 18:58:50 +00:00
parent 23575ff83d
commit 3291d3a58b
15 changed files with 542 additions and 117 deletions
@@ -13,7 +13,12 @@ import {
 } from './test-helper.js';
 import fs from 'node:fs';
 import path from 'node:path';
-import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
+import {
+  DEFAULT_GEMINI_MODEL,
+  type ScriptItem,
+  extractFakeResponses,
+  extractUserPrompts,
+} from '@google/gemini-cli-core';

 export interface AppEvalCase {
  name: string;
@@ -23,6 +28,12 @@ export interface AppEvalCase {
  files?: Record<string, string>;
  setup?: (rig: AppRig) => Promise<void>;
  assert: (rig: AppRig, output: string) => Promise<void>;
+  /**
+   * Optional script to "prime the pump" before the main prompt.
+   * A sequential array interleaving MockUserTurn (e.g., userText('hello'))
+   * and FakeResponse (e.g., mockGenerateContentStreamText('hi')).
+   */
+  script?: ScriptItem[];
 }

 /**
@@ -31,11 +42,23 @@ export interface AppEvalCase {
 */
 export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
  const fn = async () => {
+    const configOverrides = {
+      model: DEFAULT_GEMINI_MODEL,
+      ...evalCase.configOverrides,
+    };
+
+    let userPrompts: string[] = [];
+
+    if (evalCase.script) {
+      // Extract fake model responses to inject into the ContentGenerator
+      configOverrides.fakeResponses = extractFakeResponses(evalCase.script);
+
+      // Extract the sequence of user prompts for the Mock User driver
+      userPrompts = extractUserPrompts(evalCase.script);
+    }
+
    const rig = new AppRig({
-      configOverrides: {
-        model: DEFAULT_GEMINI_MODEL,
-        ...evalCase.configOverrides,
-      },
+      configOverrides,
    });

    const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
@@ -67,6 +90,11 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
      // Wait for initial ready state
      await rig.waitForIdle();

+      // Execute priming script if requested
+      if (userPrompts.length > 0) {
+        await rig.driveMockUser(userPrompts, evalCase.timeout);
+      }
+
      // Send the initial prompt
      await rig.sendMessage(evalCase.prompt);

@@ -8,11 +8,15 @@ import { describe, expect } from 'vitest';
 import path from 'node:path';
 import fs from 'node:fs';
 import { appEvalTest } from './app-test-helper.js';
+import {
+  userText,
+  mockGenerateContentStreamText,
+} from '@google/gemini-cli-core';

 describe('Auto-Distillation Behavioral Evals', () => {
-  appEvalTest('ALWAYS_PASSES', {
+  appEvalTest('USUALLY_PASSES', {
    name: 'Agent successfully navigates truncated output using the structural map to extract a secret',
-    timeout: 120000,
+    timeout: 180000,
    configOverrides: {},
    setup: async (rig) => {
      const testDir = rig.getTestDir();
@@ -23,8 +27,6 @@ describe('Auto-Distillation Behavioral Evals', () => {
          uptime: 999999,
          environment: 'production',
        },
-        // Pad with enough active sessions to push the next section past the 8,000 character 'head'
-        // 300 sessions * ~80 chars = ~24,000 characters
        active_sessions: [],
        quarantined_payloads: [
          { id: 'Subject-01', status: 'cleared' },
@@ -35,8 +37,6 @@ describe('Auto-Distillation Behavioral Evals', () => {
          },
          { id: 'Subject-99', status: 'cleared' },
        ],
-        // Pad with enough metrics to push the total file size well past 60,000 characters
-        // 2000 metrics * ~70 chars = ~140,000 characters
        archived_metrics: [],
      };

@@ -56,29 +56,26 @@ describe('Auto-Distillation Behavioral Evals', () => {
        });
      }

+      const massiveString = JSON.stringify(mockData, null, 2);
+
      fs.writeFileSync(
        path.join(testDir, 'server_state_dump.json'),
-        JSON.stringify(mockData, null, 2),
+        massiveString,
      );
    },
-    prompt:
-      'A massive log dump is located at server_state_dump.json. First, you MUST run the shell command `cat server_state_dump.json` to view it. The output will likely be truncated. Read the structural map provided in the output, and then figure out a way to extract the secret_token for the quarantined payload "Subject-89".',
+    script: [
+      userText('We have a critical error in production. Are you ready to help?'),
+      mockGenerateContentStreamText(
+        'I am ready. Please provide the details of the error.',
+      ),
+    ],
+    prompt: `My application crashed with: "FATAL: Subject-89 held for review in quarantine". \n\nPlease run \`cat server_state_dump.json\` to investigate. The file is massive, so your tool output will be automatically truncated and you will receive a structural map instead. Use that structural map to determine the right command to extract the \`secret_token\` for Subject-89. Please state the exact secret token when you find it.`,
    assert: async (rig) => {
      await rig.waitForIdle(120000);

      const finalOutput = rig.getStaticOutput();
-      const curatedHistory = rig.getCuratedHistory();

-      // Ensure truncation occurred
-      const stringifiedHistory = JSON.stringify(curatedHistory);
-      expect(stringifiedHistory).toContain('Output too large. Showing first');
-
-      // Ensure the structural map summarizer was triggered
-      expect(stringifiedHistory).toContain(
-        '--- Structural Map of Truncated Content ---',
-      );
-
-      // Ensure the agent correctly extracted the secret token
+      // Ensure the agent correctly extracted the secret token after navigating the distilled output
      expect(finalOutput).toContain('the_cake_is_a_lie');
    },
  });
@@ -0,0 +1,39 @@
+import { describe, expect } from 'vitest';
+import {
+  appEvalTest,
+} from './app-test-helper.js';
+import {
+  userText,
+  mockGenerateContentStreamText,
+} from '@google/gemini-cli-core';
+
+describe('Hybrid Handoff (Mock User to Live Model)', () => {
+  appEvalTest('ALWAYS_PASSES', {
+    name: 'Mock User successfully primes AppRig using a scripted history and hands off to live model',
+    timeout: 120000,
+    script: [
+      userText('Start priming'),
+      mockGenerateContentStreamText(
+        "Hello! I am a fake response. Let's prime the pump.",
+      ),
+      userText('Continue priming'),
+      mockGenerateContentStreamText(
+        'Pump primed successfully. Ready for handoff.',
+      ),
+    ],
+    prompt: 'What is 2 + 2? Please answer with exactly the number "4".',
+    assert: async (rig) => {
+      // The Mock User has automatically driven the script before sending the final prompt.
+      // So the history now has the 2 fake turns in it, and the final prompt was just sent to the LIVE model.
+
+      await rig.waitForIdle(60000);
+
+      const liveOutput = rig.getStaticOutput();
+
+      // Ensure the handoff was successful
+      expect(liveOutput).toContain('4');
+
+      await rig.drainBreakpointsUntilIdle(undefined, 10000);
+    },
+  });
+});