mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-23 19:44:30 -07:00
++
This commit is contained in:
@@ -13,7 +13,12 @@ import {
|
||||
} from './test-helper.js';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
|
||||
import {
|
||||
DEFAULT_GEMINI_MODEL,
|
||||
type ScriptItem,
|
||||
extractFakeResponses,
|
||||
extractUserPrompts,
|
||||
} from '@google/gemini-cli-core';
|
||||
|
||||
export interface AppEvalCase {
|
||||
name: string;
|
||||
@@ -23,6 +28,12 @@ export interface AppEvalCase {
|
||||
files?: Record<string, string>;
|
||||
setup?: (rig: AppRig) => Promise<void>;
|
||||
assert: (rig: AppRig, output: string) => Promise<void>;
|
||||
/**
|
||||
* Optional script to "prime the pump" before the main prompt.
|
||||
* A sequential array interleaving MockUserTurn (e.g., userText('hello'))
|
||||
* and FakeResponse (e.g., mockGenerateContentStreamText('hi')).
|
||||
*/
|
||||
script?: ScriptItem[];
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -31,11 +42,23 @@ export interface AppEvalCase {
|
||||
*/
|
||||
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
const fn = async () => {
|
||||
const configOverrides = {
|
||||
model: DEFAULT_GEMINI_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
};
|
||||
|
||||
let userPrompts: string[] = [];
|
||||
|
||||
if (evalCase.script) {
|
||||
// Extract fake model responses to inject into the ContentGenerator
|
||||
configOverrides.fakeResponses = extractFakeResponses(evalCase.script);
|
||||
|
||||
// Extract the sequence of user prompts for the Mock User driver
|
||||
userPrompts = extractUserPrompts(evalCase.script);
|
||||
}
|
||||
|
||||
const rig = new AppRig({
|
||||
configOverrides: {
|
||||
model: DEFAULT_GEMINI_MODEL,
|
||||
...evalCase.configOverrides,
|
||||
},
|
||||
configOverrides,
|
||||
});
|
||||
|
||||
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
|
||||
@@ -67,6 +90,11 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
|
||||
// Wait for initial ready state
|
||||
await rig.waitForIdle();
|
||||
|
||||
// Execute priming script if requested
|
||||
if (userPrompts.length > 0) {
|
||||
await rig.driveMockUser(userPrompts, evalCase.timeout);
|
||||
}
|
||||
|
||||
// Send the initial prompt
|
||||
await rig.sendMessage(evalCase.prompt);
|
||||
|
||||
|
||||
@@ -8,11 +8,15 @@ import { describe, expect } from 'vitest';
|
||||
import path from 'node:path';
|
||||
import fs from 'node:fs';
|
||||
import { appEvalTest } from './app-test-helper.js';
|
||||
import {
|
||||
userText,
|
||||
mockGenerateContentStreamText,
|
||||
} from '@google/gemini-cli-core';
|
||||
|
||||
describe('Auto-Distillation Behavioral Evals', () => {
|
||||
appEvalTest('ALWAYS_PASSES', {
|
||||
appEvalTest('USUALLY_PASSES', {
|
||||
name: 'Agent successfully navigates truncated output using the structural map to extract a secret',
|
||||
timeout: 120000,
|
||||
timeout: 180000,
|
||||
configOverrides: {},
|
||||
setup: async (rig) => {
|
||||
const testDir = rig.getTestDir();
|
||||
@@ -23,8 +27,6 @@ describe('Auto-Distillation Behavioral Evals', () => {
|
||||
uptime: 999999,
|
||||
environment: 'production',
|
||||
},
|
||||
// Pad with enough active sessions to push the next section past the 8,000 character 'head'
|
||||
// 300 sessions * ~80 chars = ~24,000 characters
|
||||
active_sessions: [],
|
||||
quarantined_payloads: [
|
||||
{ id: 'Subject-01', status: 'cleared' },
|
||||
@@ -35,8 +37,6 @@ describe('Auto-Distillation Behavioral Evals', () => {
|
||||
},
|
||||
{ id: 'Subject-99', status: 'cleared' },
|
||||
],
|
||||
// Pad with enough metrics to push the total file size well past 60,000 characters
|
||||
// 2000 metrics * ~70 chars = ~140,000 characters
|
||||
archived_metrics: [],
|
||||
};
|
||||
|
||||
@@ -56,29 +56,26 @@ describe('Auto-Distillation Behavioral Evals', () => {
|
||||
});
|
||||
}
|
||||
|
||||
const massiveString = JSON.stringify(mockData, null, 2);
|
||||
|
||||
fs.writeFileSync(
|
||||
path.join(testDir, 'server_state_dump.json'),
|
||||
JSON.stringify(mockData, null, 2),
|
||||
massiveString,
|
||||
);
|
||||
},
|
||||
prompt:
|
||||
'A massive log dump is located at server_state_dump.json. First, you MUST run the shell command `cat server_state_dump.json` to view it. The output will likely be truncated. Read the structural map provided in the output, and then figure out a way to extract the secret_token for the quarantined payload "Subject-89".',
|
||||
script: [
|
||||
userText('We have a critical error in production. Are you ready to help?'),
|
||||
mockGenerateContentStreamText(
|
||||
'I am ready. Please provide the details of the error.',
|
||||
),
|
||||
],
|
||||
prompt: `My application crashed with: "FATAL: Subject-89 held for review in quarantine". \n\nPlease run \`cat server_state_dump.json\` to investigate. The file is massive, so your tool output will be automatically truncated and you will receive a structural map instead. Use that structural map to determine the right command to extract the \`secret_token\` for Subject-89. Please state the exact secret token when you find it.`,
|
||||
assert: async (rig) => {
|
||||
await rig.waitForIdle(120000);
|
||||
|
||||
const finalOutput = rig.getStaticOutput();
|
||||
const curatedHistory = rig.getCuratedHistory();
|
||||
|
||||
// Ensure truncation occurred
|
||||
const stringifiedHistory = JSON.stringify(curatedHistory);
|
||||
expect(stringifiedHistory).toContain('Output too large. Showing first');
|
||||
|
||||
// Ensure the structural map summarizer was triggered
|
||||
expect(stringifiedHistory).toContain(
|
||||
'--- Structural Map of Truncated Content ---',
|
||||
);
|
||||
|
||||
// Ensure the agent correctly extracted the secret token
|
||||
// Ensure the agent correctly extracted the secret token after navigating the distilled output
|
||||
expect(finalOutput).toContain('the_cake_is_a_lie');
|
||||
},
|
||||
});
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
import { describe, expect } from 'vitest';
|
||||
import {
|
||||
appEvalTest,
|
||||
} from './app-test-helper.js';
|
||||
import {
|
||||
userText,
|
||||
mockGenerateContentStreamText,
|
||||
} from '@google/gemini-cli-core';
|
||||
|
||||
describe('Hybrid Handoff (Mock User to Live Model)', () => {
|
||||
appEvalTest('ALWAYS_PASSES', {
|
||||
name: 'Mock User successfully primes AppRig using a scripted history and hands off to live model',
|
||||
timeout: 120000,
|
||||
script: [
|
||||
userText('Start priming'),
|
||||
mockGenerateContentStreamText(
|
||||
"Hello! I am a fake response. Let's prime the pump.",
|
||||
),
|
||||
userText('Continue priming'),
|
||||
mockGenerateContentStreamText(
|
||||
'Pump primed successfully. Ready for handoff.',
|
||||
),
|
||||
],
|
||||
prompt: 'What is 2 + 2? Please answer with exactly the number "4".',
|
||||
assert: async (rig) => {
|
||||
// The Mock User has automatically driven the script before sending the final prompt.
|
||||
// So the history now has the 2 fake turns in it, and the final prompt was just sent to the LIVE model.
|
||||
|
||||
await rig.waitForIdle(60000);
|
||||
|
||||
const liveOutput = rig.getStaticOutput();
|
||||
|
||||
// Ensure the handoff was successful
|
||||
expect(liveOutput).toContain('4');
|
||||
|
||||
await rig.drainBreakpointsUntilIdle(undefined, 10000);
|
||||
},
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user