feat(core): support auto-distillation for tool output

This commit is contained in:
Your Name
2026-03-11 01:18:41 +00:00
parent 39d3b0e28c
commit d6173843b0
23 changed files with 1104 additions and 238 deletions
+33 -5
View File
@@ -13,7 +13,12 @@ import {
} from './test-helper.js';
import fs from 'node:fs';
import path from 'node:path';
import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
import {
DEFAULT_GEMINI_MODEL,
type ScriptItem,
extractFakeResponses,
extractUserPrompts,
} from '@google/gemini-cli-core';
export interface AppEvalCase {
name: string;
@@ -23,6 +28,12 @@ export interface AppEvalCase {
files?: Record<string, string>;
setup?: (rig: AppRig) => Promise<void>;
assert: (rig: AppRig, output: string) => Promise<void>;
/**
* Optional script to "prime the pump" before the main prompt.
* A sequential array interleaving MockUserTurn (e.g., userText('hello'))
* and FakeResponse (e.g., mockGenerateContentStreamText('hi')).
*/
script?: ScriptItem[];
}
/**
@@ -31,11 +42,23 @@ export interface AppEvalCase {
*/
export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const fn = async () => {
const configOverrides = {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
};
let userPrompts: string[] = [];
if (evalCase.script) {
// Extract fake model responses to inject into the ContentGenerator
configOverrides.fakeResponses = extractFakeResponses(evalCase.script);
// Extract the sequence of user prompts for the Mock User driver
userPrompts = extractUserPrompts(evalCase.script);
}
const rig = new AppRig({
configOverrides: {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
},
configOverrides,
});
const { logDir, sanitizedName } = await prepareLogDir(evalCase.name);
@@ -67,6 +90,11 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
// Wait for initial ready state
await rig.waitForIdle();
// Execute priming script if requested
if (userPrompts.length > 0) {
await rig.driveMockUser(userPrompts, evalCase.timeout);
}
// Send the initial prompt
await rig.sendMessage(evalCase.prompt);
+82
View File
@@ -0,0 +1,82 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import path from 'node:path';
import fs from 'node:fs';
import { appEvalTest } from './app-test-helper.js';
import {
userText,
mockGenerateContentStreamText,
} from '@google/gemini-cli-core';
describe('Auto-Distillation Behavioral Evals', () => {
  appEvalTest('USUALLY_PASSES', {
    name: 'Agent successfully navigates truncated output using the structural map to extract a secret',
    timeout: 180000,
    configOverrides: {},
    setup: async (rig) => {
      const testDir = rig.getTestDir();

      // Shapes for the generated filler data. Typing the fixture (instead of
      // the previous `any`) lets the compiler catch field typos in the push
      // loops below while still allowing the arrays to start empty.
      interface SessionEntry {
        session_id: string;
        duration_ms: number;
        bytes_transferred: number;
      }
      interface MetricEntry {
        timestamp: number;
        cpu_load: number;
        mem_usage: number;
      }

      // Fixture: a server-state dump large enough to trigger tool-output
      // truncation, with the needle (secret_token) buried mid-file.
      const mockData = {
        system_info: {
          version: '1.0.0',
          uptime: 999999,
          environment: 'production',
        },
        active_sessions: [] as SessionEntry[],
        quarantined_payloads: [
          { id: 'Subject-01', status: 'cleared' },
          {
            id: 'Subject-89',
            secret_token: 'the_cake_is_a_lie',
            status: 'held_for_review',
          },
          { id: 'Subject-99', status: 'cleared' },
        ],
        archived_metrics: [] as MetricEntry[],
      };

      // Bulk filler so the serialized JSON is "massive" — the distillation
      // path only activates on oversized tool output.
      for (let i = 0; i < 300; i++) {
        mockData.active_sessions.push({
          session_id: `sess_${i.toString().padStart(4, '0')}`,
          duration_ms: Math.floor(Math.random() * 10000),
          bytes_transferred: Math.floor(Math.random() * 50000),
        });
      }
      for (let i = 0; i < 2000; i++) {
        mockData.archived_metrics.push({
          timestamp: Date.now() - i * 1000,
          cpu_load: parseFloat(Math.random().toFixed(4)),
          mem_usage: parseFloat(Math.random().toFixed(4)),
        });
      }

      const massiveString = JSON.stringify(mockData, null, 2);
      fs.writeFileSync(
        path.join(testDir, 'server_state_dump.json'),
        massiveString,
      );
    },
    script: [
      userText('We have a critical error in production. Are you ready to help?'),
      mockGenerateContentStreamText(
        'I am ready. Please provide the details of the error.',
      ),
    ],
    prompt: `My application crashed with: "FATAL: Subject-89 held for review in quarantine". \n\nPlease run \`cat server_state_dump.json\` to investigate. The file is massive, so your tool output will be automatically truncated and you will receive a structural map instead. Use that structural map to determine the right command to extract the \`secret_token\` for Subject-89. Please state the exact secret token when you find it.`,
    assert: async (rig) => {
      await rig.waitForIdle(120000);
      const finalOutput = rig.getStaticOutput();
      // Ensure the agent correctly extracted the secret token after navigating the distilled output
      expect(finalOutput).toContain('the_cake_is_a_lie');
    },
  });
});
+39
View File
@@ -0,0 +1,39 @@
import { describe, expect } from 'vitest';
import {
appEvalTest,
} from './app-test-helper.js';
import {
userText,
mockGenerateContentStreamText,
} from '@google/gemini-cli-core';
describe('Hybrid Handoff (Mock User to Live Model)', () => {
  appEvalTest('ALWAYS_PASSES', {
    name: 'Mock User successfully primes AppRig using a scripted history and hands off to live model',
    timeout: 120000,
    // Two scripted user/model exchanges that the Mock User driver replays
    // verbatim before the live prompt below is sent.
    script: [
      userText('Start priming'),
      mockGenerateContentStreamText(
        "Hello! I am a fake response. Let's prime the pump.",
      ),
      userText('Continue priming'),
      mockGenerateContentStreamText(
        'Pump primed successfully. Ready for handoff.',
      ),
    ],
    prompt: 'What is 2 + 2? Please answer with exactly the number "4".',
    assert: async (rig) => {
      // By this point the Mock User has already driven the script, so the
      // conversation history holds the two fake turns and the final prompt
      // above was routed to the LIVE model.
      await rig.waitForIdle(60000);
      const output = rig.getStaticOutput();
      // A correct handoff means the live model answered the arithmetic prompt.
      expect(output).toContain('4');
      await rig.drainBreakpointsUntilIdle(undefined, 10000);
    },
  });
});