Files
gemini-cli/evals/context_manager.eval.ts
Christian Gunderman 1fa422a7c6 Update tests.
2026-04-10 17:11:58 -07:00

470 lines
14 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, expect } from 'vitest';
import { componentEvalTest } from './component-test-helper.js';
import { SimulationHarness } from '../packages/core/src/context/system-tests/SimulationHarness.js';
import type { SidecarConfig } from '../packages/core/src/context/sidecar/types.js';
import { Config, LlmRole, getResponseText } from '@google/gemini-cli-core';
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import type { Content } from '@google/genai';
import { EVAL_MODEL } from './test-helper.js';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const DATA_DIR = path.join(__dirname, 'data', 'context_manager');
interface ScenarioQuestion {
id: string;
prompt: string;
expectedSubstring: string;
}
interface Scenario {
scenarioId: string;
description: string;
history: Content[];
questions: ScenarioQuestion[];
}
const getScenario = (id: string): Scenario => {
const filePath = path.join(DATA_DIR, 'scenario-c-compiler.json');
if (!fs.existsSync(filePath)) {
throw new Error(
`Scenario file not found at ${filePath}. Run generate-c-compiler-scenario.ts first.`,
);
}
const content = fs.readFileSync(filePath, 'utf-8');
return JSON.parse(content) as Scenario;
};
/**
* Runs a single evaluation iteration using the SimulationHarness.
*/
async function runContextManagerEval(
config: Config,
sidecarConfig: SidecarConfig,
seed: number,
scenario: Scenario,
judge: boolean = true,
): Promise<{
pass: boolean;
response: string;
question: ScenarioQuestion;
tokens: number;
}> {
const harness = await SimulationHarness.create(
sidecarConfig,
config.getBaseLlmClient() as any,
path.join(process.cwd(), 'harness-tmp'),
);
// 1. Feed trajectory
for (const message of scenario.history) {
await harness.simulateTurn([message]);
}
// Ensure background tasks (like StateSnapshotProcessor) have finished
// 2. Pick a question based on seed
const questionIndex = seed % scenario.questions.length;
const question = scenario.questions[questionIndex];
// 3. Project compressed history
const compressedHistory =
await harness.contextManager.projectCompressedHistory();
// We can't easily get the episode IDs from the projected Content[],
// but we can look at the working buffer instead.
const workingBuffer = harness.contextManager.getNodes();
console.log('--- WORKING BUFFER EPISODES START ---');
workingBuffer.forEach((node: any, i: number) => {
console.log(`[Node ${i}] ID: ${node.id}, Type: ${node.type}`);
if (node.type === 'USER_PROMPT') {
console.log(` Text: ${node.semanticParts?.[0]?.text?.slice(0, 50)}`);
}
});
console.log('--- WORKING BUFFER EPISODES END ---');
console.log('--- COMPRESSED HISTORY START ---');
compressedHistory.forEach((msg, i) => {
console.log(`[${i}] Role: ${msg.role}`);
(msg.parts || []).forEach((part, j) => {
if ('text' in part) {
console.log(
` Part ${j} (text): ${part.text?.slice(0, 100)}${part.text && part.text.length > 100 ? '...' : ''}`,
);
} else if ('functionCall' in part) {
console.log(` Part ${j} (functionCall): ${part.functionCall?.name}`);
} else if ('functionResponse' in part) {
console.log(
` Part ${j} (functionResponse): ${part.functionResponse?.name}`,
);
} else {
console.log(` Part ${j} (other): ${Object.keys(part).join(', ')}`);
}
});
});
console.log('--- COMPRESSED HISTORY END ---');
// 4. Ask the question
const evalPrompt: Content = {
role: 'user',
parts: [
{
text: `[SYSTEM: EVALUATION MODE - ANSWER AS TEXT ONLY - DO NOT USE TOOLS]\n\n${question.prompt}\n\nIMPORTANT: Answer in plain text and DO NOT call any tools. Provide the specific information requested.`,
},
],
};
const response = await config.getContentGenerator().generateContent(
{
model: config.getModel(),
config: {},
contents: [...compressedHistory, evalPrompt],
},
'eval-prompt',
LlmRole.UTILITY_TOOL,
);
const responseText = getResponseText(response) ?? '';
let pass = false;
if (judge) {
pass = await judgeResponse(config, question, responseText);
} else {
// Naive string check for potential failure identification
pass = responseText
.toLowerCase()
.includes(question.expectedSubstring.toLowerCase());
}
const finalTokens = harness.env.tokenCalculator.calculateConcreteListTokens(
harness.contextManager.getNodes(),
);
return { pass, response: responseText, question, tokens: finalTokens };
}
/**
* Uses an LLM to judge whether the response matches the expected information.
*/
async function judgeResponse(
config: Config,
question: ScenarioQuestion,
actualResponse: string,
): Promise<boolean> {
const lowerResponse = actualResponse.toLowerCase();
const lowerExpected = question.expectedSubstring.toLowerCase();
// Fast path: direct substring match
if (lowerResponse.includes(lowerExpected)) {
return true;
}
const judgePrompt: Content = {
role: 'user',
parts: [
{
text: `Evaluate if the AI's response correctly answers the question by explicitly containing the expected information.
Question: ${question.prompt}
Expected Information: ${question.expectedSubstring}
AI's Response: ${actualResponse}
CRITICAL RULES FOR JUDGING:
1. The AI's response MUST explicitly answer the question in the prompt and the provided details must match those in 'Expected Information'.
2. Do not infer or assume knowledge. Vague, partial, or generalized answers MUST fail.
3. If the AI hallucinates, states it cannot find the information, or provides an incomplete answer, it MUST fail.
4. Expected information may contain extra details that are not required to answer the question that aren't in the AI's Response. For example: "the answer is X. It was previously Y.". These are still considered to be passing cases.
Does the AI's response explicitly and completely satisfy the expected information?
Respond with ONLY "PASS" or "FAIL".`,
},
],
};
const response = await config.getContentGenerator().generateContent(
{
model: EVAL_MODEL,
config: { temperature: 0 },
contents: [judgePrompt],
},
'eval-judge',
LlmRole.UTILITY_TOOL,
);
const judgeText = getResponseText(response)?.trim().toUpperCase() ?? '';
return judgeText === 'PASS';
}
function calculateScenarioTokens(scenario: Scenario): number {
let totalChars = 0;
for (const message of scenario.history) {
for (const part of message.parts || []) {
if ('text' in part && part.text) {
totalChars += part.text.length;
}
}
}
// The SimulationHarness uses 4 chars per token.
return Math.ceil(totalChars / 4);
}
function generateRandomSidecarConfig(
seed: number,
maxRetained: number,
): SidecarConfig {
// Simple LCG or similar for deterministic randomness from seed if needed,
// but for a fuzzer we can just use Math.random and log the seed.
const minRetained = Math.min(1000, maxRetained);
const retained =
minRetained + Math.floor(Math.random() * (maxRetained - minRetained));
const max = Math.floor(retained * (1.1 + Math.random() * 2.0)); // 110% to 300% buffer
const useSquashing = Math.random() > 0.5;
const useRollingSummary = Math.random() > 0.5;
const useSnapshot = Math.random() > 0.5;
const processors: any[] = [
{
processorId: 'ToolMaskingProcessor',
options: {
stringLengthThresholdTokens: Math.floor(
retained * (0.2 + Math.random() * 0.5),
),
},
},
{ processorId: 'BlobDegradationProcessor', options: {} },
{
processorId: 'NodeDistillationProcessor',
options: {
nodeThresholdTokens: Math.floor(retained * (0.1 + Math.random() * 0.3)),
},
},
{
processorId: 'NodeTruncationProcessor',
options: {
maxTokensPerNode: Math.floor(retained * (0.1 + Math.random() * 0.5)),
},
},
];
const backgroundProcessors: any[] = [];
if (useSquashing) {
backgroundProcessors.push({
processorId: 'HistoryTruncationProcessor',
options: {
target: 'freeNTokens',
freeTokensTarget: Math.floor(retained * (0.05 + Math.random() * 0.15)),
},
});
}
if (useRollingSummary) {
backgroundProcessors.push({
processorId: 'RollingSummaryProcessor',
options: {
target: 'freeNTokens',
freeTokensTarget: Math.floor(retained * (0.05 + Math.random() * 0.15)),
},
});
}
if (useSnapshot) {
backgroundProcessors.push({
processorId: 'StateSnapshotProcessor',
options: {},
});
}
const workers: any[] = [];
if (useSnapshot && Math.random() > 0.5) {
workers.push({
workerId: 'StateSnapshotWorker',
options: {
type: Math.random() > 0.5 ? 'accumulate' : 'point-in-time',
},
});
}
return {
budget: { retainedTokens: retained, maxTokens: max },
pipelines: [
{
name: 'Immediate Sanitization',
triggers: ['new_message'],
processors,
},
{
name: 'Deep Background Compression',
triggers: [
Math.random() > 0.5
? { type: 'timer', intervalMs: 100 }
: 'gc_backstop',
'retained_exceeded',
],
processors: backgroundProcessors,
},
],
workers: workers.length > 0 ? workers : undefined,
};
}
describe('ContextManager Evaluation Suite', () => {
const scenario = getScenario('scenario-c-compiler');
/**
* The "Explorer" test.
* Set RUN_EXPLORER=1 to run many iterations and find failures.
*/
if (process.env['RUN_EXPLORER']) {
componentEvalTest('ALWAYS_PASSES', {
suiteName: 'context-manager',
suiteType: 'component-level',
name: 'ContextManager explorer (fuzzer)',
configOverrides: { model: EVAL_MODEL },
timeout: 1200000, // 20 minutes
assert: async (config: Config) => {
const scenarioTokens = calculateScenarioTokens(scenario);
console.log(
`Starting ContextManager explorer loop for scenario of size ${scenarioTokens} tokens...`,
);
for (let i = 0; i < 50; i++) {
const seed = Math.floor(Math.random() * 1000000);
const sidecarConfig = generateRandomSidecarConfig(
seed,
scenarioTokens,
);
const result = await runContextManagerEval(
config,
sidecarConfig,
seed,
scenario,
false, // Optimistic string check
);
if (!result.pass) {
// Potential failure. Confirm with LLM judge.
result.pass = await judgeResponse(
config,
result.question,
result.response,
);
}
if (!result.pass) {
console.log('!!! FAILURE FOUND !!!');
console.log(`Seed: ${seed}`);
console.log(`Question: ${result.question.id}`);
console.log(`Expected: ${result.question.expectedSubstring}`);
console.log(`Actual Response: ${result.response}`);
console.log('---------------------------');
console.log(`NEW FROZEN CASE SUGGESTION:`);
console.log(`
componentEvalTest('ALWAYS_PASSES', {
suiteName: 'context-manager',
suiteType: 'component-level',
name: 'ContextManager Frozen Case - ${result.question.id} (Budget: ${sidecarConfig.budget.retainedTokens})',
configOverrides: { model: EVAL_MODEL },
assert: async (config: Config) => {
const scenario = getScenario('scenario-c-compiler');
const sidecarConfig: SidecarConfig = ${JSON.stringify(sidecarConfig, null, 2)};
const seed = ${seed};
const result = await runContextManagerEval(config, sidecarConfig, seed, scenario);
expect(result.pass, \`Recall failed for ${result.question.id}. Response: \${result.response}\`).toBe(true);
}
});
`);
} else {
console.log(
`Iteration ${i} (Budget: ${sidecarConfig.budget.retainedTokens}, Seed: ${seed}) passed.`,
);
}
}
},
});
}
// --- Frozen cases discovered by explorer ---
componentEvalTest('ALWAYS_PASSES', {
suiteName: 'context-manager',
suiteType: 'component-level',
name: 'ContextManager Frozen Case - lexer-constraint (Budget: 3737)',
configOverrides: { model: EVAL_MODEL },
assert: async (config: Config) => {
const scenario = getScenario('scenario-c-compiler');
const sidecarConfig: SidecarConfig = {
budget: {
retainedTokens: 3737,
maxTokens: 11280,
},
pipelines: [
{
name: 'Immediate Sanitization',
triggers: ['new_message'],
processors: [
{
processorId: 'ToolMaskingProcessor',
options: {
stringLengthThresholdTokens: 2442,
},
},
{
processorId: 'BlobDegradationProcessor',
options: {},
},
{
processorId: 'NodeDistillationProcessor',
options: {
nodeThresholdTokens: 733,
},
},
{
processorId: 'NodeTruncationProcessor',
options: { maxTokensPerNode: 1000 },
},
],
},
{
name: 'Deep Background Compression',
triggers: [
{
type: 'timer',
intervalMs: 100,
},
'retained_exceeded',
],
processors: [
{
processorId: 'HistoryTruncationProcessor',
options: {
target: 'freeNTokens',
freeTokensTarget: 327,
},
},
],
},
],
};
const seed = 288421;
const result = await runContextManagerEval(
config,
sidecarConfig,
seed,
scenario,
);
expect(
result.pass,
`Recall failed for lexer-constraint. Response: ${result.response}`,
).toBe(true);
},
});
});