/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { componentEvalTest } from './component-test-helper.js';
import { SimulationHarness } from '../packages/core/src/context/system-tests/SimulationHarness.js';
import type { SidecarConfig } from '../packages/core/src/context/sidecar/types.js';
import { Config, LlmRole, getResponseText } from '@google/gemini-cli-core';
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import type { Content } from '@google/genai';
import { EVAL_MODEL } from './test-helper.js';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** Directory holding the recorded scenario JSON files used by this suite. */
const DATA_DIR = path.join(__dirname, 'data', 'context_manager');

/** A single recall question asked after the scenario history was replayed. */
interface ScenarioQuestion {
  id: string;
  /** Text sent to the model as the evaluation question. */
  prompt: string;
  /** The response passes if it contains this substring (case-insensitive). */
  expectedSubstring: string;
}

/** A recorded conversation plus the questions used to probe recall of it. */
interface Scenario {
  scenarioId: string;
  description: string;
  /** Messages replayed into the harness, one simulated turn per message. */
  history: Content[];
  questions: ScenarioQuestion[];
}

/**
 * Loads a scenario from `<DATA_DIR>/<id>.json`.
 *
 * @param id Base name (without extension) of the scenario file.
 * @returns The parsed scenario.
 * @throws Error if the file does not exist or lacks the `history` /
 *   `questions` arrays the evaluation relies on.
 */
const getScenario = (id: string): Scenario => {
  // Previously the file name was hard-coded and `id` was ignored.
  const filePath = path.join(DATA_DIR, `${id}.json`);
  if (!fs.existsSync(filePath)) {
    throw new Error(
      `Scenario file not found at ${filePath}. Run generate-c-compiler-scenario.ts first.`,
    );
  }
  const content = fs.readFileSync(filePath, 'utf-8');
  const parsed: unknown = JSON.parse(content);

  // Minimal shape check: only the fields this file actually iterates over.
  const candidate = parsed as Partial<Scenario> | null;
  if (
    typeof candidate !== 'object' ||
    candidate === null ||
    !Array.isArray(candidate.history) ||
    !Array.isArray(candidate.questions)
  ) {
    throw new Error(
      `Scenario file at ${filePath} is malformed: expected "history" and "questions" arrays.`,
    );
  }
  return candidate as Scenario;
};

/**
 * Runs a single evaluation iteration using the SimulationHarness.
 *
 * Replays the scenario history through a harness built from `sidecarConfig`,
 * projects the compressed history, asks one scenario question on top of it and
 * checks the answer for the expected substring.
 *
 * @param config Provides the LLM client, content generator and model name.
 * @param sidecarConfig Context-manager configuration under evaluation.
 * @param seed Selects the question: index is `seed % questions.length`.
 * @param scenario History to replay and the questions to choose from.
 * @returns Whether the expected substring was found, the raw response text,
 *   the question asked and the token count of the final working buffer.
 * @throws Error if the seed does not select a question (for example when the
 *   scenario has no questions or the seed is negative).
 */
async function runContextManagerEval(
  config: Config,
  sidecarConfig: SidecarConfig,
  seed: number,
  scenario: Scenario,
): Promise<{
  pass: boolean;
  response: string;
  question: ScenarioQuestion;
  tokens: number;
}> {
  // 1. Pick a question based on seed. Done up front so that an unusable
  // scenario fails before any model traffic is generated.
  const questionIndex = seed % scenario.questions.length;
  const question = scenario.questions[questionIndex];
  if (question === undefined) {
    throw new Error(
      `Seed ${seed} does not select a question in scenario "${scenario.scenarioId}" (${scenario.questions.length} question(s) available).`,
    );
  }

  const harness = await SimulationHarness.create(
    sidecarConfig,
    config.getBaseLlmClient(),
  );

  // 2. Feed trajectory
  for (const message of scenario.history) {
    await harness.simulateTurn([message]);
  }

  // Ensure background tasks (like StateSnapshotProcessor) have finished
  await harness.waitForIdle();

  // 3. Project compressed history
  const compressedHistory =
    await harness.contextManager.projectCompressedHistory();

  console.log('--- COMPRESSED HISTORY ---');
  console.log(JSON.stringify(compressedHistory, null, 2));
  console.log('--- END COMPRESSED HISTORY ---');

  // 4. Ask the question
  const evalPrompt: Content = {
    role: 'user',
    parts: [
      {
        text: `[SYSTEM: EVALUATION MODE - ANSWER AS TEXT ONLY - DO NOT USE TOOLS]\n\n${question.prompt}\n\nIMPORTANT: Answer in plain text and DO NOT call any tools. Provide the specific information requested.`,
      },
    ],
  };

  const response = await config.getContentGenerator().generateContent(
    {
      model: config.getModel(),
      config: {},
      contents: [...compressedHistory, evalPrompt],
    },
    'eval-prompt',
    LlmRole.UTILITY_TOOL,
  );

  const responseText = getResponseText(response) ?? '';
  // Case-insensitive containment check against the expected answer.
  const pass = responseText
    .toLowerCase()
    .includes(question.expectedSubstring.toLowerCase());

  const finalTokens = harness.env.tokenCalculator.calculateEpisodeListTokens(
    harness.contextManager.getWorkingBufferView(),
  );
  console.log(`Retained Budget: ${sidecarConfig.budget.retainedTokens}`);
  console.log(`Final Tokens: ${finalTokens}`);

  return { pass, response: responseText, question, tokens: finalTokens };
}

/**
 * Creates a deterministic pseudo-random generator (mulberry32) from a seed.
 *
 * @param seed Any number; it is coerced to an unsigned 32-bit integer.
 * @returns A function yielding floats in [0, 1), like `Math.random`.
 */
function createSeededRandom(seed: number): () => number {
  let state = seed >>> 0;
  return () => {
    state = (state + 0x6d2b79f5) >>> 0;
    let t = state;
    t = Math.imul(t ^ (t >>> 15), t | 1);
    t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}

/**
 * Builds a randomized sidecar configuration for the explorer (fuzzer).
 *
 * All random draws come from a generator seeded with `seed`, so the same seed
 * always yields the same configuration and a logged seed is enough to
 * reproduce a run.
 *
 * @param seed Seed for the pseudo-random generator.
 * @returns A configuration with a random budget, GC strategy, processor
 *   thresholds and an optional set of background processors.
 */
function generateRandomSidecarConfig(seed: number): SidecarConfig {
  const random = createSeededRandom(seed);

  const retained = 500 + Math.floor(random() * 9500); // 500 to 10000
  const max = Math.floor(retained * (1.2 + random() * 0.8)); // 20% to 100% buffer

  const strategies: Array<'truncate' | 'compress' | 'rollingSummarizer'> = [
    'truncate',
    'compress',
    'rollingSummarizer',
  ];
  const strategy = strategies[Math.floor(random() * strategies.length)];

  const useSquashing = random() > 0.5;
  const useSnapshot = random() > 0.5;

  // The processor entry type is not exported to this file, hence `any[]`.
  const processors: any[] = [
    {
      processorId: 'ToolMaskingProcessor',
      options: {
        stringLengthThresholdTokens: 2000 + Math.floor(random() * 8000),
      },
    },
    { processorId: 'BlobDegradationProcessor', options: {} },
    {
      processorId: 'SemanticCompressionProcessor',
      options: { nodeThresholdTokens: 1000 + Math.floor(random() * 4000) },
    },
    { processorId: 'EmergencyTruncationProcessor', options: {} },
  ];

  const backgroundProcessors: any[] = [];
  if (useSquashing) {
    backgroundProcessors.push({
      processorId: 'HistorySquashingProcessor',
      options: { maxTokensPerNode: 500 + Math.floor(random() * 2500) },
    });
  }
  if (useSnapshot) {
    backgroundProcessors.push({
      processorId: 'StateSnapshotProcessor',
      options: {},
    });
  }

  return {
    budget: { retainedTokens: retained, maxTokens: max },
    gcBackstop: {
      strategy,
      target: 'incremental',
      // Free 10% of the retained budget per collection.
      freeTokensTarget: Math.floor(retained * 0.1),
    },
    pipelines: [
      {
        name: 'Immediate Sanitization',
        triggers: ['on_turn'],
        execution: 'blocking',
        processors,
      },
      {
        name: 'Deep Background Compression',
        triggers: [{ type: 'timer', intervalMs: 100 }, 'budget_exceeded'],
        execution: 'background',
        processors: backgroundProcessors,
      },
    ],
  };
}

describe('ContextManager Evaluation Suite', () => {
  const scenario = getScenario('scenario-c-compiler');

  /**
   * The "Explorer" test.
   * Set RUN_EXPLORER=1 to run many iterations and find failures.
   *
   * Failures are only logged (together with a ready-to-paste frozen test
   * case); the explorer itself never fails the run.
   */
  if (process.env.RUN_EXPLORER) {
    componentEvalTest('ALWAYS_PASSES', {
      suiteName: 'context-manager',
      suiteType: 'component-level',
      name: 'ContextManager explorer (fuzzer)',
      configOverrides: { model: EVAL_MODEL },
      timeout: 1200000, // 20 minutes
      assert: async (config: Config) => {
        console.log('Starting ContextManager explorer loop...');

        for (let i = 0; i < 50; i++) {
          const seed = Math.floor(Math.random() * 1000000);
          const sidecarConfig = generateRandomSidecarConfig(seed);

          const result = await runContextManagerEval(
            config,
            sidecarConfig,
            seed,
            scenario,
          );

          if (!result.pass) {
            console.log('!!! FAILURE FOUND !!!');
            console.log(`Seed: ${seed}`);
            console.log(`Question: ${result.question.id}`);
            console.log(`Expected: ${result.question.expectedSubstring}`);
            console.log(`Actual Response: ${result.response}`);
            console.log('---------------------------');
            console.log(`NEW FROZEN CASE SUGGESTION:`);
            // The full config is embedded so the frozen case stays stable
            // even if the random generator changes later.
            console.log(`
componentEvalTest('ALWAYS_PASSES', {
  suiteName: 'context-manager',
  suiteType: 'component-level',
  name: 'ContextManager Frozen Case - ${result.question.id} (Budget: ${sidecarConfig.budget.retainedTokens})',
  configOverrides: { model: EVAL_MODEL },
  assert: async (config: Config) => {
    const scenario = getScenario('scenario-c-compiler');
    const sidecarConfig: SidecarConfig = ${JSON.stringify(sidecarConfig, null, 2)};
    const seed = ${seed};
    const result = await runContextManagerEval(config, sidecarConfig, seed, scenario);
    expect(result.pass, \`Recall failed for ${result.question.id}. Response: \${result.response}\`).toBe(true);
  }
});
`);
          } else {
            console.log(
              `Iteration ${i} (Budget: ${sidecarConfig.budget.retainedTokens}, Seed: ${seed}) passed.`,
            );
          }
        }
      },
    });
  }

  // --- Frozen cases discovered by explorer ---

  componentEvalTest('ALWAYS_PASSES', {
    suiteName: 'context-manager',
    suiteType: 'component-level',
    name: 'ContextManager Baseline - Generous Budget (Full Recall)',
    configOverrides: { model: EVAL_MODEL },
    assert: async (config: Config) => {
      const budget = { retained: 100000, max: 200000 };
      const sidecarConfig: SidecarConfig = {
        budget: { retainedTokens: budget.retained, maxTokens: budget.max },
        gcBackstop: { strategy: 'truncate', target: 'incremental' },
        pipelines: [],
      };

      const seed = 42;
      const result = await runContextManagerEval(
        config,
        sidecarConfig,
        seed,
        scenario,
      );
      expect(
        result.pass,
        `Baseline recall failed for ${result.question.id}. Response: ${result.response}`,
      ).toBe(true);
    },
  });

  componentEvalTest('ALWAYS_PASSES', {
    suiteName: 'context-manager',
    suiteType: 'component-level',
    name: 'ContextManager - Recall spatial lexer path (Limited Budget)',
    configOverrides: { model: EVAL_MODEL },
    assert: async (config: Config) => {
      const sidecarConfig: SidecarConfig = {
        budget: { retainedTokens: 5000, maxTokens: 8000 },
        gcBackstop: {
          strategy: 'truncate',
          target: 'incremental',
          freeTokensTarget: 500,
        },
        pipelines: [
          {
            name: 'Immediate Sanitization',
            triggers: ['on_turn'],
            execution: 'blocking',
            processors: [
              {
                processorId: 'ToolMaskingProcessor',
                options: { stringLengthThresholdTokens: 8000 },
              },
              { processorId: 'BlobDegradationProcessor', options: {} },
              {
                processorId: 'SemanticCompressionProcessor',
                options: { nodeThresholdTokens: 5000 },
              },
              { processorId: 'EmergencyTruncationProcessor', options: {} },
            ],
          },
        ],
      };

      const seed = 12; // Deterministically pick spatial-lexer-path question
      const result = await runContextManagerEval(
        config,
        sidecarConfig,
        seed,
        scenario,
      );
      expect(
        result.pass,
        `Recall failed for spatial path. Response: ${result.response}`,
      ).toBe(true);
    },
  });

  componentEvalTest('ALWAYS_PASSES', {
    suiteName: 'context-manager',
    suiteType: 'component-level',
    name: 'ContextManager Frozen Case - secret-beta (Budget: 2564)',
    configOverrides: { model: EVAL_MODEL },
    assert: async (config: Config) => {
      const scenario = getScenario('scenario-c-compiler');
      const sidecarConfig: SidecarConfig = {
        budget: {
          retainedTokens: 2564,
          maxTokens: 3855,
        },
        gcBackstop: {
          strategy: 'rollingSummarizer',
          target: 'incremental',
          freeTokensTarget: 256,
        },
        pipelines: [
          {
            name: 'Immediate Sanitization',
            triggers: ['on_turn'],
            execution: 'blocking',
            processors: [
              {
                processorId: 'ToolMaskingProcessor',
                options: {
                  stringLengthThresholdTokens: 3379,
                },
              },
              {
                processorId: 'BlobDegradationProcessor',
                options: {},
              },
              {
                processorId: 'SemanticCompressionProcessor',
                options: {
                  nodeThresholdTokens: 2456,
                },
              },
              {
                processorId: 'EmergencyTruncationProcessor',
                options: {},
              },
            ],
          },
          {
            name: 'Deep Background Compression',
            triggers: [
              {
                type: 'timer',
                intervalMs: 100,
              },
              'budget_exceeded',
            ],
            execution: 'background',
            processors: [
              {
                processorId: 'HistorySquashingProcessor',
                options: {
                  maxTokensPerNode: 2588,
                },
              },
              {
                processorId: 'StateSnapshotProcessor',
                options: {},
              },
            ],
          },
        ],
      };
      const seed = 248506;
      const result = await runContextManagerEval(
        config,
        sidecarConfig,
        seed,
        scenario,
      );
      expect(
        result.pass,
        `Recall failed for secret-beta. Response: ${result.response}`,
      ).toBe(true);
    },
  });

  componentEvalTest('ALWAYS_PASSES', {
    suiteName: 'context-manager',
    suiteType: 'component-level',
    name: 'ContextManager Frozen Case - codegen-header (Budget: 1820)',
    configOverrides: { model: EVAL_MODEL },
    assert: async (config: Config) => {
      const scenario = getScenario('scenario-c-compiler');
      const sidecarConfig: SidecarConfig = {
        budget: {
          retainedTokens: 1820,
          maxTokens: 3353,
        },
        gcBackstop: {
          strategy: 'rollingSummarizer',
          target: 'incremental',
          freeTokensTarget: 182,
        },
        pipelines: [
          {
            name: 'Immediate Sanitization',
            triggers: ['on_turn'],
            execution: 'blocking',
            processors: [
              {
                processorId: 'ToolMaskingProcessor',
                options: {
                  stringLengthThresholdTokens: 5375,
                },
              },
              {
                processorId: 'BlobDegradationProcessor',
                options: {},
              },
              {
                processorId: 'SemanticCompressionProcessor',
                options: {
                  nodeThresholdTokens: 3886,
                },
              },
              {
                processorId: 'EmergencyTruncationProcessor',
                options: {},
              },
            ],
          },
          {
            name: 'Deep Background Compression',
            triggers: [
              {
                type: 'timer',
                intervalMs: 100,
              },
              'budget_exceeded',
            ],
            execution: 'background',
            processors: [
              {
                processorId: 'HistorySquashingProcessor',
                options: {
                  maxTokensPerNode: 1999,
                },
              },
              {
                processorId: 'StateSnapshotProcessor',
                options: {},
              },
            ],
          },
        ],
      };
      const seed = 322027;
      const result = await runContextManagerEval(
        config,
        sidecarConfig,
        seed,
        scenario,
      );
      expect(
        result.pass,
        `Recall failed for codegen-header. Response: ${result.response}`,
      ).toBe(true);
    },
  });

  componentEvalTest('ALWAYS_PASSES', {
    suiteName: 'context-manager',
    suiteType: 'component-level',
    name: 'ContextManager Frozen Case - lexer-constraint (Budget: 9097)',
    configOverrides: { model: EVAL_MODEL },
    assert: async (config: Config) => {
      const scenario = getScenario('scenario-c-compiler');
      const sidecarConfig: SidecarConfig = {
        budget: {
          retainedTokens: 9097,
          maxTokens: 16089,
        },
        gcBackstop: {
          strategy: 'compress',
          target: 'incremental',
          freeTokensTarget: 909,
        },
        pipelines: [
          {
            name: 'Immediate Sanitization',
            triggers: ['on_turn'],
            execution: 'blocking',
            processors: [
              {
                processorId: 'ToolMaskingProcessor',
                options: {
                  stringLengthThresholdTokens: 8799,
                },
              },
              {
                processorId: 'BlobDegradationProcessor',
                options: {},
              },
              {
                processorId: 'SemanticCompressionProcessor',
                options: {
                  nodeThresholdTokens: 4044,
                },
              },
              {
                processorId: 'EmergencyTruncationProcessor',
                options: {},
              },
            ],
          },
          {
            name: 'Deep Background Compression',
            triggers: [
              {
                type: 'timer',
                intervalMs: 100,
              },
              'budget_exceeded',
            ],
            execution: 'background',
            processors: [
              {
                processorId: 'HistorySquashingProcessor',
                options: {
                  maxTokensPerNode: 1973,
                },
              },
              {
                processorId: 'StateSnapshotProcessor',
                options: {},
              },
            ],
          },
        ],
      };
      const seed = 161221;
      const result = await runContextManagerEval(
        config,
        sidecarConfig,
        seed,
        scenario,
      );
      expect(
        result.pass,
        `Recall failed for lexer-constraint. Response: ${result.response}`,
      ).toBe(true);
    },
  });
});