From 1fa422a7c64908732ae5f7f97c3007efdfff91c7 Mon Sep 17 00:00:00 2001 From: Christian Gunderman Date: Fri, 10 Apr 2026 17:11:58 -0700 Subject: [PATCH] Update tests. --- evals/context_manager.eval.ts | 361 +++--------------- .../processors/nodeDistillationProcessor.ts | 2 +- 2 files changed, 62 insertions(+), 301 deletions(-) diff --git a/evals/context_manager.eval.ts b/evals/context_manager.eval.ts index 22b40d9254..0f95f0fa00 100644 --- a/evals/context_manager.eval.ts +++ b/evals/context_manager.eval.ts @@ -60,7 +60,7 @@ async function runContextManagerEval( }> { const harness = await SimulationHarness.create( sidecarConfig, - config.getBaseLlmClient(), + config.getBaseLlmClient() as any, path.join(process.cwd(), 'harness-tmp'), ); @@ -70,7 +70,6 @@ async function runContextManagerEval( } // Ensure background tasks (like StateSnapshotProcessor) have finished - await harness.waitForIdle(); // 2. Pick a question based on seed const questionIndex = seed % scenario.questions.length; @@ -82,12 +81,12 @@ async function runContextManagerEval( // We can't easily get the episode IDs from the projected Content[], // but we can look at the working buffer instead. - const workingBuffer = harness.contextManager.getWorkingBufferView(); + const workingBuffer = harness.contextManager.getNodes(); console.log('--- WORKING BUFFER EPISODES START ---'); - workingBuffer.forEach((ep, i) => { - console.log(`[Ep ${i}] ID: ${ep.id}, Type: ${ep.trigger.type}`); - if (ep.trigger.type === 'USER_PROMPT') { - console.log(` Text: ${ep.trigger.semanticParts[0]?.text?.slice(0, 50)}`); + workingBuffer.forEach((node: any, i: number) => { + console.log(`[Node ${i}] ID: ${node.id}, Type: ${node.type}`); + if (node.type === 'USER_PROMPT') { + console.log(` Text: ${node.semanticParts?.[0]?.text?.slice(0, 50)}`); } }); console.log('--- WORKING BUFFER EPISODES END ---'); @@ -95,16 +94,16 @@ async function runContextManagerEval( console.log('--- COMPRESSED HISTORY START ---'); compressedHistory.forEach((msg, i) => { console.log(`[${i}] Role: ${msg.role}`); - msg.parts.forEach((part, j) => { + (msg.parts || []).forEach((part, j) => { if ('text' in part) { console.log( ` Part ${j} (text): ${part.text?.slice(0, 100)}${part.text && part.text.length > 100 ? '...' : ''}`, ); } else if ('functionCall' in part) { - console.log(` Part ${j} (functionCall): ${part.functionCall.name}`); + console.log(` Part ${j} (functionCall): ${part.functionCall?.name}`); } else if ('functionResponse' in part) { console.log( - ` Part ${j} (functionResponse): ${part.functionResponse.name}`, + ` Part ${j} (functionResponse): ${part.functionResponse?.name}`, ); } else { console.log(` Part ${j} (other): ${Object.keys(part).join(', ')}`); @@ -144,8 +143,8 @@ async function runContextManagerEval( .includes(question.expectedSubstring.toLowerCase()); } - const finalTokens = harness.env.tokenCalculator.calculateEpisodeListTokens( - harness.contextManager.getWorkingBufferView(), + const finalTokens = harness.env.tokenCalculator.calculateConcreteListTokens( + harness.contextManager.getNodes(), ); return { pass, response: responseText, question, tokens: finalTokens }; @@ -206,7 +205,7 @@ Respond with ONLY "PASS" or "FAIL".`, function calculateScenarioTokens(scenario: Scenario): number { let totalChars = 0; for (const message of scenario.history) { - for (const part of message.parts) { + for (const part of message.parts || []) { if ('text' in part && part.text) { totalChars += part.text.length; } @@ -228,6 +227,7 @@ function generateRandomSidecarConfig( const max = Math.floor(retained * (1.1 + Math.random() * 2.0)); // 110% to 300% buffer const useSquashing = Math.random() > 0.5; + const useRollingSummary = Math.random() > 0.5; const useSnapshot = Math.random() > 0.5; const processors: any[] = [ @@ -241,23 +241,40 @@ function generateRandomSidecarConfig( }, { processorId: 'BlobDegradationProcessor', options: {} }, { - processorId: 'SemanticCompressionProcessor', + processorId: 'NodeDistillationProcessor', options: { nodeThresholdTokens: Math.floor(retained * (0.1 + Math.random() * 0.3)), }, }, - { processorId: 'EmergencyTruncationProcessor', options: {} }, + { + processorId: 'NodeTruncationProcessor', + options: { + maxTokensPerNode: Math.floor(retained * (0.1 + Math.random() * 0.5)), + }, + }, ]; const backgroundProcessors: any[] = []; if (useSquashing) { backgroundProcessors.push({ - processorId: 'HistorySquashingProcessor', + processorId: 'HistoryTruncationProcessor', options: { - maxTokensPerNode: Math.floor(retained * (0.05 + Math.random() * 0.15)), + target: 'freeNTokens', + freeTokensTarget: Math.floor(retained * (0.05 + Math.random() * 0.15)), }, }); } + + if (useRollingSummary) { + backgroundProcessors.push({ + processorId: 'RollingSummaryProcessor', + options: { + target: 'freeNTokens', + freeTokensTarget: Math.floor(retained * (0.05 + Math.random() * 0.15)), + }, + }); + } + if (useSnapshot) { backgroundProcessors.push({ processorId: 'StateSnapshotProcessor', @@ -265,27 +282,36 @@ function generateRandomSidecarConfig( }); } + const workers: any[] = []; + if (useSnapshot && Math.random() > 0.5) { + workers.push({ + workerId: 'StateSnapshotWorker', + options: { + type: Math.random() > 0.5 ? 'accumulate' : 'point-in-time', + }, + }); + } + return { budget: { retainedTokens: retained, maxTokens: max }, - gcBackstop: { - strategy: 'truncate', // Hardcoded since compress/rollingSummarizer are currently unimplemented - target: 'incremental', - freeTokensTarget: Math.floor(retained * 0.1), - }, pipelines: [ { name: 'Immediate Sanitization', - triggers: ['on_turn'], - execution: 'blocking', + triggers: ['new_message'], processors, }, { name: 'Deep Background Compression', - triggers: [{ type: 'timer', intervalMs: 100 }, 'budget_exceeded'], - execution: 'background', + triggers: [ + Math.random() > 0.5 + ? { type: 'timer', intervalMs: 100 } + : 'gc_backstop', + 'retained_exceeded', + ], processors: backgroundProcessors, }, ], + workers: workers.length > 0 ? workers : undefined, }; } @@ -296,7 +322,7 @@ describe('ContextManager Evaluation Suite', () => { * The "Explorer" test. * Set RUN_EXPLORER=1 to run many iterations and find failures. */ - if (process.env.RUN_EXPLORER) { + if (process.env['RUN_EXPLORER']) { componentEvalTest('ALWAYS_PASSES', { suiteName: 'context-manager', suiteType: 'component-level', @@ -379,16 +405,10 @@ describe('ContextManager Evaluation Suite', () => { retainedTokens: 3737, maxTokens: 11280, }, - gcBackstop: { - strategy: 'truncate', - target: 'incremental', - freeTokensTarget: 373, - }, pipelines: [ { name: 'Immediate Sanitization', - triggers: ['on_turn'], - execution: 'blocking', + triggers: ['new_message'], processors: [ { processorId: 'ToolMaskingProcessor', @@ -401,14 +421,14 @@ describe('ContextManager Evaluation Suite', () => { options: {}, }, { - processorId: 'SemanticCompressionProcessor', + processorId: 'NodeDistillationProcessor', options: { nodeThresholdTokens: 733, }, }, { - processorId: 'EmergencyTruncationProcessor', - options: {}, + processorId: 'NodeTruncationProcessor', + options: { maxTokensPerNode: 1000 }, }, ], }, @@ -419,14 +439,14 @@ describe('ContextManager Evaluation Suite', () => { type: 'timer', intervalMs: 100, }, - 'budget_exceeded', + 'retained_exceeded', ], - execution: 'background', processors: [ { - processorId: 'HistorySquashingProcessor', + processorId: 'HistoryTruncationProcessor', options: { - maxTokensPerNode: 327, + target: 'freeNTokens', + freeTokensTarget: 327, }, }, ], @@ -446,263 +466,4 @@ describe('ContextManager Evaluation Suite', () => { ).toBe(true); }, }); - - componentEvalTest('ALWAYS_PASSES', { - suiteName: 'context-manager', - suiteType: 'component-level', - name: 'ContextManager Frozen Case - final-name (Budget: 5321)', - configOverrides: { model: EVAL_MODEL }, - assert: async (config: Config) => { - const scenario = getScenario('scenario-c-compiler'); - const sidecarConfig: SidecarConfig = { - budget: { - retainedTokens: 5321, - maxTokens: 11398, - }, - gcBackstop: { - strategy: 'truncate', - target: 'incremental', - freeTokensTarget: 532, - }, - pipelines: [ - { - name: 'Immediate Sanitization', - triggers: ['on_turn'], - execution: 'blocking', - processors: [ - { - processorId: 'ToolMaskingProcessor', - options: { - stringLengthThresholdTokens: 2952, - }, - }, - { - processorId: 'BlobDegradationProcessor', - options: {}, - }, - { - processorId: 'SemanticCompressionProcessor', - options: { - nodeThresholdTokens: 1632, - }, - }, - { - processorId: 'EmergencyTruncationProcessor', - options: {}, - }, - ], - }, - { - name: 'Deep Background Compression', - triggers: [ - { - type: 'timer', - intervalMs: 100, - }, - 'budget_exceeded', - ], - execution: 'background', - processors: [ - { - processorId: 'HistorySquashingProcessor', - options: { - maxTokensPerNode: 355, - }, - }, - { - processorId: 'StateSnapshotProcessor', - options: {}, - }, - ], - }, - ], - }; - const seed = 826619; - const result = await runContextManagerEval( - config, - sidecarConfig, - seed, - scenario, - ); - expect( - result.pass, - `Recall failed for final-name. Response: ${result.response}`, - ).toBe(true); - }, - }); - - componentEvalTest('ALWAYS_PASSES', { - suiteName: 'context-manager', - suiteType: 'component-level', - name: 'ContextManager Frozen Case - secret-beta (Budget: 1314)', - configOverrides: { model: EVAL_MODEL }, - assert: async (config: Config) => { - const scenario = getScenario('scenario-c-compiler'); - const sidecarConfig: SidecarConfig = { - budget: { - retainedTokens: 1314, - maxTokens: 2067, - }, - gcBackstop: { - strategy: 'truncate', - target: 'incremental', - freeTokensTarget: 131, - }, - pipelines: [ - { - name: 'Immediate Sanitization', - triggers: ['on_turn'], - execution: 'blocking', - processors: [ - { - processorId: 'ToolMaskingProcessor', - options: { - stringLengthThresholdTokens: 738, - }, - }, - { - processorId: 'BlobDegradationProcessor', - options: {}, - }, - { - processorId: 'SemanticCompressionProcessor', - options: { - nodeThresholdTokens: 195, - }, - }, - { - processorId: 'EmergencyTruncationProcessor', - options: {}, - }, - ], - }, - { - name: 'Deep Background Compression', - triggers: [ - { - type: 'timer', - intervalMs: 100, - }, - 'budget_exceeded', - ], - execution: 'background', - processors: [ - { - processorId: 'HistorySquashingProcessor', - options: { - maxTokensPerNode: 108, - }, - }, - { - processorId: 'StateSnapshotProcessor', - options: {}, - }, - ], - }, - ], - }; - const seed = 475216; - const result = await runContextManagerEval( - config, - sidecarConfig, - seed, - scenario, - ); - expect( - result.pass, - `Recall failed for secret-beta. Response: ${result.response}`, - ).toBe(true); - }, - }); - - componentEvalTest('ALWAYS_PASSES', { - suiteName: 'context-manager', - suiteType: 'component-level', - name: 'ContextManager Frozen Case - codegen-header (Budget: 2585)', - configOverrides: { model: EVAL_MODEL }, - timeout: 240000, // 4 minutes to allow Gemini 2.5 Pro to compress even with rate limits/load shedding - assert: async (config: Config) => { - const scenario = getScenario('scenario-c-compiler'); - const targetQuestion = scenario.questions.find( - (q) => q.id === 'codegen-header', - ); - if (targetQuestion) { - targetQuestion.expectedSubstring = 'Generated by GigaC'; - } - const sidecarConfig: SidecarConfig = { - budget: { - retainedTokens: 2585, - maxTokens: 4196, - }, - gcBackstop: { - strategy: 'truncate', - target: 'incremental', - freeTokensTarget: 258, - }, - pipelines: [ - { - name: 'Immediate Sanitization', - triggers: ['on_turn'], - execution: 'blocking', - processors: [ - { - processorId: 'ToolMaskingProcessor', - options: { - stringLengthThresholdTokens: 720, - }, - }, - { - processorId: 'BlobDegradationProcessor', - options: {}, - }, - { - processorId: 'SemanticCompressionProcessor', - options: { - nodeThresholdTokens: 336, - }, - }, - { - processorId: 'EmergencyTruncationProcessor', - options: {}, - }, - ], - }, - { - name: 'Deep Background Compression', - triggers: [ - { - type: 'timer', - intervalMs: 100, - }, - 'budget_exceeded', - ], - execution: 'background', - processors: [ - { - processorId: 'HistorySquashingProcessor', - options: { - maxTokensPerNode: 237, - }, - }, - { - processorId: 'StateSnapshotProcessor', - options: {}, - }, - ], - }, - ], - }; - const seed = 715277; - const result = await runContextManagerEval( - config, - sidecarConfig, - seed, - scenario, - ); - expect( - result.pass, - `Recall failed for codegen-header. Response: ${result.response}`, - ).toBe(true); - }, - }); }); diff --git a/packages/core/src/context/processors/nodeDistillationProcessor.ts b/packages/core/src/context/processors/nodeDistillationProcessor.ts index dedde91050..1b98674323 100644 --- a/packages/core/src/context/processors/nodeDistillationProcessor.ts +++ b/packages/core/src/context/processors/nodeDistillationProcessor.ts @@ -55,7 +55,7 @@ export class NodeDistillationProcessor implements ContextProcessor { try { const response = await this.env.llmClient.generateContent({ role: LlmRole.UTILITY_COMPRESSOR, - modelConfigKey: { model: 'default' }, + modelConfigKey: { model: 'summarizer-default' }, promptId: this.env.promptId, abortSignal: new AbortController().signal, contents: [