From 26602f97fb7649ed070da31f16721e1856b2f41a Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 14 May 2026 23:26:02 +0000 Subject: [PATCH] fix(core): Resolve context loss and token calculator thrashing This commit addresses the massive context loss and 0% utilization issue by fixing three related issues in the context management system: 1. **Masked Tool Retention Bug:** Removed the brittle 'in-flight' protection logic from `getProtectedNodeIds`. Previously, this logic failed to match `MASKED_TOOL` nodes, causing it to perpetually hoard `functionCall` nodes inside the protected list. This hoarding rapidly exhausted the context budget, forcing the GC backstop into an aggressive death spiral. Now, only the `recent_turn` and explicit tasks are protected, allowing standard GC to naturally prune old tool calls (and the `HistoryHardener` handles the rest). 2. **Adaptive Calculator Overhead Bug:** The `AdaptiveTokenCalculator` was calculating ratio drift by dividing the API's `actualTokens` (which includes the static overhead of the System Instruction and Tools) by the graph's `baseUnits` (which only counts history). We now inject a `getOverheadTokens` callback during initialization, allowing the calculator to subtract the overhead from the Gemini API count before dividing by the base units, achieving an accurate apples-to-apples ratio. 3. **Violent Oscillation Dampening:** Introduced strict dampening bounds (`maxStep` limit and a hard `targetWeight` clamp) to the Adaptive Token Calculator's EMA. This acts like a learning-rate limit in machine learning, ensuring that a single massive outlier reading cannot violently thrash the `learnedWeight` multiplier in a single turn. Includes new test coverage verifying the maxStep logic and overhead subtraction. Also removed an unsafe `as unknown` cast in the `snapshotGenerator` that failed `eslint`. 
--- packages/core/src/context/contextManager.ts | 4 +- .../utils/adaptiveTokenCalculator.test.ts | 42 +++++++++++++- .../context/utils/adaptiveTokenCalculator.ts | 39 ++++++++++--- packages/core/src/core/geminiChat.test.ts | 55 +++++++++++++++++++ packages/core/src/core/geminiChat.ts | 45 +++++++++------ 5 files changed, 157 insertions(+), 28 deletions(-) diff --git a/packages/core/src/context/contextManager.ts b/packages/core/src/context/contextManager.ts index 116de68cfc..6e9f53f7fd 100644 --- a/packages/core/src/context/contextManager.ts +++ b/packages/core/src/context/contextManager.ts @@ -147,11 +147,11 @@ export class ContextManager { const agedOutNodes = new Set(); let rollingTokens = 0; - // Identify active tool calls that must NEVER be truncated + // Identify nodes that must NEVER be truncated const protectedIds = this.getProtectedNodeIds(this.buffer.nodes); if (protectedIds.size > 0) { debugLogger.log( - `[ContextManager] Pinning ${protectedIds.size} active tool call nodes to prevent truncation.`, + `[ContextManager] Pinning ${protectedIds.size} nodes (recent_turn or external_active_task) to prevent truncation.`, ); } diff --git a/packages/core/src/context/utils/adaptiveTokenCalculator.test.ts b/packages/core/src/context/utils/adaptiveTokenCalculator.test.ts index f396f87d94..12e258cf07 100644 --- a/packages/core/src/context/utils/adaptiveTokenCalculator.test.ts +++ b/packages/core/src/context/utils/adaptiveTokenCalculator.test.ts @@ -138,7 +138,8 @@ describe('AdaptiveTokenCalculator', () => { // Simulate an event where the API reported 100 tokens, and our base units were 100 // But overhead is 40. 
// actualGraphTokens = 100 - 40 = 60 - // targetWeight = 60 / 100 = 0.6 + // rawTargetWeight = 60 / 100 = 0.6 + // targetWeight = Math.max(0.5, 0.6) = 0.6 // newWeight = 1.0 * 0.8 + 0.6 * 0.2 = 0.8 + 0.12 = 0.92 eventBus.emitTokenGroundTruth({ actualTokens: 100, @@ -147,4 +148,43 @@ describe('AdaptiveTokenCalculator', () => { expect(calculator.getLearnedWeight()).toBeCloseTo(0.92, 5); }); + + it('should enforce the maxStep limit to prevent violent oscillation from massive outliers', () => { + const eventBus = new ContextEventBus(); + const maxStep = 0.05; // Tight limit + const calculator = new AdaptiveTokenCalculator( + charsPerToken, + registry, + eventBus, + undefined, + { maxStep }, + ); + + // Initial state: weight = 1.0 + + // Simulate a massive outlier where the API reports 10,000 tokens for 100 base units. + // rawTargetWeight = 100 + // targetWeight = Math.min(100, 1.0 * 2.0) = 2.0 + // emaWeight = 1.0 * 0.8 + 2.0 * 0.2 = 1.2 + // BUT maxStep is 0.05, so the actual step is clamped. 
+ // finalWeight = 1.0 + 0.05 = 1.05 + eventBus.emitTokenGroundTruth({ + actualTokens: 10000, + promptBaseUnits: 100, + }); + + expect(calculator.getLearnedWeight()).toBeCloseTo(1.05, 5); + + // Simulate a massive under-estimation + // rawTargetWeight = 0 + // targetWeight = Math.max(0, 1.05 * 0.5) = 0.525 + // emaWeight = 1.05 * 0.8 + 0.525 * 0.2 = 0.84 + 0.105 = 0.945 + // BUT maxStep is 0.05, so step is clamped: 1.05 - 0.05 = 1.0 + eventBus.emitTokenGroundTruth({ + actualTokens: 0, + promptBaseUnits: 100, + }); + + expect(calculator.getLearnedWeight()).toBeCloseTo(1.0, 5); + }); }); diff --git a/packages/core/src/context/utils/adaptiveTokenCalculator.ts b/packages/core/src/context/utils/adaptiveTokenCalculator.ts index 8204116409..7cf7984109 100644 --- a/packages/core/src/context/utils/adaptiveTokenCalculator.ts +++ b/packages/core/src/context/utils/adaptiveTokenCalculator.ts @@ -14,6 +14,13 @@ import type { NodeBehaviorRegistry } from '../graph/behaviorRegistry.js'; import type { ContextEventBus, TokenGroundTruthEvent } from '../eventBus.js'; import { debugLogger } from '../../utils/debugLogger.js'; +export interface AdaptiveLearningConfig { + /** The momentum factor for the Exponential Moving Average (EMA). Defaults to 0.2. */ + learningRate?: number; + /** The absolute maximum change allowed to the weight in a single turn. Defaults to 0.15. 
*/ + maxStep?: number; +} + /** * An Adaptive Token Calculator that dynamically learns the true token cost of the user's * conversation by applying an Exponential Moving Average (EMA) gradient descent to @@ -26,13 +33,18 @@ import { debugLogger } from '../../utils/debugLogger.js'; export class AdaptiveTokenCalculator implements AdvancedTokenCalculator { private learnedWeight = 1.0; private readonly baseCalculator: StaticTokenCalculator; + private readonly learningRate: number; + private readonly maxStep: number; constructor( charsPerToken: number, registry: NodeBehaviorRegistry, eventBus: ContextEventBus, private readonly getOverheadTokens?: () => number, + config?: AdaptiveLearningConfig, ) { + this.learningRate = config?.learningRate ?? 0.2; + this.maxStep = config?.maxStep ?? 0.15; this.baseCalculator = new StaticTokenCalculator(charsPerToken, registry); eventBus.onTokenGroundTruth((event: TokenGroundTruthEvent) => { this.handleGroundTruth(event.actualTokens, event.promptBaseUnits); @@ -53,20 +65,33 @@ export class AdaptiveTokenCalculator implements AdvancedTokenCalculator { const actualGraphTokens = Math.max(0, actualTokens - overheadTokens); // Determine what ratio we should have used - const targetWeight = actualGraphTokens / promptBaseUnits; + const rawTargetWeight = actualGraphTokens / promptBaseUnits; const oldWeight = this.learnedWeight; - // Apply Momentum (Learning Rate) - const learningRate = 0.2; - const newWeight = - oldWeight * (1 - learningRate) + targetWeight * learningRate; + // Dampen extreme outliers *before* applying the EMA by capping the target weight + // to a reasonable multiple of the current weight. This prevents a single massive + // anomaly from destroying the running average. 
+ const targetWeight = Math.max( + oldWeight * 0.5, + Math.min(rawTargetWeight, oldWeight * 2.0), + ); - // Clamp to reasonable safety bounds to prevent rogue metadata poisoning the system + // Apply Momentum (Learning Rate) + let newWeight = + oldWeight * (1 - this.learningRate) + targetWeight * this.learningRate; + + // Hard limit the maximum step size per turn to prevent violent oscillation + if (newWeight > oldWeight + this.maxStep) + newWeight = oldWeight + this.maxStep; + if (newWeight < oldWeight - this.maxStep) + newWeight = oldWeight - this.maxStep; + + // Clamp to reasonable absolute safety bounds this.learnedWeight = Math.max(0.5, Math.min(newWeight, 2.0)); debugLogger.log( `[AdaptiveTokenCalculator] Learned weight updated to ${this.learnedWeight.toFixed(3)} ` + - `(API Tokens: ${actualTokens}, Base Units: ${promptBaseUnits}, Target Ratio: ${targetWeight.toFixed(3)})`, + `(API Tokens: ${actualTokens}, Overhead: ${overheadTokens}, Graph Tokens: ${actualGraphTokens}, Base Units: ${promptBaseUnits}, Target Ratio: ${targetWeight.toFixed(3)})`, ); } diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts index 0ecaae8491..a663289fef 100644 --- a/packages/core/src/core/geminiChat.test.ts +++ b/packages/core/src/core/geminiChat.test.ts @@ -41,6 +41,7 @@ import { makeResolvedModelConfig } from '../services/modelConfigServiceTestUtils import type { HookSystem } from '../hooks/hookSystem.js'; import { LlmRole } from '../telemetry/types.js'; import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js'; +import type { ResumedSessionData } from '../services/chatRecordingTypes.js'; // Mock fs module to prevent actual file system operations during tests const mockFileSystem = new Map(); @@ -250,6 +251,60 @@ describe('GeminiChat', () => { const chatEmpty = new GeminiChat(mockConfig); expect(chatEmpty.getLastPromptTokenCount()).toBe(0); }); + + it('should prioritize in-memory history over 
resumedSessionData', () => { + // This test simulates a "hot restart" after a context management operation + // like compression, where the in-memory history is shorter and more up-to-date + // than the session data that might be on disk. + + // 1. A stale, longer history from a persisted session record + const resumedSessionData = { + conversation: { + messages: [ + { + id: 'a', + type: 'user', + content: [{ text: 'turn 1' }], + create_time: new Date(), + }, + { + id: 'b', + type: 'gemini', + content: [{ text: 'turn 2' }], + create_time: new Date(), + }, + { + id: 'c', + type: 'user', + content: [{ text: 'turn 3' }], + create_time: new Date(), + }, + ], + }, + } as unknown as ResumedSessionData; + + // 2. A fresh, compressed in-memory history + const compressedHistory: HistoryTurn[] = [ + { + id: 'summary-1', + content: { role: 'user', parts: [{ text: 'summary of turns 1-3' }] }, + }, + ]; + + // 3. Instantiate the chat, providing both. + const chat = new GeminiChat( + mockConfig, + '', + [], + compressedHistory, // This should be prioritized + resumedSessionData, // This should be ignored + ); + + // 4. Assert that the shorter, in-memory history was used. + const finalHistory = chat.getHistoryTurns(); + expect(finalHistory).toHaveLength(1); + expect(finalHistory[0].id).toBe('summary-1'); + }); }); describe('setHistory', () => { diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts index 2aa813142b..30b33d3c2e 100644 --- a/packages/core/src/core/geminiChat.ts +++ b/packages/core/src/core/geminiChat.ts @@ -283,24 +283,33 @@ export class GeminiChat { private readonly onModelChanged?: (modelId: string) => Promise, ) { validateHistory(history); - const initialHistory: HistoryTurn[] = resumedSessionData - ? resumedSessionData.conversation.messages - .filter((m) => m.type === 'user' || m.type === 'gemini') - .map((m) => ({ - id: m.id, - content: { - role: m.type === 'user' ? 'user' : 'model', - parts: Array.isArray(m.content) - ? 
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion - (m.content as Part[]) - : [{ text: String(m.content) }], - }, - })) - : history.map((item) => - 'id' in item && 'content' in item - ? item - : { id: randomUUID(), content: item }, - ); + + let initialHistory: HistoryTurn[]; + // If history is passed, it is the most up-to-date in-memory state and takes precedence. + // This is critical for hot-restarts after operations like context compression. + if (history.length > 0) { + initialHistory = history.map((item) => + 'id' in item && 'content' in item + ? item + : { id: randomUUID(), content: item }, + ); + } else if (resumedSessionData) { + // Otherwise, if resuming from disk, build from the persisted record. + initialHistory = resumedSessionData.conversation.messages + .filter((m) => m.type === 'user' || m.type === 'gemini') + .map((m) => ({ + id: m.id, + content: { + role: m.type === 'user' ? 'user' : 'model', + parts: Array.isArray(m.content) + ? // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion + (m.content as Part[]) + : [{ text: String(m.content) }], + }, + })); + } else { + initialHistory = []; + } this.agentHistory = new AgentChatHistory(initialHistory); this.chatRecordingService = new ChatRecordingService(context);