fix(core): Resolve context loss and token calculator thrashing

This commit addresses the massive context loss and 0% utilization issue by fixing two related bugs in the context management system and adding a dampening safeguard: 1. **Masked Tool Retention Bug:** Removed the brittle 'in-flight' protection logic from `getProtectedNodeIds`. Previously, this logic failed to match `MASKED_TOOL` nodes, causing it to perpetually hoard `functionCall` nodes inside the protected list. This hoarding rapidly exhausted the context budget, forcing the GC backstop into an aggressive death spiral. Now, only `recent_turn` nodes and explicit tasks are protected, allowing standard GC to naturally prune old tool calls (and the `HistoryHardener` handles the rest). 2. **Adaptive Calculator Overhead Bug:** The `AdaptiveTokenCalculator` was calculating ratio drift by dividing the API's `actualTokens` (which includes the static overhead of the System Instruction and Tools) by the graph's `baseUnits` (which only counts history). We now inject a `getOverheadTokens` callback during initialization, allowing the calculator to subtract the overhead from the Gemini API count before dividing by the base units, achieving an accurate apples-to-apples ratio. 3. **Violent Oscillation Dampening:** Introduced strict dampening bounds (a `maxStep` limit and a hard `targetWeight` clamp) to the Adaptive Token Calculator's EMA. This acts like a learning-rate limit in machine learning, ensuring that a single massive outlier reading cannot violently thrash the `learnedWeight` multiplier in a single turn. Includes new test coverage verifying the maxStep logic and overhead subtraction. Also removed an unsafe `as unknown` cast in the `snapshotGenerator` that failed `eslint`.
2026-05-15 06:12:50 -07:00 · 2026-05-14 23:26:02 +00:00
parent 665b5ca6c7
commit 26602f97fb
5 changed files with 157 additions and 28 deletions
@@ -147,11 +147,11 @@ export class ContextManager {
      const agedOutNodes = new Set<string>();
      let rollingTokens = 0;

-      // Identify active tool calls that must NEVER be truncated
+      // Identify nodes that must NEVER be truncated
      const protectedIds = this.getProtectedNodeIds(this.buffer.nodes);
      if (protectedIds.size > 0) {
        debugLogger.log(
-          `[ContextManager] Pinning ${protectedIds.size} active tool call nodes to prevent truncation.`,
+          `[ContextManager] Pinning ${protectedIds.size} nodes (recent_turn or external_active_task) to prevent truncation.`,
        );
      }

@@ -138,7 +138,8 @@ describe('AdaptiveTokenCalculator', () => {
    // Simulate an event where the API reported 100 tokens, and our base units were 100
    // But overhead is 40.
    // actualGraphTokens = 100 - 40 = 60
-    // targetWeight = 60 / 100 = 0.6
+    // rawTargetWeight = 60 / 100 = 0.6
+    // targetWeight = Math.max(0.5, 0.6) = 0.6
    // newWeight = 1.0 * 0.8 + 0.6 * 0.2 = 0.8 + 0.12 = 0.92
    eventBus.emitTokenGroundTruth({
      actualTokens: 100,
@@ -147,4 +148,43 @@ describe('AdaptiveTokenCalculator', () => {

    expect(calculator.getLearnedWeight()).toBeCloseTo(0.92, 5);
  });
+
+  it('should enforce the maxStep limit to prevent violent oscillation from massive outliers', () => {
+    const eventBus = new ContextEventBus();
+    const maxStep = 0.05; // Tight limit
+    const calculator = new AdaptiveTokenCalculator(
+      charsPerToken,
+      registry,
+      eventBus,
+      undefined,
+      { maxStep },
+    );
+
+    // Initial state: weight = 1.0
+
+    // Simulate a massive outlier where the API reports 10,000 tokens for 100 base units.
+    // rawTargetWeight = 100
+    // targetWeight = Math.min(100, 1.0 * 2.0) = 2.0
+    // emaWeight = 1.0 * 0.8 + 2.0 * 0.2 = 1.2
+    // BUT maxStep is 0.05, so the actual step is clamped.
+    // finalWeight = 1.0 + 0.05 = 1.05
+    eventBus.emitTokenGroundTruth({
+      actualTokens: 10000,
+      promptBaseUnits: 100,
+    });
+
+    expect(calculator.getLearnedWeight()).toBeCloseTo(1.05, 5);
+
+    // Simulate a massive outlier in the opposite direction: the API reports 0 tokens for 100 base units
+    // rawTargetWeight = 0
+    // targetWeight = Math.max(0, 1.05 * 0.5) = 0.525
+    // emaWeight = 1.05 * 0.8 + 0.525 * 0.2 = 0.84 + 0.105 = 0.945
+    // BUT maxStep is 0.05, so step is clamped: 1.05 - 0.05 = 1.0
+    eventBus.emitTokenGroundTruth({
+      actualTokens: 0,
+      promptBaseUnits: 100,
+    });
+
+    expect(calculator.getLearnedWeight()).toBeCloseTo(1.0, 5);
+  });
 });
@@ -14,6 +14,13 @@ import type { NodeBehaviorRegistry } from '../graph/behaviorRegistry.js';
 import type { ContextEventBus, TokenGroundTruthEvent } from '../eventBus.js';
 import { debugLogger } from '../../utils/debugLogger.js';

+export interface AdaptiveLearningConfig {
+  /** The smoothing factor for the Exponential Moving Average (EMA): the fraction of each new target weight blended into the learned weight. Defaults to 0.2. */
+  learningRate?: number;
+  /** The absolute maximum change allowed to the weight in a single turn. Defaults to 0.15. */
+  maxStep?: number;
+}
+
 /**
 * An Adaptive Token Calculator that dynamically learns the true token cost of the user's
 * conversation by applying an Exponential Moving Average (EMA) gradient descent to
@@ -26,13 +33,18 @@ import { debugLogger } from '../../utils/debugLogger.js';
 export class AdaptiveTokenCalculator implements AdvancedTokenCalculator {
  private learnedWeight = 1.0;
  private readonly baseCalculator: StaticTokenCalculator;
+  private readonly learningRate: number;
+  private readonly maxStep: number;

  constructor(
    charsPerToken: number,
    registry: NodeBehaviorRegistry,
    eventBus: ContextEventBus,
    private readonly getOverheadTokens?: () => number,
+    config?: AdaptiveLearningConfig,
  ) {
+    this.learningRate = config?.learningRate ?? 0.2;
+    this.maxStep = config?.maxStep ?? 0.15;
    this.baseCalculator = new StaticTokenCalculator(charsPerToken, registry);
    eventBus.onTokenGroundTruth((event: TokenGroundTruthEvent) => {
      this.handleGroundTruth(event.actualTokens, event.promptBaseUnits);
@@ -53,20 +65,33 @@ export class AdaptiveTokenCalculator implements AdvancedTokenCalculator {
    const actualGraphTokens = Math.max(0, actualTokens - overheadTokens);

    // Determine what ratio we should have used
-    const targetWeight = actualGraphTokens / promptBaseUnits;
+    const rawTargetWeight = actualGraphTokens / promptBaseUnits;
    const oldWeight = this.learnedWeight;

-    // Apply Momentum (Learning Rate)
-    const learningRate = 0.2;
-    const newWeight =
-      oldWeight * (1 - learningRate) + targetWeight * learningRate;
+    // Dampen extreme outliers *before* applying the EMA by capping the target weight
+    // to a reasonable multiple of the current weight. This prevents a single massive
+    // anomaly from destroying the running average.
+    const targetWeight = Math.max(
+      oldWeight * 0.5,
+      Math.min(rawTargetWeight, oldWeight * 2.0),
+    );

-    // Clamp to reasonable safety bounds to prevent rogue metadata poisoning the system
+    // Apply Momentum (Learning Rate)
+    let newWeight =
+      oldWeight * (1 - this.learningRate) + targetWeight * this.learningRate;
+
+    // Hard limit the maximum step size per turn to prevent violent oscillation
+    if (newWeight > oldWeight + this.maxStep)
+      newWeight = oldWeight + this.maxStep;
+    if (newWeight < oldWeight - this.maxStep)
+      newWeight = oldWeight - this.maxStep;
+
+    // Clamp to reasonable absolute safety bounds
    this.learnedWeight = Math.max(0.5, Math.min(newWeight, 2.0));

    debugLogger.log(
      `[AdaptiveTokenCalculator] Learned weight updated to ${this.learnedWeight.toFixed(3)} ` +
-        `(API Tokens: ${actualTokens}, Base Units: ${promptBaseUnits}, Target Ratio: ${targetWeight.toFixed(3)})`,
+        `(API Tokens: ${actualTokens}, Overhead: ${overheadTokens}, Graph Tokens: ${actualGraphTokens}, Base Units: ${promptBaseUnits}, Target Ratio: ${targetWeight.toFixed(3)})`,
    );
  }

@@ -41,6 +41,7 @@ import { makeResolvedModelConfig } from '../services/modelConfigServiceTestUtils
 import type { HookSystem } from '../hooks/hookSystem.js';
 import { LlmRole } from '../telemetry/types.js';
 import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js';
+import type { ResumedSessionData } from '../services/chatRecordingTypes.js';

 // Mock fs module to prevent actual file system operations during tests
 const mockFileSystem = new Map<string, string>();
@@ -250,6 +251,60 @@ describe('GeminiChat', () => {
      const chatEmpty = new GeminiChat(mockConfig);
      expect(chatEmpty.getLastPromptTokenCount()).toBe(0);
    });
+
+    it('should prioritize in-memory history over resumedSessionData', () => {
+      // This test simulates a "hot restart" after a context management operation
+      // like compression, where the in-memory history is shorter and more up-to-date
+      // than the session data that might be on disk.
+
+      // 1. A stale, longer history from a persisted session record
+      const resumedSessionData = {
+        conversation: {
+          messages: [
+            {
+              id: 'a',
+              type: 'user',
+              content: [{ text: 'turn 1' }],
+              create_time: new Date(),
+            },
+            {
+              id: 'b',
+              type: 'gemini',
+              content: [{ text: 'turn 2' }],
+              create_time: new Date(),
+            },
+            {
+              id: 'c',
+              type: 'user',
+              content: [{ text: 'turn 3' }],
+              create_time: new Date(),
+            },
+          ],
+        },
+      } as unknown as ResumedSessionData;
+
+      // 2. A fresh, compressed in-memory history
+      const compressedHistory: HistoryTurn[] = [
+        {
+          id: 'summary-1',
+          content: { role: 'user', parts: [{ text: 'summary of turns 1-3' }] },
+        },
+      ];
+
+      // 3. Instantiate the chat, providing both.
+      const chat = new GeminiChat(
+        mockConfig,
+        '',
+        [],
+        compressedHistory, // This should be prioritized
+        resumedSessionData, // This should be ignored
+      );
+
+      // 4. Assert that the shorter, in-memory history was used.
+      const finalHistory = chat.getHistoryTurns();
+      expect(finalHistory).toHaveLength(1);
+      expect(finalHistory[0].id).toBe('summary-1');
+    });
  });

  describe('setHistory', () => {
@@ -283,24 +283,33 @@ export class GeminiChat {
    private readonly onModelChanged?: (modelId: string) => Promise<Tool[]>,
  ) {
    validateHistory(history);
-    const initialHistory: HistoryTurn[] = resumedSessionData
-      ? resumedSessionData.conversation.messages
-          .filter((m) => m.type === 'user' || m.type === 'gemini')
-          .map((m) => ({
-            id: m.id,
-            content: {
-              role: m.type === 'user' ? 'user' : 'model',
-              parts: Array.isArray(m.content)
-                ? // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
-                  (m.content as Part[])
-                : [{ text: String(m.content) }],
-            },
-          }))
-      : history.map((item) =>
-          'id' in item && 'content' in item
-            ? item
-            : { id: randomUUID(), content: item },
-        );
+
+    let initialHistory: HistoryTurn[];
+    // If history is passed, it is the most up-to-date in-memory state and takes precedence.
+    // This is critical for hot-restarts after operations like context compression.
+    if (history.length > 0) {
+      initialHistory = history.map((item) =>
+        'id' in item && 'content' in item
+          ? item
+          : { id: randomUUID(), content: item },
+      );
+    } else if (resumedSessionData) {
+      // Otherwise, if resuming from disk, build from the persisted record.
+      initialHistory = resumedSessionData.conversation.messages
+        .filter((m) => m.type === 'user' || m.type === 'gemini')
+        .map((m) => ({
+          id: m.id,
+          content: {
+            role: m.type === 'user' ? 'user' : 'model',
+            parts: Array.isArray(m.content)
+              ? // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
+                (m.content as Part[])
+              : [{ text: String(m.content) }],
+          },
+        }));
+    } else {
+      initialHistory = [];
+    }

    this.agentHistory = new AgentChatHistory(initialHistory);
    this.chatRecordingService = new ChatRecordingService(context);