feat(core): introduce decoupled ContextManager and Sidecar architecture (#24752)

joshualitt
2026-04-13 15:02:22 -07:00
committed by GitHub
parent 706d4d4707
commit daf5006237
54 changed files with 6454 additions and 0 deletions
@@ -0,0 +1,107 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import type { Part } from '@google/genai';
import { estimateTokenCountSync as baseEstimate } from '../../utils/tokenCalculation.js';
import type { ConcreteNode } from '../graph/types.js';
import type { NodeBehaviorRegistry } from '../graph/behaviorRegistry.js';

/**
 * The flat token cost assigned to a single multi-modal asset (like an image tile)
 * by the Gemini API. We use this as a baseline heuristic for inlineData/fileData.
 */
const MULTIMODAL_PART_TOKEN_ESTIMATE = 258;

/**
 * Sidecar helper that estimates and caches per-node token costs for the
 * context graph.
 */
export class ContextTokenCalculator {
  private readonly tokenCache = new Map<string, number>();

  constructor(
    private readonly charsPerToken: number,
    private readonly registry: NodeBehaviorRegistry,
  ) {}

  /**
   * Estimates tokens for a simple string based on character count.
   * Fast, but inherently inaccurate compared to real model tokenization.
   */
  estimateTokensForString(text: string): number {
    return Math.ceil(text.length / this.charsPerToken);
  }

  /**
   * Fast, simple heuristic conversion from tokens to expected character length.
   * Useful for calculating truncation thresholds.
   */
  tokensToChars(tokens: number): number {
    return tokens * this.charsPerToken;
  }

  /**
   * Removes cached token counts for any nodes that are no longer in the given live set.
   * This prevents unbounded memory growth during long sessions.
   */
  garbageCollectCache(liveNodeIds: ReadonlySet<string>): void {
    for (const [id] of this.tokenCache) {
      if (!liveNodeIds.has(id)) {
        this.tokenCache.delete(id);
      }
    }
  }

  /**
   * Pre-calculates and caches the token cost of a newly minted node.
   * Because nodes are immutable, this cost never changes for this node ID.
   */
  cacheNodeTokens(node: ConcreteNode): number {
    const behavior = this.registry.get(node.type);
    const parts = behavior.getEstimatableParts(node);
    const tokens = this.estimateTokensForParts(parts);
    this.tokenCache.set(node.id, tokens);
    return tokens;
  }

  /**
   * Retrieves the token cost of a single node from the cache.
   * If it misses the cache, it computes it and caches it.
   */
  getTokenCost(node: ConcreteNode): number {
    const cached = this.tokenCache.get(node.id);
    if (cached !== undefined) return cached;
    return this.cacheNodeTokens(node);
  }

  /**
   * Fast calculation for a flat array of ConcreteNodes.
   * Relies on the O(1) sidecar token cache, computing and caching any
   * entries it has not seen before.
   */
  calculateConcreteListTokens(nodes: readonly ConcreteNode[]): number {
    let tokens = 0;
    for (const node of nodes) {
      tokens += this.getTokenCost(node);
    }
    return tokens;
  }

  /**
   * Slower, more detailed estimation for a Gemini Content/Part structure.
   * Inspects each part and combines the result with the base tokenization math.
   */
  estimateTokensForParts(parts: Part[], depth: number = 0): number {
    let totalTokens = 0;
    for (const part of parts) {
      if (typeof part.text === 'string') {
        totalTokens += Math.ceil(part.text.length / this.charsPerToken);
      } else if (part.inlineData !== undefined || part.fileData !== undefined) {
        totalTokens += MULTIMODAL_PART_TOKEN_ESTIMATE;
      } else {
        totalTokens += Math.ceil(
          JSON.stringify(part).length / this.charsPerToken,
        );
      }
    }
    // Also include structural overhead from the base estimate.
    return totalTokens + baseEstimate(parts, depth);
  }
}
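
A minimal usage sketch of the calculator (not part of the diff; registry, userNode, and liveIds are hypothetical stand-ins for objects produced elsewhere in the pipeline):

// Hypothetical wiring: 4 chars/token is an assumed heuristic, not a fixed value.
const calculator = new ContextTokenCalculator(4, registry);
const cost = calculator.getTokenCost(userNode); // computes and caches
const total = calculator.calculateConcreteListTokens([userNode]); // cache hit
// After pruning the graph, drop cache entries for nodes no longer in the live set.
calculator.garbageCollectCache(liveIds);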
@@ -0,0 +1,54 @@
/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import type { ConcreteNode } from '../graph/types.js';
import type { ContextEnvironment } from '../pipeline/environment.js';
import { LlmRole } from '../../telemetry/llmRole.js';

export class SnapshotGenerator {
  constructor(private readonly env: ContextEnvironment) {}

  async synthesizeSnapshot(
    nodes: readonly ConcreteNode[],
    systemInstruction?: string,
  ): Promise<string> {
    const systemPrompt =
      systemInstruction ??
      `You are an expert Context Memory Manager. You will be provided with a raw transcript of older conversation turns between a user and an AI assistant.
Your task is to synthesize these turns into a single, dense, factual snapshot that preserves all critical context, preferences, active tasks, and factual knowledge, but discards conversational filler, pleasantries, and redundant back-and-forth iterations.
Output ONLY the raw factual snapshot, formatted compactly. Do not include markdown wrappers, prefixes like "Here is the snapshot", or conversational elements.`;

    let userPromptText = 'TRANSCRIPT TO SNAPSHOT:\n\n';
    for (const node of nodes) {
      let nodeContent = '';
      if ('text' in node && typeof node.text === 'string') {
        nodeContent = node.text;
      } else if ('semanticParts' in node) {
        nodeContent = JSON.stringify(node.semanticParts);
      } else if ('observation' in node) {
        nodeContent =
          typeof node.observation === 'string'
            ? node.observation
            : JSON.stringify(node.observation);
      }
      userPromptText += `[${node.type}]: ${nodeContent}\n`;
    }

    const response = await this.env.llmClient.generateContent({
      role: LlmRole.UTILITY_STATE_SNAPSHOT_PROCESSOR,
      modelConfigKey: { model: 'gemini-3-flash-base' },
      contents: [{ role: 'user', parts: [{ text: userPromptText }] }],
      systemInstruction: { role: 'system', parts: [{ text: systemPrompt }] },
      promptId: this.env.promptId,
      abortSignal: new AbortController().signal,
    });

    const candidate = response.candidates?.[0];
    const textPart = candidate?.content?.parts?.[0];
    return textPart?.text || '';
  }
}
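
A minimal usage sketch of the generator (hypothetical wiring; env stands in for a configured ContextEnvironment and olderNodes for the turns being compacted, called from inside an async function):

// Compacts older turns into a dense snapshot before trimming them from context.
const generator = new SnapshotGenerator(env);
const snapshot = await generator.synthesizeSnapshot(olderNodes);
// The snapshot text can then replace the summarized nodes in the live context.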