2025-10-27 14:29:39 -07:00
|
|
|
/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
|
|
|
|
|
|
|
|
|
|
import type { Content } from '@google/genai';
|
|
|
|
|
import type { Config } from '../config/config.js';
|
|
|
|
|
import type { GeminiChat } from '../core/geminiChat.js';
|
|
|
|
|
import { type ChatCompressionInfo, CompressionStatus } from '../core/turn.js';
|
|
|
|
|
import { tokenLimit } from '../core/tokenLimits.js';
|
|
|
|
|
import { getCompressionPrompt } from '../core/prompts.js';
|
|
|
|
|
import { getResponseText } from '../utils/partUtils.js';
|
|
|
|
|
import { logChatCompression } from '../telemetry/loggers.js';
|
|
|
|
|
import { makeChatCompressionEvent } from '../telemetry/types.js';
|
2026-01-18 22:40:07 -08:00
|
|
|
import {
|
|
|
|
|
saveTruncatedToolOutput,
|
|
|
|
|
formatTruncatedToolOutput,
|
|
|
|
|
} from '../utils/fileUtils.js';
|
|
|
|
|
import { debugLogger } from '../utils/debugLogger.js';
|
2025-10-27 14:29:39 -07:00
|
|
|
import { getInitialChatHistory } from '../utils/environmentContext.js';
|
2026-01-18 22:40:07 -08:00
|
|
|
import {
|
|
|
|
|
calculateRequestTokenCount,
|
|
|
|
|
estimateTokenCountSync,
|
|
|
|
|
} from '../utils/tokenCalculation.js';
|
2025-11-24 12:24:45 -08:00
|
|
|
import {
|
|
|
|
|
DEFAULT_GEMINI_FLASH_LITE_MODEL,
|
|
|
|
|
DEFAULT_GEMINI_FLASH_MODEL,
|
|
|
|
|
DEFAULT_GEMINI_MODEL,
|
|
|
|
|
PREVIEW_GEMINI_MODEL,
|
2025-12-17 09:43:21 -08:00
|
|
|
PREVIEW_GEMINI_FLASH_MODEL,
|
2025-11-24 12:24:45 -08:00
|
|
|
} from '../config/models.js';
|
2025-12-03 09:04:13 -08:00
|
|
|
import { PreCompressTrigger } from '../hooks/types.js';
|
2025-10-27 14:29:39 -07:00
|
|
|
|
|
|
|
|
/**
 * Default threshold for compression token count as a fraction of the model's
 * token limit. If the chat history exceeds this threshold, it will be compressed.
 *
 * Used only when the configuration does not supply its own threshold.
 */
export const DEFAULT_COMPRESSION_TOKEN_THRESHOLD = 0.5;

/**
 * The fraction of the latest chat history to keep. A value of 0.3
 * means that only the last 30% of the chat history will be kept after compression.
 *
 * The split is computed on serialized character counts, and the complement
 * (1 - this value) is what gets passed to `findCompressSplitPoint`.
 */
export const COMPRESSION_PRESERVE_THRESHOLD = 0.3;

/**
 * The token budget for function responses across the chat history that is
 * processed during compression. Responses that would push the running total
 * past this budget are replaced by a truncated placeholder.
 */
export const COMPRESSION_FUNCTION_RESPONSE_TOKEN_BUDGET = 50_000;

/**
 * The number of lines to keep when truncating a function response during compression.
 */
export const COMPRESSION_TRUNCATE_LINES = 30;
|
|
|
|
|
|
2025-10-27 14:29:39 -07:00
|
|
|
/**
 * Returns the index of the oldest item to keep when compressing. May return
 * contents.length which indicates that everything should be compressed.
 *
 * Sizes are measured as the length of each item's JSON serialization. The
 * split is only ever placed on a user message that carries no function
 * response, so a function call is never separated from its response.
 *
 * Exported for testing purposes.
 *
 * @param contents The chat history to split.
 * @param fraction The share of the history (by serialized size) to compress;
 *   must be strictly between 0 and 1.
 * @returns The index of the first item to keep; 0 means "compress nothing".
 * @throws Error if `fraction` is not strictly between 0 and 1 (NaN included).
 */
export function findCompressSplitPoint(
  contents: Content[],
  fraction: number,
): number {
  // Written as a negated range check so that NaN, for which every comparison
  // is false, is rejected instead of silently disabling the size target.
  if (!(fraction > 0 && fraction < 1)) {
    throw new Error('Fraction must be between 0 and 1');
  }

  const charCounts = contents.map((content) => JSON.stringify(content).length);
  const totalCharCount = charCounts.reduce((a, b) => a + b, 0);
  const targetCharCount = totalCharCount * fraction;

  let lastSplitPoint = 0; // 0 is always valid (compress nothing)
  let cumulativeCharCount = 0;
  for (let i = 0; i < contents.length; i++) {
    const content = contents[i];
    const isPlainUserMessage =
      content.role === 'user' &&
      !content.parts?.some((part) => !!part.functionResponse);

    if (isPlainUserMessage) {
      // Everything before index i already meets the target: split here.
      if (cumulativeCharCount >= targetCharCount) {
        return i;
      }
      lastSplitPoint = i;
    }
    cumulativeCharCount += charCounts[i];
  }

  // We found no split points after targetCharCount.
  // Check if it's safe to compress everything: the history must end on a
  // model message that is not waiting for a function response.
  const lastContent = contents[contents.length - 1];
  if (
    lastContent?.role === 'model' &&
    !lastContent?.parts?.some((part) => part.functionCall)
  ) {
    return contents.length;
  }

  // Can't compress everything so just compress at last splitpoint.
  return lastSplitPoint;
}
|
|
|
|
|
|
2025-11-24 12:24:45 -08:00
|
|
|
/**
 * Maps a model identifier to the model-config alias used for the chat
 * compression (summarization) request.
 *
 * @param model The model identifier of the current chat.
 * @returns The matching `chat-compression-*` alias, or
 *   `'chat-compression-default'` when the model is not one of the known
 *   constants.
 */
export function modelStringToModelConfigAlias(model: string): string {
  switch (model) {
    case PREVIEW_GEMINI_MODEL:
      return 'chat-compression-3-pro';
    case PREVIEW_GEMINI_FLASH_MODEL:
      return 'chat-compression-3-flash';
    case DEFAULT_GEMINI_MODEL:
      return 'chat-compression-2.5-pro';
    case DEFAULT_GEMINI_FLASH_MODEL:
      return 'chat-compression-2.5-flash';
    case DEFAULT_GEMINI_FLASH_LITE_MODEL:
      return 'chat-compression-2.5-flash-lite';
    default:
      return 'chat-compression-default';
  }
}
|
|
|
|
|
|
2026-01-18 22:40:07 -08:00
|
|
|
/**
 * Produces the string form of a function response payload that is measured
 * against the token budget and, if needed, truncated.
 *
 * A string payload is used as-is. For an object payload, a string `output`
 * field is preferred, then a string `content` field; anything else is
 * pretty-printed as JSON.
 */
function functionResponseToText(
  response: Record<string, unknown> | string | null,
): string {
  if (typeof response === 'string') {
    return response;
  }
  if (response && typeof response === 'object') {
    if ('output' in response && typeof response['output'] === 'string') {
      return response['output'];
    }
    if ('content' in response && typeof response['content'] === 'string') {
      return response['content'];
    }
  }
  return JSON.stringify(response, null, 2);
}

/**
 * Processes the chat history to ensure function responses don't exceed a specific token budget.
 *
 * This function implements a "Reverse Token Budget" strategy:
 * 1. It iterates through the history from the most recent turn to the oldest.
 * 2. It keeps a running tally of tokens used by function responses.
 * 3. Recent tool outputs are preserved in full to maintain high-fidelity context for the current turn.
 * 4. Once a response would push the tally past COMPRESSION_FUNCTION_RESPONSE_TOKEN_BUDGET,
 *    its full text is saved through `saveTruncatedToolOutput` (under the project temp dir)
 *    and the response is replaced by the placeholder built by `formatTruncatedToolOutput`
 *    with COMPRESSION_TRUNCATE_LINES lines.
 *
 * This ensures that compression effectively reduces context size even when recent turns
 * contain massive tool outputs (like large grep results or logs).
 *
 * The input history is not mutated: every returned message is a shallow copy
 * with a freshly built `parts` array (empty when the source message had none).
 *
 * @param history The chat history to process.
 * @param config Supplies the truncation id counter and the temp directory.
 * @returns A new history with over-budget function responses truncated.
 */
async function truncateHistoryToBudget(
  history: Content[],
  config: Config,
): Promise<Content[]> {
  let functionResponseTokenCounter = 0;
  const truncatedHistory: Content[] = [];

  // Iterate backwards: newest messages first to prioritize their context.
  for (let i = history.length - 1; i >= 0; i--) {
    const content = history[i];
    const newParts: NonNullable<Content['parts']> = [];
    const parts = content.parts ?? [];

    // Process parts of the message backwards as well.
    for (let j = parts.length - 1; j >= 0; j--) {
      const part = parts[j];
      const functionResponse = part.functionResponse;

      // Non-tool response part: always keep. A tool response without any
      // payload is kept as well: there is nothing to truncate, and
      // JSON.stringify(undefined) would yield undefined rather than a string.
      if (!functionResponse || functionResponse.response === undefined) {
        newParts.unshift(part);
        continue;
      }

      const contentStr = functionResponseToText(functionResponse.response);
      const tokens = estimateTokenCountSync([{ text: contentStr }]);
      const exceedsBudget =
        functionResponseTokenCounter + tokens >
        COMPRESSION_FUNCTION_RESPONSE_TOKEN_BUDGET;

      if (!exceedsBudget) {
        // Within budget: keep the full response.
        functionResponseTokenCounter += tokens;
        newParts.unshift(part);
        continue;
      }

      try {
        // Budget exceeded: Truncate this response.
        const { outputFile } = await saveTruncatedToolOutput(
          contentStr,
          functionResponse.name ?? 'unknown_tool',
          config.getNextCompressionTruncationId(),
          config.storage.getProjectTempDir(),
        );

        // Prepare an honest, readable snippet of the output.
        const truncatedMessage = formatTruncatedToolOutput(
          contentStr,
          outputFile,
          COMPRESSION_TRUNCATE_LINES,
        );
        const truncatedTokens = estimateTokenCountSync([
          { text: truncatedMessage },
        ]);

        // Mutate state only after every call that can throw has succeeded, so
        // the fallback below can never add the same part a second time.
        newParts.unshift({
          functionResponse: {
            ...functionResponse,
            response: { output: truncatedMessage },
          },
        });
        // Count the small truncated placeholder towards the budget.
        functionResponseTokenCounter += truncatedTokens;
      } catch (error) {
        // Fallback: if truncation fails, keep the original part to avoid data loss in the chat.
        debugLogger.debug('Failed to truncate history to budget:', error);
        newParts.unshift(part);
        functionResponseTokenCounter += tokens;
      }
    }

    // Reconstruct the message with processed (potentially truncated) parts.
    truncatedHistory.unshift({ ...content, parts: newParts });
  }

  return truncatedHistory;
}
|
|
|
|
|
|
2025-10-27 14:29:39 -07:00
|
|
|
/**
 * Compresses a chat history by asking a model to summarize its older part
 * and keeping the most recent part (with over-budget tool outputs truncated).
 */
export class ChatCompressionService {
  /**
   * Attempts to compress the curated history of `chat`.
   *
   * @param chat The chat whose curated history and last prompt token count are read.
   * @param promptId Prompt id forwarded to the summarization request.
   * @param force When true, the failed-attempt short circuit and the token
   *   threshold check are skipped, and the PreCompress hook is fired with the
   *   manual trigger instead of the automatic one.
   * @param model Model identifier used for the token limit, the summarization
   *   config alias and the final token count.
   * @param config Source of the hook system, compression threshold, LLM
   *   client and content generator.
   * @param hasFailedCompressionAttempt When true and `force` is false, no
   *   compression is attempted.
   * @returns `newHistory` is the summary exchange followed by the kept
   *   messages, or `null` when nothing was compressed or the result would use
   *   more tokens than before; `info` reports token counts and the status.
   */
  async compress(
    chat: GeminiChat,
    promptId: string,
    force: boolean,
    model: string,
    config: Config,
    hasFailedCompressionAttempt: boolean,
  ): Promise<{ newHistory: Content[] | null; info: ChatCompressionInfo }> {
    // Result for "nothing was done": both counts report the same value.
    const noopResult = (
      tokenCount: number,
    ): { newHistory: null; info: ChatCompressionInfo } => ({
      newHistory: null,
      info: {
        originalTokenCount: tokenCount,
        newTokenCount: tokenCount,
        compressionStatus: CompressionStatus.NOOP,
      },
    });

    const curatedHistory = chat.getHistory(true);

    // Regardless of `force`, don't do anything if the history is empty.
    // A previous failed attempt also stops automatic (non-forced) compression.
    if (
      curatedHistory.length === 0 ||
      (hasFailedCompressionAttempt && !force)
    ) {
      return noopResult(0);
    }

    // Fire PreCompress hook before compression
    // This fires for both manual and auto compression attempts
    const trigger = force ? PreCompressTrigger.Manual : PreCompressTrigger.Auto;
    await config.getHookSystem()?.firePreCompressEvent(trigger);

    const originalTokenCount = chat.getLastPromptTokenCount();

    // Don't compress if not forced and we are under the limit.
    if (!force) {
      const threshold =
        (await config.getCompressionThreshold()) ??
        DEFAULT_COMPRESSION_TOKEN_THRESHOLD;
      if (originalTokenCount < threshold * tokenLimit(model)) {
        return noopResult(originalTokenCount);
      }
    }

    // Apply token-based truncation to the entire history before splitting.
    // This ensures that even the "to compress" portion is within safe limits for the summarization model.
    const truncatedHistory = await truncateHistoryToBudget(
      curatedHistory,
      config,
    );

    const splitPoint = findCompressSplitPoint(
      truncatedHistory,
      1 - COMPRESSION_PRESERVE_THRESHOLD,
    );

    const historyToCompressTruncated = truncatedHistory.slice(0, splitPoint);
    const historyToKeepTruncated = truncatedHistory.slice(splitPoint);

    if (historyToCompressTruncated.length === 0) {
      return noopResult(originalTokenCount);
    }

    // High Fidelity Decision: Should we send the original or truncated history to the summarizer?
    // The untruncated messages are preferred as long as their estimated size
    // stays below the model's token limit.
    const originalHistoryToCompress = curatedHistory.slice(0, splitPoint);
    const originalToCompressTokenCount = estimateTokenCountSync(
      originalHistoryToCompress.flatMap((c) => c.parts || []),
    );

    const historyForSummarizer =
      originalToCompressTokenCount < tokenLimit(model)
        ? originalHistoryToCompress
        : historyToCompressTruncated;

    const summaryResponse = await config.getBaseLlmClient().generateContent({
      modelConfigKey: { model: modelStringToModelConfigAlias(model) },
      contents: [
        ...historyForSummarizer,
        {
          role: 'user',
          parts: [
            {
              text: 'First, reason in your scratchpad. Then, generate the <state_snapshot>.',
            },
          ],
        },
      ],
      systemInstruction: { text: getCompressionPrompt() },
      promptId,
      // TODO(joshualitt): wire up a sensible abort signal,
      abortSignal: new AbortController().signal,
    });
    // NOTE(review): a missing response text becomes an empty summary and the
    // flow continues; confirm whether that case should be reported as a failure.
    const summary = getResponseText(summaryResponse) ?? '';

    const extraHistory: Content[] = [
      {
        role: 'user',
        parts: [{ text: summary }],
      },
      {
        role: 'model',
        parts: [{ text: 'Got it. Thanks for the additional context!' }],
      },
      ...historyToKeepTruncated,
    ];

    // Use a shared utility to construct the initial history for an accurate token count.
    const fullNewHistory = await getInitialChatHistory(config, extraHistory);

    const newTokenCount = await calculateRequestTokenCount(
      fullNewHistory.flatMap((c) => c.parts || []),
      config.getContentGenerator(),
      model,
    );

    logChatCompression(
      config,
      makeChatCompressionEvent({
        tokens_before: originalTokenCount,
        tokens_after: newTokenCount,
      }),
    );

    // Reject the result when "compression" made the prompt larger.
    if (newTokenCount > originalTokenCount) {
      return {
        newHistory: null,
        info: {
          originalTokenCount,
          newTokenCount,
          compressionStatus:
            CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
        },
      };
    }

    return {
      newHistory: extraHistory,
      info: {
        originalTokenCount,
        newTokenCount,
        compressionStatus: CompressionStatus.COMPRESSED,
      },
    };
  }
}
|