diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts index 0cb3c7a8cc..80b1dece50 100644 --- a/packages/core/src/config/config.ts +++ b/packages/core/src/config/config.ts @@ -490,6 +490,7 @@ export class Config { private readonly enablePromptCompletion: boolean = false; private readonly truncateToolOutputThreshold: number; private readonly truncateToolOutputLines: number; + private compressionTruncationCounter = 0; private readonly enableToolOutputTruncation: boolean; private initialized: boolean = false; readonly storage: Storage; @@ -1585,6 +1586,8 @@ export class Config { return this.compressionThreshold; } + await this.ensureExperimentsLoaded(); + const remoteThreshold = this.experiments?.flags[ExperimentFlags.CONTEXT_COMPRESSION_THRESHOLD] ?.floatValue; @@ -1766,6 +1769,10 @@ export class Config { return this.truncateToolOutputLines; } + getNextCompressionTruncationId(): number { + return ++this.compressionTruncationCounter; + } + getUseWriteTodos(): boolean { return this.useWriteTodos; } diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts index 11dc28a998..f9da52be00 100644 --- a/packages/core/src/core/client.test.ts +++ b/packages/core/src/core/client.test.ts @@ -21,7 +21,7 @@ import { type ContentGenerator, type ContentGeneratorConfig, } from './contentGenerator.js'; -import { type GeminiChat } from './geminiChat.js'; +import { GeminiChat } from './geminiChat.js'; import type { Config } from '../config/config.js'; import { CompressionStatus, @@ -49,8 +49,6 @@ import { ClearcutLogger } from '../telemetry/clearcut-logger/clearcut-logger.js' import * as policyCatalog from '../availability/policyCatalog.js'; import { partToString } from '../utils/partUtils.js'; -vi.mock('../services/chatCompressionService.js'); - // Mock fs module to prevent actual file system operations during tests const mockFileSystem = new Map(); @@ -139,6 +137,7 @@ vi.mock('../hooks/hookSystem.js'); const mockHookSystem 
= { fireBeforeAgentEvent: vi.fn().mockResolvedValue(undefined), fireAfterAgentEvent: vi.fn().mockResolvedValue(undefined), + firePreCompressEvent: vi.fn().mockResolvedValue(undefined), }; /** @@ -165,15 +164,6 @@ describe('Gemini Client (client.ts)', () => { ClearcutLogger.clearInstance(); vi.mocked(uiTelemetryService.setLastPromptTokenCount).mockClear(); - vi.mocked(ChatCompressionService.prototype.compress).mockResolvedValue({ - newHistory: null, - info: { - originalTokenCount: 0, - newTokenCount: 0, - compressionStatus: CompressionStatus.NOOP, - }, - }); - mockGenerateContentFn = vi.fn().mockResolvedValue({ candidates: [{ content: { parts: [{ text: '{"key": "value"}' }] } }], }); @@ -246,6 +236,7 @@ describe('Gemini Client (client.ts)', () => { getMessageBus: vi.fn().mockReturnValue(undefined), getEnableHooks: vi.fn().mockReturnValue(false), getChatCompression: vi.fn().mockReturnValue(undefined), + getCompressionThreshold: vi.fn().mockReturnValue(undefined), getSkipNextSpeakerCheck: vi.fn().mockReturnValue(false), getShowModelInfoInChat: vi.fn().mockReturnValue(false), getContinueOnFailedApiCall: vi.fn(), @@ -406,7 +397,7 @@ describe('Gemini Client (client.ts)', () => { { role: 'model', parts: [{ text: 'Got it' }] }, ]; - vi.mocked(ChatCompressionService.prototype.compress).mockResolvedValue({ + vi.spyOn(ChatCompressionService.prototype, 'compress').mockResolvedValue({ newHistory: compressionStatus === CompressionStatus.COMPRESSED ? newHistory @@ -577,7 +568,7 @@ describe('Gemini Client (client.ts)', () => { const MOCKED_TOKEN_LIMIT = 1000; const originalTokenCount = MOCKED_TOKEN_LIMIT * 0.699; - vi.mocked(ChatCompressionService.prototype.compress).mockResolvedValue({ + vi.spyOn(ChatCompressionService.prototype, 'compress').mockResolvedValue({ newHistory: null, info: { originalTokenCount, @@ -1336,10 +1327,10 @@ ${JSON.stringify( }; client['chat'] = mockChat as GeminiChat; - // Remaining = 100. Threshold (95%) = 95. - // We need a request > 95 tokens. 
- // A string of length 400 is roughly 100 tokens. - const longText = 'a'.repeat(400); + // Remaining = 100. + // We need a request > 100 tokens. + // A string of length 404 is roughly 101 tokens. + const longText = 'a'.repeat(404); const request: Part[] = [{ text: longText }]; // estimateTextOnlyLength counts only text content (400 chars), not JSON structure const estimatedRequestTokenCount = Math.floor(longText.length / 4); @@ -1396,9 +1387,9 @@ ${JSON.stringify( }; client['chat'] = mockChat as GeminiChat; - // Remaining (sticky) = 100. Threshold (95%) = 95. - // We need a request > 95 tokens. - const longText = 'a'.repeat(400); + // Remaining (sticky) = 100. + // We need a request > 100 tokens. + const longText = 'a'.repeat(404); const request: Part[] = [{ text: longText }]; // estimateTextOnlyLength counts only text content (400 chars), not JSON structure const estimatedRequestTokenCount = Math.floor(longText.length / 4); @@ -1432,6 +1423,165 @@ ${JSON.stringify( expect(mockTurnRunFn).not.toHaveBeenCalled(); }); + it('should attempt compression before overflow check and proceed if compression frees space', async () => { + // Arrange + const MOCKED_TOKEN_LIMIT = 1000; + vi.mocked(tokenLimit).mockReturnValue(MOCKED_TOKEN_LIMIT); + + // Initial state: 950 tokens used, 50 remaining. + const initialTokenCount = 950; + // Request: 60 tokens. (950 + 60 = 1010 > 1000) -> Would overflow without compression. 
+ const longText = 'a'.repeat(240); // 240 / 4 = 60 tokens + const request: Part[] = [{ text: longText }]; + + // Use the real GeminiChat to manage state and token counts more realistically + const mockChatCompressed = { + getLastPromptTokenCount: vi.fn().mockReturnValue(400), + getHistory: vi + .fn() + .mockReturnValue([{ role: 'user', parts: [{ text: 'old' }] }]), + addHistory: vi.fn(), + getChatRecordingService: vi.fn().mockReturnValue({ + getConversation: vi.fn(), + getConversationFilePath: vi.fn(), + }), + } as unknown as GeminiChat; + + const mockChatInitial = { + getLastPromptTokenCount: vi.fn().mockReturnValue(initialTokenCount), + getHistory: vi + .fn() + .mockReturnValue([{ role: 'user', parts: [{ text: 'old' }] }]), + addHistory: vi.fn(), + getChatRecordingService: vi.fn().mockReturnValue({ + getConversation: vi.fn(), + getConversationFilePath: vi.fn(), + }), + } as unknown as GeminiChat; + + client['chat'] = mockChatInitial; + + // Mock tryCompressChat to simulate successful compression + const tryCompressSpy = vi + .spyOn(client, 'tryCompressChat') + .mockImplementation(async () => { + // In reality, tryCompressChat replaces this.chat + client['chat'] = mockChatCompressed; + return { + originalTokenCount: initialTokenCount, + newTokenCount: 400, + compressionStatus: CompressionStatus.COMPRESSED, + }; + }); + + // Use a manual spy on Turn.prototype.run since Turn is a real class in this test context + // but mocked at the top of the file + mockTurnRunFn.mockImplementation(async function* () { + yield { type: 'content', value: 'Success after compression' }; + }); + + // Act + const stream = client.sendMessageStream( + request, + new AbortController().signal, + 'prompt-id-compression-test', + ); + + const events = await fromAsync(stream); + + // Assert + // 1. Should NOT contain overflow warning + expect(events).not.toContainEqual( + expect.objectContaining({ + type: GeminiEventType.ContextWindowWillOverflow, + }), + ); + + // 2. 
Should contain compression event + expect(events).toContainEqual( + expect.objectContaining({ + type: GeminiEventType.ChatCompressed, + }), + ); + + // 3. Should have called tryCompressChat + expect(tryCompressSpy).toHaveBeenCalled(); + + // 4. Should have called Turn.run (proceeded with the request) + expect(mockTurnRunFn).toHaveBeenCalled(); + }); + + it('should handle massive function responses by truncating them and then yielding overflow warning', async () => { + // Arrange + const MOCKED_TOKEN_LIMIT = 1000; + vi.mocked(tokenLimit).mockReturnValue(MOCKED_TOKEN_LIMIT); + + // History has a large compressible part and a massive function response at the end. + const massiveText = 'a'.repeat(200000); + const history: Content[] = [ + { role: 'user', parts: [{ text: 'a'.repeat(100000) }] }, // compressible part + { role: 'model', parts: [{ text: 'ok' }] }, + { + role: 'model', + parts: [{ functionCall: { name: 'huge_tool', args: {} } }], + }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'huge_tool', + response: { data: massiveText }, + }, + }, + ], + }, + ]; + + const realChat = new GeminiChat(mockConfig, '', [], history); + client['chat'] = realChat; + + // Use a realistic mock for compression that simulates the 40k truncation effect. + // We spy on the instance directly to ensure it intercepts correctly. + const compressSpy = vi + .spyOn(client['compressionService'], 'compress') + .mockResolvedValue({ + newHistory: history, // Keep history large for the overflow check + info: { + originalTokenCount: 50000, + newTokenCount: 10000, // Reduced from 50k but still > 1000 limit + compressionStatus: CompressionStatus.COMPRESSED, + }, + }); + + // The new request + const request: Part[] = [{ text: 'next question' }]; + + // Act + const stream = client.sendMessageStream( + request, + new AbortController().signal, + 'prompt-id-massive-test', + ); + + const events = await fromAsync(stream); + + // Assert + // 1. 
Should have attempted compression + expect(compressSpy).toHaveBeenCalled(); + + // 2. Should yield overflow warning because 10000 > 1000 limit. + expect(events).toContainEqual( + expect.objectContaining({ + type: GeminiEventType.ContextWindowWillOverflow, + value: expect.objectContaining({ + estimatedRequestTokenCount: expect.any(Number), + remainingTokenCount: expect.any(Number), + }), + }), + ); + }); + it('should not trigger overflow warning for requests with large binary data (PDFs/images)', async () => { // Arrange const MOCKED_TOKEN_LIMIT = 1000000; // 1M tokens diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts index 73445da67b..1310dfb445 100644 --- a/packages/core/src/core/client.ts +++ b/packages/core/src/core/client.ts @@ -535,6 +535,15 @@ export class GeminiClient { // Check for context window overflow const modelForLimitCheck = this._getActiveModelForCurrentTurn(); + const compressed = await this.tryCompressChat(prompt_id, false); + + if (compressed.compressionStatus === CompressionStatus.COMPRESSED) { + yield { type: GeminiEventType.ChatCompressed, value: compressed }; + } + + const remainingTokenCount = + tokenLimit(modelForLimitCheck) - this.getChat().getLastPromptTokenCount(); + // Estimate tokens. For text-only requests, we estimate based on character length. // For requests with non-text parts (like images, tools), we use the countTokens API. 
const estimatedRequestTokenCount = await calculateRequestTokenCount( @@ -543,10 +552,7 @@ export class GeminiClient { modelForLimitCheck, ); - const remainingTokenCount = - tokenLimit(modelForLimitCheck) - this.getChat().getLastPromptTokenCount(); - - if (estimatedRequestTokenCount > remainingTokenCount * 0.95) { + if (estimatedRequestTokenCount > remainingTokenCount) { yield { type: GeminiEventType.ContextWindowWillOverflow, value: { estimatedRequestTokenCount, remainingTokenCount }, @@ -554,12 +560,6 @@ export class GeminiClient { return turn; } - const compressed = await this.tryCompressChat(prompt_id, false); - - if (compressed.compressionStatus === CompressionStatus.COMPRESSED) { - yield { type: GeminiEventType.ChatCompressed, value: compressed }; - } - // Prevent context updates from being sent while a tool call is // waiting for a response. The Gemini API requires that a functionResponse // part from the user immediately follows a functionCall part from the model diff --git a/packages/core/src/services/chatCompressionService.test.ts b/packages/core/src/services/chatCompressionService.test.ts index 38c7dba65d..728f8e79b9 100644 --- a/packages/core/src/services/chatCompressionService.test.ts +++ b/packages/core/src/services/chatCompressionService.test.ts @@ -12,12 +12,15 @@ import { } from './chatCompressionService.js'; import type { Content, GenerateContentResponse } from '@google/genai'; import { CompressionStatus } from '../core/turn.js'; -import { tokenLimit } from '../core/tokenLimits.js'; import type { GeminiChat } from '../core/geminiChat.js'; import type { Config } from '../config/config.js'; +import * as fileUtils from '../utils/fileUtils.js'; import { getInitialChatHistory } from '../utils/environmentContext.js'; +import * as tokenCalculation from '../utils/tokenCalculation.js'; +import os from 'node:os'; +import path from 'node:path'; +import fs from 'node:fs'; -vi.mock('../core/tokenLimits.js'); vi.mock('../telemetry/loggers.js'); 
vi.mock('../utils/environmentContext.js'); @@ -128,10 +131,14 @@ describe('ChatCompressionService', () => { let service: ChatCompressionService; let mockChat: GeminiChat; let mockConfig: Config; + let testTempDir: string; const mockModel = 'gemini-2.5-pro'; const mockPromptId = 'test-prompt-id'; beforeEach(() => { + testTempDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'chat-compression-test-'), + ); service = new ChatCompressionService(); mockChat = { getHistory: vi.fn(), @@ -160,9 +167,12 @@ describe('ChatCompressionService', () => { getEnableHooks: vi.fn().mockReturnValue(false), getMessageBus: vi.fn().mockReturnValue(undefined), getHookSystem: () => undefined, + getNextCompressionTruncationId: vi.fn().mockReturnValue(1), + storage: { + getProjectTempDir: vi.fn().mockReturnValue(testTempDir), + }, } as unknown as Config; - vi.mocked(tokenLimit).mockReturnValue(1000); vi.mocked(getInitialChatHistory).mockImplementation( async (_config, extraHistory) => extraHistory || [], ); @@ -170,6 +180,9 @@ describe('ChatCompressionService', () => { afterEach(() => { vi.restoreAllMocks(); + if (fs.existsSync(testTempDir)) { + fs.rmSync(testTempDir, { recursive: true, force: true }); + } }); it('should return NOOP if history is empty', async () => { @@ -206,9 +219,8 @@ describe('ChatCompressionService', () => { vi.mocked(mockChat.getHistory).mockReturnValue([ { role: 'user', parts: [{ text: 'hi' }] }, ]); - vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(600); - vi.mocked(tokenLimit).mockReturnValue(1000); - // Threshold is 0.7 * 1000 = 700. 600 < 700, so NOOP. + vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(1000); + // Real token limit is ~1M, threshold 0.5. 1000 < 500k, so NOOP. 
const result = await service.compress( mockChat, @@ -230,8 +242,8 @@ describe('ChatCompressionService', () => { { role: 'model', parts: [{ text: 'msg4' }] }, ]; vi.mocked(mockChat.getHistory).mockReturnValue(history); - vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(800); - vi.mocked(tokenLimit).mockReturnValue(1000); + vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(600000); + // 600k > 500k (0.5 * 1M), so should compress. const result = await service.compress( mockChat, @@ -257,7 +269,6 @@ describe('ChatCompressionService', () => { ]; vi.mocked(mockChat.getHistory).mockReturnValue(history); vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(100); - vi.mocked(tokenLimit).mockReturnValue(1000); const result = await service.compress( mockChat, @@ -278,8 +289,7 @@ describe('ChatCompressionService', () => { { role: 'model', parts: [{ text: 'msg2' }] }, ]; vi.mocked(mockChat.getHistory).mockReturnValue(history); - vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(10); - vi.mocked(tokenLimit).mockReturnValue(1000); + vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(100); const longSummary = 'a'.repeat(1000); // Long summary to inflate token count vi.mocked(mockConfig.getBaseLlmClient().generateContent).mockResolvedValue({ @@ -292,10 +302,10 @@ describe('ChatCompressionService', () => { ], } as unknown as GenerateContentResponse); - // Override mock to simulate high token count for this specific test - vi.mocked(mockConfig.getContentGenerator().countTokens).mockResolvedValue({ - totalTokens: 10000, - }); + // Inflate the token count by spying on calculateRequestTokenCount + vi.spyOn(tokenCalculation, 'calculateRequestTokenCount').mockResolvedValue( + 10000, + ); const result = await service.compress( mockChat, @@ -311,4 +321,380 @@ describe('ChatCompressionService', () => { ); expect(result.newHistory).toBeNull(); }); + + describe('Reverse Token Budget Truncation', () => { + it('should truncate older function 
responses when budget is exceeded', async () => { + vi.mocked(mockConfig.getCompressionThreshold).mockResolvedValue(0.5); + vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(600000); + + // Large response part that exceeds budget (40k tokens). + // Heuristic is roughly chars / 4, so 170k chars should exceed it. + const largeResponse = 'a'.repeat(170000); + + const history: Content[] = [ + { role: 'user', parts: [{ text: 'old msg' }] }, + { role: 'model', parts: [{ text: 'old resp' }] }, + // History to keep + { role: 'user', parts: [{ text: 'msg 1' }] }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'grep', + response: { content: largeResponse }, + }, + }, + ], + }, + { role: 'model', parts: [{ text: 'resp 2' }] }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'grep', + response: { content: largeResponse }, + }, + }, + ], + }, + ]; + + vi.mocked(mockChat.getHistory).mockReturnValue(history); + + const result = await service.compress( + mockChat, + mockPromptId, + true, + mockModel, + mockConfig, + false, + ); + + expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); + + // Verify the new history contains the truncated message + const keptHistory = result.newHistory!.slice(2); // After summary and 'Got it' + const truncatedPart = keptHistory[1].parts![0].functionResponse; + expect(truncatedPart?.response?.['output']).toContain( + 'Output too large.', + ); + + // Verify a file was actually created + const files = fs.readdirSync(testTempDir); + expect(files.length).toBeGreaterThan(0); + expect(files[0]).toMatch(/grep_.*\.txt/); + }); + + it('should correctly handle massive single-line strings inside JSON by using multi-line Elephant Line logic', async () => { + vi.mocked(mockConfig.getCompressionThreshold).mockResolvedValue(0.5); + vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(600000); + + // 170,000 chars on a single line to exceed budget + const massiveSingleLine = 'a'.repeat(170000); 
+ + const history: Content[] = [ + { role: 'user', parts: [{ text: 'old msg 1' }] }, + { role: 'model', parts: [{ text: 'old resp 1' }] }, + { role: 'user', parts: [{ text: 'old msg 2' }] }, + { role: 'model', parts: [{ text: 'old resp 2' }] }, + { role: 'user', parts: [{ text: 'old msg 3' }] }, + { role: 'model', parts: [{ text: 'old resp 3' }] }, + { role: 'user', parts: [{ text: 'msg 1' }] }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'shell', + response: { output: massiveSingleLine }, + }, + }, + ], + }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'shell', + response: { output: massiveSingleLine }, + }, + }, + ], + }, + ]; + + vi.mocked(mockChat.getHistory).mockReturnValue(history); + + const result = await service.compress( + mockChat, + mockPromptId, + true, + mockModel, + mockConfig, + false, + ); + + // Verify it compressed + expect(result.newHistory).not.toBeNull(); + // Find the shell response in the kept history (the older one was truncated) + const keptHistory = result.newHistory!.slice(2); // after summary and 'Got it' + const shellResponse = keptHistory.find( + (h) => + h.parts?.some((p) => p.functionResponse?.name === 'shell') && + (h.parts?.[0].functionResponse?.response?.['output'] as string) + ?.length < 100000, + ); + const truncatedPart = shellResponse!.parts![0].functionResponse; + const content = truncatedPart?.response?.['output'] as string; + + expect(content).toContain( + 'Output too large. 
Showing the last 4,000 characters of the output.', + ); + // It's a single line, so NO [LINE WIDTH TRUNCATED] + }); + + it('should use character-based truncation for massive single-line raw strings', async () => { + vi.mocked(mockConfig.getCompressionThreshold).mockResolvedValue(0.5); + vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(600000); + + const massiveRawString = 'c'.repeat(170000); + + const history: Content[] = [ + { role: 'user', parts: [{ text: 'old msg 1' }] }, + { role: 'model', parts: [{ text: 'old resp 1' }] }, + { role: 'user', parts: [{ text: 'old msg 2' }] }, + { role: 'model', parts: [{ text: 'old resp 2' }] }, + { role: 'user', parts: [{ text: 'msg 1' }] }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'raw_tool', + response: { content: massiveRawString }, + }, + }, + ], + }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'raw_tool', + response: { content: massiveRawString }, + }, + }, + ], + }, + ]; + + vi.mocked(mockChat.getHistory).mockReturnValue(history); + + const result = await service.compress( + mockChat, + mockPromptId, + true, + mockModel, + mockConfig, + false, + ); + + expect(result.newHistory).not.toBeNull(); + const keptHistory = result.newHistory!.slice(2); + const rawResponse = keptHistory.find( + (h) => + h.parts?.some((p) => p.functionResponse?.name === 'raw_tool') && + (h.parts?.[0].functionResponse?.response?.['output'] as string) + ?.length < 100000, + ); + const truncatedPart = rawResponse!.parts![0].functionResponse; + const content = truncatedPart?.response?.['output'] as string; + + expect(content).toContain( + 'Output too large. 
Showing the last 4,000 characters of the output.', + ); + }); + + it('should fallback to original content and still update budget if truncation fails', async () => { + vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(600000); + + const largeResponse = 'd'.repeat(170000); + const history: Content[] = [ + { role: 'user', parts: [{ text: 'old msg 1' }] }, + { role: 'model', parts: [{ text: 'old resp 1' }] }, + { role: 'user', parts: [{ text: 'old msg 2' }] }, + { role: 'model', parts: [{ text: 'old resp 2' }] }, + { role: 'user', parts: [{ text: 'msg 1' }] }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'grep', + response: { content: largeResponse }, + }, + }, + ], + }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'grep', + response: { content: largeResponse }, + }, + }, + ], + }, + ]; + + vi.mocked(mockChat.getHistory).mockReturnValue(history); + + // Simulate failure in saving the truncated output + vi.spyOn(fileUtils, 'saveTruncatedToolOutput').mockRejectedValue( + new Error('Disk Full'), + ); + + const result = await service.compress( + mockChat, + mockPromptId, + true, + mockModel, + mockConfig, + false, + ); + + expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); + + // Verify the new history contains the ORIGINAL message (not truncated) + const keptHistory = result.newHistory!.slice(2); + const toolResponseTurn = keptHistory.find((h) => + h.parts?.some((p) => p.functionResponse?.name === 'grep'), + ); + const preservedPart = toolResponseTurn!.parts![0].functionResponse; + expect(preservedPart?.response).toEqual({ content: largeResponse }); + }); + + it('should use high-fidelity original history for summarization when under the limit, but truncated version for active window', async () => { + // Large response in the "to compress" section (first message) + // 300,000 chars is ~75k tokens, well under the 1,000,000 summarizer limit. 
+ const massiveText = 'a'.repeat(300000); + const history: Content[] = [ + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'grep', + response: { content: massiveText }, + }, + }, + ], + }, + // More history to ensure the first message is in the "to compress" group + { role: 'user', parts: [{ text: 'msg 2' }] }, + { role: 'model', parts: [{ text: 'resp 2' }] }, + { role: 'user', parts: [{ text: 'preserved msg' }] }, + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'massive_preserved', + response: { content: massiveText }, + }, + }, + ], + }, + ]; + + vi.mocked(mockChat.getHistory).mockReturnValue(history); + vi.mocked(mockChat.getLastPromptTokenCount).mockReturnValue(600000); + + const result = await service.compress( + mockChat, + mockPromptId, + true, + mockModel, + mockConfig, + false, + ); + + expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); + + // 1. Verify that the summary was generated from the ORIGINAL high-fidelity history + const generateContentCall = vi.mocked( + mockConfig.getBaseLlmClient().generateContent, + ).mock.calls[0][0]; + const historySentToSummarizer = generateContentCall.contents; + + const summarizerGrepResponse = + historySentToSummarizer[0].parts![0].functionResponse; + // Should be original content because total tokens < 1M + expect(summarizerGrepResponse?.response).toEqual({ + content: massiveText, + }); + + // 2. 
Verify that the PRESERVED history (the active window) IS truncated + const keptHistory = result.newHistory!.slice(2); // Skip summary + ack + const preservedToolTurn = keptHistory.find((h) => + h.parts?.some((p) => p.functionResponse?.name === 'massive_preserved'), + ); + const preservedPart = preservedToolTurn!.parts![0].functionResponse; + expect(preservedPart?.response?.['output']).toContain( + 'Output too large.', + ); + }); + + it('should fall back to truncated history for summarization when original is massive (>1M tokens)', async () => { + // 5,000,000 chars is ~1.25M tokens, exceeding the 1M limit. + const superMassiveText = 'a'.repeat(5000000); + const history: Content[] = [ + { + role: 'user', + parts: [ + { + functionResponse: { + name: 'grep', + response: { content: superMassiveText }, + }, + }, + ], + }, + { role: 'user', parts: [{ text: 'msg 2' }] }, + { role: 'model', parts: [{ text: 'resp 2' }] }, + ]; + + vi.mocked(mockChat.getHistory).mockReturnValue(history); + + const result = await service.compress( + mockChat, + mockPromptId, + true, + mockModel, + mockConfig, + false, + ); + + expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED); + + // Verify that the summary was generated from the TRUNCATED history + const generateContentCall = vi.mocked( + mockConfig.getBaseLlmClient().generateContent, + ).mock.calls[0][0]; + const historySentToSummarizer = generateContentCall.contents; + + const summarizerGrepResponse = + historySentToSummarizer[0].parts![0].functionResponse; + // Should be truncated because original > 1M tokens + expect(summarizerGrepResponse?.response?.['output']).toContain( + 'Output too large.', + ); + }); + }); }); diff --git a/packages/core/src/services/chatCompressionService.ts b/packages/core/src/services/chatCompressionService.ts index c76e937750..5a08ed9d3d 100644 --- a/packages/core/src/services/chatCompressionService.ts +++ b/packages/core/src/services/chatCompressionService.ts @@ -13,8 +13,16 @@ import { 
getCompressionPrompt } from '../core/prompts.js';
 import { getResponseText } from '../utils/partUtils.js';
 import { logChatCompression } from '../telemetry/loggers.js';
 import { makeChatCompressionEvent } from '../telemetry/types.js';
+import {
+  saveTruncatedToolOutput,
+  formatTruncatedToolOutput,
+} from '../utils/fileUtils.js';
+import { debugLogger } from '../utils/debugLogger.js';
 import { getInitialChatHistory } from '../utils/environmentContext.js';
-import { calculateRequestTokenCount } from '../utils/tokenCalculation.js';
+import {
+  calculateRequestTokenCount,
+  estimateTokenCountSync,
+} from '../utils/tokenCalculation.js';
 import {
   DEFAULT_GEMINI_FLASH_LITE_MODEL,
   DEFAULT_GEMINI_FLASH_MODEL,
@@ -36,6 +44,16 @@ export const DEFAULT_COMPRESSION_TOKEN_THRESHOLD = 0.5;
  */
 export const COMPRESSION_PRESERVE_THRESHOLD = 0.3;
 
+/**
+ * Total estimated-token budget for function responses kept in full during compression.
+ */
+export const COMPRESSION_FUNCTION_RESPONSE_TOKEN_BUDGET = 50_000;
+
+/**
+ * The number of lines to keep when truncating a function response during compression.
+ */
+export const COMPRESSION_TRUNCATE_LINES = 30;
+
 /**
  * Returns the index of the oldest item to keep when compressing. May return
  * contents.length which indicates that everything should be compressed.
@@ -101,6 +119,143 @@ export function modelStringToModelConfigAlias(model: string): string {
   }
 }
 
+/**
+ * Renders a function response payload as one string so it can be
+ * token-counted and truncated.
+ *
+ * Strings are returned unchanged. For objects, a string `output` field is
+ * preferred, then a string `content` field. Anything else is pretty-printed
+ * as JSON.
+ *
+ * @param response The `functionResponse.response` payload; may be absent.
+ * @returns The text representation, or `''` when there is nothing to render.
+ */
+function functionResponseToString(response: unknown): string {
+  if (typeof response === 'string') {
+    return response;
+  }
+  if (typeof response === 'object' && response !== null) {
+    for (const key of ['output', 'content']) {
+      const value: unknown = Reflect.get(response, key);
+      if (typeof value === 'string') {
+        return value;
+      }
+    }
+  }
+  // JSON.stringify yields `undefined` (not a string) for an `undefined`
+  // payload, so normalize that case instead of leaking it to callers.
+  return JSON.stringify(response, null, 2) ?? '';
+}
+
+/**
+ * Caps the tokens spent on function responses across the chat history.
+ *
+ * Implements a "Reverse Token Budget" strategy:
+ * 1. Walk the history from the newest message to the oldest (and each
+ *    message's parts from last to first).
+ * 2. Keep a running tally of the estimated tokens used by function responses.
+ * 3. Keep a response in full while adding it stays within
+ *    COMPRESSION_FUNCTION_RESPONSE_TOKEN_BUDGET.
+ * 4. Otherwise save the full text to a file in the project temp dir and
+ *    replace the response with the snippet returned by
+ *    formatTruncatedToolOutput (called with COMPRESSION_TRUNCATE_LINES). Only
+ *    the snippet is charged to the budget, so older responses that still fit
+ *    are kept in full. A single response larger than the whole budget is
+ *    truncated even when it is the newest one.
+ *
+ * If saving fails, the original part is kept and charged in full. The input
+ * array and its messages are not mutated.
+ *
+ * @param history Curated chat history, oldest message first.
+ * @param config Supplies the truncation id counter and temp-dir storage.
+ * @returns A new history array in the original order.
+ */
+async function truncateHistoryToBudget(
+  history: Content[],
+  config: Config,
+): Promise<Content[]> {
+  let functionResponseTokenCounter = 0;
+  // Collected newest-first and reversed once at the end; repeated unshift()
+  // would make the rebuild quadratic.
+  const newestFirstHistory: Content[] = [];
+
+  for (let i = history.length - 1; i >= 0; i--) {
+    const content = history[i];
+    if (!content.parts) {
+      // Nothing to inspect: keep the message as-is rather than adding an
+      // empty `parts` array it never had.
+      newestFirstHistory.push(content);
+      continue;
+    }
+
+    const newestFirstParts: NonNullable<Content['parts']> = [];
+    for (let j = content.parts.length - 1; j >= 0; j--) {
+      const part = content.parts[j];
+      const functionResponse = part.functionResponse;
+      if (!functionResponse) {
+        // Non-tool response part: always keep.
+        newestFirstParts.push(part);
+        continue;
+      }
+
+      const contentStr = functionResponseToString(functionResponse.response);
+      const tokens = estimateTokenCountSync([{ text: contentStr }]);
+
+      if (
+        functionResponseTokenCounter + tokens <=
+        COMPRESSION_FUNCTION_RESPONSE_TOKEN_BUDGET
+      ) {
+        // Within budget: keep the full response.
+        functionResponseTokenCounter += tokens;
+        newestFirstParts.push(part);
+        continue;
+      }
+
+      try {
+        // Budget exceeded: persist the full text, keep only a snippet.
+        // Awaited in sequence because the tally for older parts depends on
+        // the size of this snippet.
+        const { outputFile } = await saveTruncatedToolOutput(
+          contentStr,
+          functionResponse.name ?? 'unknown_tool',
+          config.getNextCompressionTruncationId(),
+          config.storage.getProjectTempDir(),
+        );
+        const truncatedMessage = formatTruncatedToolOutput(
+          contentStr,
+          outputFile,
+          COMPRESSION_TRUNCATE_LINES,
+        );
+
+        newestFirstParts.push({
+          functionResponse: {
+            ...functionResponse,
+            response: { output: truncatedMessage },
+          },
+        });
+
+        // Charge only the small snippet to the budget.
+        functionResponseTokenCounter += estimateTokenCountSync([
+          { text: truncatedMessage },
+        ]);
+      } catch (error) {
+        // Fallback: if truncation fails, keep the original part to avoid
+        // data loss in the chat.
+        debugLogger.debug('Failed to truncate history to budget:', error);
+        newestFirstParts.push(part);
+        functionResponseTokenCounter += tokens;
+      }
+    }
+
+    newestFirstHistory.push({
+      ...content,
+      parts: newestFirstParts.reverse(),
+    });
+  }
+
+  return newestFirstHistory.reverse();
+}
+
 export class ChatCompressionService {
   async compress(
     chat: GeminiChat,
@@ -151,15 +306,22 @@ export class ChatCompressionService {
       }
     }
 
-    const splitPoint = findCompressSplitPoint(
+    // Apply token-based truncation to the entire history before splitting.
+    // This ensures that even the "to compress" portion is within safe limits for the summarization model.
+    const truncatedHistory = await truncateHistoryToBudget(
       curatedHistory,
+      config,
+    );
+
+    const splitPoint = findCompressSplitPoint(
+      truncatedHistory,
       1 - COMPRESSION_PRESERVE_THRESHOLD,
     );
 
-    const historyToCompress = curatedHistory.slice(0, splitPoint);
-    const historyToKeep = curatedHistory.slice(splitPoint);
+    const historyToCompressTruncated = truncatedHistory.slice(0, splitPoint);
+    const historyToKeepTruncated = truncatedHistory.slice(splitPoint);
 
-    if (historyToCompress.length === 0) {
+    if (historyToCompressTruncated.length === 0) {
       return {
         newHistory: null,
         info: {
@@ -170,10 +332,21 @@ export class ChatCompressionService {
       };
     }
 
+    // High Fidelity Decision: Should we send the original or truncated history to the summarizer?
+    const originalHistoryToCompress = curatedHistory.slice(0, splitPoint);
+    const originalToCompressTokenCount = estimateTokenCountSync(
+      originalHistoryToCompress.flatMap((c) => c.parts || []),
+    );
+
+    const historyForSummarizer =
+      originalToCompressTokenCount < tokenLimit(model)
+        ? originalHistoryToCompress
+        : historyToCompressTruncated;
+
     const summaryResponse = await config.getBaseLlmClient().generateContent({
       modelConfigKey: { model: modelStringToModelConfigAlias(model) },
       contents: [
-        ...historyToCompress,
+        ...historyForSummarizer,
         {
           role: 'user',
           parts: [
@@ -199,7 +372,7 @@ export class ChatCompressionService {
         role: 'model',
         parts: [{ text: 'Got it. Thanks for the additional context!' }],
       },
-      ...historyToKeep,
+      ...historyToKeepTruncated,
     ];
 
     // Use a shared utility to construct the initial history for an accurate token count.
diff --git a/packages/core/src/utils/fileUtils.test.ts b/packages/core/src/utils/fileUtils.test.ts index bc27cd6e56..7d792217b1 100644 --- a/packages/core/src/utils/fileUtils.test.ts +++ b/packages/core/src/utils/fileUtils.test.ts @@ -1123,9 +1123,9 @@ describe('fileUtils', () => { const formatted = formatTruncatedToolOutput(content, outputFile); expect(formatted).toContain( - 'Output too large. Showing the last 10,000 characters', + 'Output too large. Showing the last 4,000 characters', ); - expect(formatted.endsWith(content.slice(-10000))).toBe(true); + expect(formatted.endsWith(content.slice(-4000))).toBe(true); }); }); }); diff --git a/packages/core/src/utils/fileUtils.ts b/packages/core/src/utils/fileUtils.ts index ec2370404c..5525f98d06 100644 --- a/packages/core/src/utils/fileUtils.ts +++ b/packages/core/src/utils/fileUtils.ts @@ -517,7 +517,7 @@ export async function fileExists(filePath: string): Promise { } const MAX_TRUNCATED_LINE_WIDTH = 1000; -const MAX_TRUNCATED_CHARS = 10000; +const MAX_TRUNCATED_CHARS = 4000; /** * Formats a truncated message for tool output, handling multi-line and single-line (elephant) cases.