From 4d1ca92a19fc51d120b4e22c5d713b4f6a702f67 Mon Sep 17 00:00:00 2001 From: Aishanee Shah Date: Mon, 4 May 2026 16:31:20 -0400 Subject: [PATCH] fix(core): filter unsupported multimodal types from tool responses (#26352) --- packages/core/src/core/geminiChat.test.ts | 148 ++++++++++++++++++ packages/core/src/core/geminiChat.ts | 53 ++++++- .../generateContentResponseUtilities.test.ts | 51 ++++++ .../utils/generateContentResponseUtilities.ts | 63 +++++++- 4 files changed, 307 insertions(+), 8 deletions(-) diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts index e719878ff0..1a54821f52 100644 --- a/packages/core/src/core/geminiChat.test.ts +++ b/packages/core/src/core/geminiChat.test.ts @@ -38,6 +38,7 @@ import * as policyHelpers from '../availability/policyHelpers.js'; import { makeResolvedModelConfig } from '../services/modelConfigServiceTestUtils.js'; import type { HookSystem } from '../hooks/hookSystem.js'; import { LlmRole } from '../telemetry/types.js'; +import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js'; // Mock fs module to prevent actual file system operations during tests const mockFileSystem = new Map(); @@ -2575,6 +2576,153 @@ describe('GeminiChat', () => { }); }); + describe('automated binary injection', () => { + it('should expand history with synthetic turns when __binary_injection__ is detected', async () => { + const audioParts = [ + { + functionResponse: { + id: 'call-123', + name: 'read_file', + response: { + output: 'Success', + [BINARY_INJECTION_KEY]: [ + { inlineData: { mimeType: 'audio/mpeg', data: 'base64' } }, + ], + }, + }, + }, + ]; + + // Mock API to capture the history it receives + let capturedContents: Content[] = []; + vi.mocked(mockContentGenerator.generateContentStream).mockImplementation( + async (req) => { + capturedContents = req.contents as Content[]; + return (async function* () { + yield { + candidates: [ + { + content: { parts: [{ text: 'Analysis done' }] }, + finishReason: 'STOP', + }, + ], + } as unknown as GenerateContentResponse; + })(); + }, + ); + + const stream = await chat.sendMessageStream( + { model: 'gemini-pro' }, + audioParts, + 'test-id', + new AbortController().signal, + LlmRole.MAIN, + ); + + for await (const _ of stream) { + // No-op + } + + // Verify history expansion + // Turn 1: Tool response (cleaned) + // Turn 2: Model Ack (synthetic) + // Turn 3: User Binary data (current request) + expect(capturedContents).toHaveLength(3); + expect(capturedContents[0].role).toBe('user'); + expect(capturedContents[0].parts![0].functionResponse!.response).toEqual({ + output: 'Success', + }); + expect(capturedContents[1].role).toBe('model'); + expect(capturedContents[1].parts![0].text).toContain( + 'Binary content received', + ); + expect(capturedContents[1].parts![0].thoughtSignature).toBe( + SYNTHETIC_THOUGHT_SIGNATURE, + ); + expect(capturedContents[2].role).toBe('user'); + expect(capturedContents[2].parts![0].inlineData!.mimeType).toBe( + 'audio/mpeg', + ); + }); + + it('should handle multiple parallel binary injections', async () => { + const parallelParts = [ + { + functionResponse: { + id: 'call-1', + name: 'read_file', + response: { + output: 'Success 1', + [BINARY_INJECTION_KEY]: [ + { inlineData: { mimeType: 'audio/mpeg', data: 'audio1' } }, + ], + }, + }, + }, + { + functionResponse: { + id: 'call-2', + name: 'read_file', + response: { + output: 'Success 2', + [BINARY_INJECTION_KEY]: [ + { inlineData: { mimeType: 'video/mp4', data: 'video2' } }, + ], + }, + }, + }, + ]; + + let capturedContents: Content[] = []; + vi.mocked(mockContentGenerator.generateContentStream).mockImplementation( + async (req) => { + capturedContents = req.contents as Content[]; + return (async function* () { + yield { + candidates: [ + { + content: { parts: [{ text: 'Done' }] }, + finishReason: 'STOP', + }, + ], + } as unknown as GenerateContentResponse; + })(); + }, + ); + + const stream = await chat.sendMessageStream( + { model: 'gemini-pro' }, + parallelParts, + 'test-id', + new AbortController().signal, + LlmRole.MAIN, + ); + + for await (const _ of stream) { + // No-op + } + + // Turn 1: Cleaned tool responses (both) + // Turn 2: Model Ack + // Turn 3: Both binary parts combined + expect(capturedContents).toHaveLength(3); + expect(capturedContents[0].parts).toHaveLength(2); + expect(capturedContents[0].parts![0].functionResponse!.response).toEqual({ + output: 'Success 1', + }); + expect(capturedContents[0].parts![1].functionResponse!.response).toEqual({ + output: 'Success 2', + }); + expect(capturedContents[2].parts).toHaveLength(2); + expect(capturedContents[2].parts![0].inlineData!.mimeType).toBe( + 'audio/mpeg', + ); + expect(capturedContents[2].parts![1].inlineData!.mimeType).toBe( + 'video/mp4', + ); + }); + }); + describe('recordCompletedToolCalls', () => { it('should use originalRequestName and originalRequestArgs if present', () => { const completedCall: CompletedToolCall = { diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts index 289172a88e..16006ad160 100644 --- a/packages/core/src/core/geminiChat.ts +++ b/packages/core/src/core/geminiChat.ts @@ -50,6 +50,7 @@ import { handleFallback } from '../fallback/handler.js'; import { isFunctionResponse } from '../utils/messageInspectors.js'; import { scrubHistory } from '../utils/historyHardening.js'; import { partListUnionToString } from './geminiRequest.js'; +import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js'; import type { ModelConfigKey } from '../services/modelConfigService.js'; import { estimateTokenCountSync } from '../utils/tokenCalculation.js'; import { @@ -336,7 +337,7 @@ export class GeminiChat { }); this.sendPromise = streamDonePromise; - const userContent = createUserContent(message); + let userContent = createUserContent(message); const { model } = this.context.config.modelConfigService.getResolvedConfig(modelConfigKey); @@ -366,6 +367,30 @@ export class GeminiChat { } // Add user content to history ONCE before any attempts. + const binaryInjections = this.extractBinaryInjections(userContent.parts); + if (binaryInjections) { + // Turn 1: The original tool response (now cleaned) + this.agentHistory.push(userContent); + + // Turn 2: Synthetic Model Acknowledgment + this.agentHistory.push({ + role: 'model', + parts: [ + { + text: 'Binary content received. Proceeding with analysis.', + thought: true, + thoughtSignature: SYNTHETIC_THOUGHT_SIGNATURE, + }, + ], + }); + + // Turn 3: The actual binary data (becomes the current request message) + userContent = { + role: 'user', + parts: binaryInjections, + }; + } + this.agentHistory.push(userContent); const requestContents = this.getHistory(true); @@ -510,6 +535,32 @@ export class GeminiChat { return streamWithRetries.call(this); } + private extractBinaryInjections( + parts: Part[] | undefined, + ): Part[] | undefined { + if (!parts) { + return undefined; + } + + const binaryInjections: Part[] = []; + + for (const part of parts) { + const response = part.functionResponse?.response; + + if (response && BINARY_INJECTION_KEY in response) { + // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion + const binaryParts = response[BINARY_INJECTION_KEY] as Part[]; + delete response[BINARY_INJECTION_KEY]; + + if (Array.isArray(binaryParts)) { + binaryInjections.push(...binaryParts); + } + } + } + + return binaryInjections.length > 0 ? binaryInjections : undefined; + } + private async makeApiCallAndProcessStream( modelConfigKey: ModelConfigKey, requestContents: readonly Content[], diff --git a/packages/core/src/utils/generateContentResponseUtilities.test.ts b/packages/core/src/utils/generateContentResponseUtilities.test.ts index 179144964e..5b86a3a630 100644 --- a/packages/core/src/utils/generateContentResponseUtilities.test.ts +++ b/packages/core/src/utils/generateContentResponseUtilities.test.ts @@ -158,6 +158,57 @@ describe('generateContentResponseUtilities', () => { ]); }); + it('should filter out audio/video MIME types and add a minimal system note (generic tool)', () => { + const llmContent: PartListUnion = [ + { text: 'Some text' }, + { inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' } }, + ]; + + const result = convertToFunctionResponse( + 'other_tool', + callId, + llmContent, + PREVIEW_GEMINI_MODEL, + ); + + const frPart = result.find((p) => p.functionResponse); + const response: Record = {}; + if (frPart?.functionResponse?.response) { + Object.assign(response, frPart.functionResponse.response); + } + const output = response['output'] as string; + expect(output).toContain( + '[SYSTEM: Binary content (audio/mpeg) stripped from response due to protocol limitations.]', + ); + expect(output).not.toContain('__binary_injection__'); + }); + + it('should use the __binary_injection__ flag for read_file and read_many_files tools', () => { + const llmContent: PartListUnion = [ + { text: 'Reading audio' }, + { inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' } }, + ]; + + for (const tool of ['read_file', 'read_many_files']) { + const result = convertToFunctionResponse( + tool, + callId, + llmContent, + PREVIEW_GEMINI_MODEL, + ); + + const frPart = result.find((p) => p.functionResponse); + const response: Record = {}; + if (frPart?.functionResponse?.response) { + Object.assign(response, frPart.functionResponse.response); + } + expect(response['output']).toContain('read successfully'); + expect(response['__binary_injection__']).toBeDefined(); + const injection = response['__binary_injection__'] as Part[]; + expect(injection[0].inlineData?.mimeType).toBe('audio/mpeg'); + } + }); + it('should handle llmContent with fileData for Gemini 3 model (should be siblings)', () => { const llmContent: Part = { fileData: { mimeType: 'application/pdf', fileUri: 'gs://...' }, diff --git a/packages/core/src/utils/generateContentResponseUtilities.ts b/packages/core/src/utils/generateContentResponseUtilities.ts index 3b27dd372f..d5a4e7d6ed 100644 --- a/packages/core/src/utils/generateContentResponseUtilities.ts +++ b/packages/core/src/utils/generateContentResponseUtilities.ts @@ -15,6 +15,8 @@ import { supportsMultimodalFunctionResponse } from '../config/models.js'; import { debugLogger } from './debugLogger.js'; import type { Config } from '../config/config.js'; +export const BINARY_INJECTION_KEY = '__binary_injection__'; + /** * Formats tool output for a Gemini FunctionResponse. */ @@ -89,6 +91,43 @@ export function convertToFunctionResponse( // Ignore other part types } + // build a list of unsupported MIME types for function responses + const filteredInlineDataParts: Part[] = []; + const unsupportedInlineDataParts: Part[] = []; + + for (const part of inlineDataParts) { + const mimeType = part.inlineData?.mimeType; + if ( + mimeType && + (mimeType.startsWith('audio/') || mimeType.startsWith('video/')) + ) { + unsupportedInlineDataParts.push(part); + } else { + filteredInlineDataParts.push(part); + } + } + + if (unsupportedInlineDataParts.length > 0) { + const uniqueMimes = Array.from( + new Set( + unsupportedInlineDataParts.map((p) => p.inlineData?.mimeType ?? ''), + ), + ).join(', '); + + const isReadFileTool = + toolName === 'read_file' || toolName === 'read_many_files'; + + if (isReadFileTool) { + textParts.unshift( + `Binary content (${uniqueMimes}) read successfully. Content will be injected for analysis in the next sequence.`, + ); + } else { + textParts.unshift( + `[SYSTEM: Binary content (${uniqueMimes}) stripped from response due to protocol limitations.]`, + ); + } + } + // Build the primary response part const part: Part = { functionResponse: { @@ -98,30 +137,40 @@ export function convertToFunctionResponse( }, }; + const isReadFileTool = + toolName === 'read_file' || toolName === 'read_many_files'; + + if (unsupportedInlineDataParts.length > 0 && isReadFileTool) { + if (part.functionResponse) { + Object.assign(part.functionResponse.response!, { + [BINARY_INJECTION_KEY]: unsupportedInlineDataParts, + }); + } + } + const isMultimodalFRSupported = supportsMultimodalFunctionResponse( model, config, ); const siblingParts: Part[] = [...fileDataParts]; - if (inlineDataParts.length > 0) { + if (filteredInlineDataParts.length > 0) { if (isMultimodalFRSupported) { // Nest inlineData if supported by the model - // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion - (part.functionResponse as unknown as { parts: Part[] }).parts = - inlineDataParts; + Object.assign(part.functionResponse!, { parts: filteredInlineDataParts }); } else { // Otherwise treat as siblings - siblingParts.push(...inlineDataParts); + siblingParts.push(...filteredInlineDataParts); } } // Add descriptive text if the response object is empty but we have binary content if ( textParts.length === 0 && - (inlineDataParts.length > 0 || fileDataParts.length > 0) + (filteredInlineDataParts.length > 0 || fileDataParts.length > 0) ) { - const totalBinaryItems = inlineDataParts.length + fileDataParts.length; + const totalBinaryItems = + filteredInlineDataParts.length + fileDataParts.length; part.functionResponse!.response = { output: `Binary content provided (${totalBinaryItems} item(s)).`, };