fix(core): filter unsupported multimodal types from tool responses (#26352)

This commit is contained in:
Aishanee Shah
2026-05-04 16:31:20 -04:00
committed by GitHub
parent b6fc583b0c
commit 4d1ca92a19
4 changed files with 307 additions and 8 deletions
+148
View File
@@ -38,6 +38,7 @@ import * as policyHelpers from '../availability/policyHelpers.js';
import { makeResolvedModelConfig } from '../services/modelConfigServiceTestUtils.js';
import type { HookSystem } from '../hooks/hookSystem.js';
import { LlmRole } from '../telemetry/types.js';
import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js';
// Mock fs module to prevent actual file system operations during tests
const mockFileSystem = new Map<string, string>();
@@ -2575,6 +2576,153 @@ describe('GeminiChat', () => {
});
});
describe('automated binary injection', () => {
  /**
   * Replaces `generateContentStream` with a stub that remembers the
   * `contents` of the request it receives and streams back a single
   * STOP-terminated text candidate.
   *
   * @param replyText Text placed in the one streamed candidate.
   * @returns A getter for the `contents` seen by the most recent stubbed call.
   */
  const stubStreamCapturingContents = (replyText: string) => {
    let seenContents: Content[] = [];
    vi.mocked(mockContentGenerator.generateContentStream).mockImplementation(
      async (req) => {
        seenContents = req.contents as Content[];
        return (async function* () {
          yield {
            candidates: [
              {
                content: { parts: [{ text: replyText }] },
                finishReason: 'STOP',
              },
            ],
          } as unknown as GenerateContentResponse;
        })();
      },
    );
    return (): Content[] => seenContents;
  };

  /**
   * Sends `message` through `chat.sendMessageStream` with the fixed test
   * arguments and consumes the resulting stream to completion.
   */
  const sendAndDrain = async (
    message: Parameters<typeof chat.sendMessageStream>[1],
  ): Promise<void> => {
    const stream = await chat.sendMessageStream(
      { model: 'gemini-pro' },
      message,
      'test-id',
      new AbortController().signal,
      LlmRole.MAIN,
    );
    for await (const _ of stream) {
      // No-op
    }
  };

  it('should expand history with synthetic turns when __binary_injection__ is detected', async () => {
    // A single tool response carrying one audio part under the injection key.
    const toolResponseWithAudio = [
      {
        functionResponse: {
          id: 'call-123',
          name: 'read_file',
          response: {
            output: 'Success',
            [BINARY_INJECTION_KEY]: [
              { inlineData: { mimeType: 'audio/mpeg', data: 'base64' } },
            ],
          },
        },
      },
    ];
    const getCapturedContents = stubStreamCapturingContents('Analysis done');

    await sendAndDrain(toolResponseWithAudio);

    // Expected request contents:
    //   [0] user  - tool response with the injection key removed
    //   [1] model - synthetic acknowledgment
    //   [2] user  - the binary data itself (current request)
    const captured = getCapturedContents();
    expect(captured).toHaveLength(3);
    const [cleanedToolTurn, syntheticAck, binaryTurn] = captured;

    expect(cleanedToolTurn.role).toBe('user');
    expect(cleanedToolTurn.parts![0].functionResponse!.response).toEqual({
      output: 'Success',
    });

    expect(syntheticAck.role).toBe('model');
    const ackPart = syntheticAck.parts![0];
    expect(ackPart.text).toContain('Binary content received');
    expect(ackPart.thoughtSignature).toBe(SYNTHETIC_THOUGHT_SIGNATURE);

    expect(binaryTurn.role).toBe('user');
    expect(binaryTurn.parts![0].inlineData!.mimeType).toBe('audio/mpeg');
  });

  it('should handle multiple parallel binary injections', async () => {
    // Two tool responses in one message, each with its own injected part.
    const twoToolResponses = [
      {
        functionResponse: {
          id: 'call-1',
          name: 'read_file',
          response: {
            output: 'Success 1',
            [BINARY_INJECTION_KEY]: [
              { inlineData: { mimeType: 'audio/mpeg', data: 'audio1' } },
            ],
          },
        },
      },
      {
        functionResponse: {
          id: 'call-2',
          name: 'read_file',
          response: {
            output: 'Success 2',
            [BINARY_INJECTION_KEY]: [
              { inlineData: { mimeType: 'video/mp4', data: 'video2' } },
            ],
          },
        },
      },
    ];
    const getCapturedContents = stubStreamCapturingContents('Done');

    await sendAndDrain(twoToolResponses);

    // Expected request contents:
    //   [0] both tool responses, cleaned
    //   [1] model acknowledgment
    //   [2] both binary parts combined, in call order
    const captured = getCapturedContents();
    expect(captured).toHaveLength(3);
    const [cleanedToolTurn, , binaryTurn] = captured;

    expect(cleanedToolTurn.parts).toHaveLength(2);
    expect(
      cleanedToolTurn.parts!.map((p) => p.functionResponse!.response),
    ).toEqual([{ output: 'Success 1' }, { output: 'Success 2' }]);

    expect(binaryTurn.parts).toHaveLength(2);
    expect(binaryTurn.parts!.map((p) => p.inlineData!.mimeType)).toEqual([
      'audio/mpeg',
      'video/mp4',
    ]);
  });
});
describe('recordCompletedToolCalls', () => {
it('should use originalRequestName and originalRequestArgs if present', () => {
const completedCall: CompletedToolCall = {
+52 -1
View File
@@ -50,6 +50,7 @@ import { handleFallback } from '../fallback/handler.js';
import { isFunctionResponse } from '../utils/messageInspectors.js';
import { scrubHistory } from '../utils/historyHardening.js';
import { partListUnionToString } from './geminiRequest.js';
import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js';
import type { ModelConfigKey } from '../services/modelConfigService.js';
import { estimateTokenCountSync } from '../utils/tokenCalculation.js';
import {
@@ -336,7 +337,7 @@ export class GeminiChat {
});
this.sendPromise = streamDonePromise;
const userContent = createUserContent(message);
let userContent = createUserContent(message);
const { model } =
this.context.config.modelConfigService.getResolvedConfig(modelConfigKey);
@@ -366,6 +367,30 @@ export class GeminiChat {
}
// Add user content to history ONCE before any attempts.
const binaryInjections = this.extractBinaryInjections(userContent.parts);
if (binaryInjections) {
// Turn 1: The original tool response (now cleaned)
this.agentHistory.push(userContent);
// Turn 2: Synthetic Model Acknowledgment
this.agentHistory.push({
role: 'model',
parts: [
{
text: 'Binary content received. Proceeding with analysis.',
thought: true,
thoughtSignature: SYNTHETIC_THOUGHT_SIGNATURE,
},
],
});
// Turn 3: The actual binary data (becomes the current request message)
userContent = {
role: 'user',
parts: binaryInjections,
};
}
this.agentHistory.push(userContent);
const requestContents = this.getHistory(true);
@@ -510,6 +535,32 @@ export class GeminiChat {
return streamWithRetries.call(this);
}
/**
 * Collects the payloads stored under `BINARY_INJECTION_KEY` in the function
 * responses of `parts`, removing that key from each response as it goes.
 *
 * Note: the response objects inside `parts` are mutated in place (the key is
 * deleted), so the caller's parts no longer carry the payload afterwards.
 *
 * @param parts Parts to scan; may be `undefined`.
 * @returns Every collected part in encounter order, or `undefined` when
 *   `parts` is missing or nothing was collected.
 */
private extractBinaryInjections(
  parts: Part[] | undefined,
): Part[] | undefined {
  const collected: Part[] = [];

  for (const { functionResponse } of parts ?? []) {
    const payloadHolder = functionResponse?.response;
    if (!payloadHolder || !(BINARY_INJECTION_KEY in payloadHolder)) {
      continue;
    }

    // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
    const injected = payloadHolder[BINARY_INJECTION_KEY] as Part[];
    // The key is removed even when its value turns out not to be an array.
    delete payloadHolder[BINARY_INJECTION_KEY];

    if (Array.isArray(injected)) {
      collected.push(...injected);
    }
  }

  return collected.length === 0 ? undefined : collected;
}
private async makeApiCallAndProcessStream(
modelConfigKey: ModelConfigKey,
requestContents: readonly Content[],
@@ -158,6 +158,57 @@ describe('generateContentResponseUtilities', () => {
]);
});
it('should filter out audio/video MIME types and add a minimal system note (generic tool)', () => {
  // One text part plus one audio inlineData part, returned by a tool whose
  // name is neither read_file nor read_many_files.
  const mixedContent: PartListUnion = [
    { text: 'Some text' },
    { inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' } },
  ];

  const convertedParts = convertToFunctionResponse(
    'other_tool',
    callId,
    mixedContent,
    PREVIEW_GEMINI_MODEL,
  );

  // Shallow-copy the response payload (empty object when it is absent) so it
  // can be read through plain string keys.
  const responsePart = convertedParts.find(
    (candidate) => candidate.functionResponse,
  );
  const responseCopy: Record<string, unknown> = {
    ...(responsePart?.functionResponse?.response ?? {}),
  };
  const outputText = responseCopy['output'] as string;

  // The output carries the stripped-content note and no injection marker.
  expect(outputText).toContain(
    '[SYSTEM: Binary content (audio/mpeg) stripped from response due to protocol limitations.]',
  );
  expect(outputText).not.toContain('__binary_injection__');
});
it('should use the __binary_injection__ flag for read_file and read_many_files tools', () => {
  const audioContent: PartListUnion = [
    { text: 'Reading audio' },
    { inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' } },
  ];
  const fileReadingTools = ['read_file', 'read_many_files'];

  // Both tool names must produce the same shape of response.
  for (const toolName of fileReadingTools) {
    const convertedParts = convertToFunctionResponse(
      toolName,
      callId,
      audioContent,
      PREVIEW_GEMINI_MODEL,
    );

    // Shallow-copy the response payload (empty object when it is absent) so
    // it can be read through plain string keys.
    const responsePart = convertedParts.find(
      (candidate) => candidate.functionResponse,
    );
    const responseCopy: Record<string, unknown> = {
      ...(responsePart?.functionResponse?.response ?? {}),
    };

    expect(responseCopy['output']).toContain('read successfully');

    // The audio part must be present under the injection key.
    const injectedParts = responseCopy['__binary_injection__'];
    expect(injectedParts).toBeDefined();
    expect((injectedParts as Part[])[0].inlineData?.mimeType).toBe(
      'audio/mpeg',
    );
  }
});
it('should handle llmContent with fileData for Gemini 3 model (should be siblings)', () => {
const llmContent: Part = {
fileData: { mimeType: 'application/pdf', fileUri: 'gs://...' },
@@ -15,6 +15,8 @@ import { supportsMultimodalFunctionResponse } from '../config/models.js';
import { debugLogger } from './debugLogger.js';
import type { Config } from '../config/config.js';
export const BINARY_INJECTION_KEY = '__binary_injection__';
/**
* Formats tool output for a Gemini FunctionResponse.
*/
@@ -89,6 +91,43 @@ export function convertToFunctionResponse(
// Ignore other part types
}
// build a list of unsupported MIME types for function responses
const filteredInlineDataParts: Part[] = [];
const unsupportedInlineDataParts: Part[] = [];
for (const part of inlineDataParts) {
const mimeType = part.inlineData?.mimeType;
if (
mimeType &&
(mimeType.startsWith('audio/') || mimeType.startsWith('video/'))
) {
unsupportedInlineDataParts.push(part);
} else {
filteredInlineDataParts.push(part);
}
}
if (unsupportedInlineDataParts.length > 0) {
const uniqueMimes = Array.from(
new Set(
unsupportedInlineDataParts.map((p) => p.inlineData?.mimeType ?? ''),
),
).join(', ');
const isReadFileTool =
toolName === 'read_file' || toolName === 'read_many_files';
if (isReadFileTool) {
textParts.unshift(
`Binary content (${uniqueMimes}) read successfully. Content will be injected for analysis in the next sequence.`,
);
} else {
textParts.unshift(
`[SYSTEM: Binary content (${uniqueMimes}) stripped from response due to protocol limitations.]`,
);
}
}
// Build the primary response part
const part: Part = {
functionResponse: {
@@ -98,30 +137,40 @@ export function convertToFunctionResponse(
},
};
const isReadFileTool =
toolName === 'read_file' || toolName === 'read_many_files';
if (unsupportedInlineDataParts.length > 0 && isReadFileTool) {
if (part.functionResponse) {
Object.assign(part.functionResponse.response!, {
[BINARY_INJECTION_KEY]: unsupportedInlineDataParts,
});
}
}
const isMultimodalFRSupported = supportsMultimodalFunctionResponse(
model,
config,
);
const siblingParts: Part[] = [...fileDataParts];
if (inlineDataParts.length > 0) {
if (filteredInlineDataParts.length > 0) {
if (isMultimodalFRSupported) {
// Nest inlineData if supported by the model
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
(part.functionResponse as unknown as { parts: Part[] }).parts =
inlineDataParts;
Object.assign(part.functionResponse!, { parts: filteredInlineDataParts });
} else {
// Otherwise treat as siblings
siblingParts.push(...inlineDataParts);
siblingParts.push(...filteredInlineDataParts);
}
}
// Add descriptive text if the response object is empty but we have binary content
if (
textParts.length === 0 &&
(inlineDataParts.length > 0 || fileDataParts.length > 0)
(filteredInlineDataParts.length > 0 || fileDataParts.length > 0)
) {
const totalBinaryItems = inlineDataParts.length + fileDataParts.length;
const totalBinaryItems =
filteredInlineDataParts.length + fileDataParts.length;
part.functionResponse!.response = {
output: `Binary content provided (${totalBinaryItems} item(s)).`,
};