mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-14 05:42:54 -07:00
fix(core): filter unsupported multimodal types from tool responses (#26352)
This commit is contained in:
@@ -38,6 +38,7 @@ import * as policyHelpers from '../availability/policyHelpers.js';
|
||||
import { makeResolvedModelConfig } from '../services/modelConfigServiceTestUtils.js';
|
||||
import type { HookSystem } from '../hooks/hookSystem.js';
|
||||
import { LlmRole } from '../telemetry/types.js';
|
||||
import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js';
|
||||
|
||||
// Mock fs module to prevent actual file system operations during tests
|
||||
const mockFileSystem = new Map<string, string>();
|
||||
@@ -2575,6 +2576,153 @@ describe('GeminiChat', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('automated binary injection', () => {
  /**
   * Builds a `read_file` function-response part whose payload carries one
   * inline binary blob under BINARY_INJECTION_KEY.
   */
  const toolResponseWithBinary = (
    id: string,
    output: string,
    mimeType: string,
    data: string,
  ) => ({
    functionResponse: {
      id,
      name: 'read_file',
      response: {
        output,
        [BINARY_INJECTION_KEY]: [{ inlineData: { mimeType, data } }],
      },
    },
  });

  /**
   * Replaces the content generator's streaming call with a stub that
   * remembers the `contents` it was given and replies with a single text
   * chunk. Returns an accessor for the remembered contents.
   */
  const captureRequestContents = (replyText: string): (() => Content[]) => {
    let seenContents: Content[] = [];
    vi.mocked(mockContentGenerator.generateContentStream).mockImplementation(
      async (req) => {
        seenContents = req.contents as Content[];
        return (async function* () {
          yield {
            candidates: [
              {
                content: { parts: [{ text: replyText }] },
                finishReason: 'STOP',
              },
            ],
          } as unknown as GenerateContentResponse;
        })();
      },
    );
    return () => seenContents;
  };

  /** Consumes a stream to completion, discarding every event. */
  const drain = async (stream: AsyncIterable<unknown>): Promise<void> => {
    for await (const _event of stream) {
      // No-op
    }
  };

  it('should expand history with synthetic turns when __binary_injection__ is detected', async () => {
    const audioParts = [
      toolResponseWithBinary('call-123', 'Success', 'audio/mpeg', 'base64'),
    ];
    const readCaptured = captureRequestContents('Analysis done');

    const stream = await chat.sendMessageStream(
      { model: 'gemini-pro' },
      audioParts,
      'test-id',
      new AbortController().signal,
      LlmRole.MAIN,
    );
    await drain(stream);

    // Expected request layout:
    //   [0] user  - tool response with the injection key removed
    //   [1] model - synthetic acknowledgment
    //   [2] user  - the binary data itself (the current request)
    const sentContents = readCaptured();
    expect(sentContents).toHaveLength(3);
    const [toolTurn, ackTurn, binaryTurn] = sentContents;

    expect(toolTurn.role).toBe('user');
    expect(toolTurn.parts![0].functionResponse!.response).toEqual({
      output: 'Success',
    });

    expect(ackTurn.role).toBe('model');
    expect(ackTurn.parts![0].text).toContain('Binary content received');
    expect(ackTurn.parts![0].thoughtSignature).toBe(
      SYNTHETIC_THOUGHT_SIGNATURE,
    );

    expect(binaryTurn.role).toBe('user');
    expect(binaryTurn.parts![0].inlineData!.mimeType).toBe('audio/mpeg');
  });

  it('should handle multiple parallel binary injections', async () => {
    const parallelParts = [
      toolResponseWithBinary('call-1', 'Success 1', 'audio/mpeg', 'audio1'),
      toolResponseWithBinary('call-2', 'Success 2', 'video/mp4', 'video2'),
    ];
    const readCaptured = captureRequestContents('Done');

    const stream = await chat.sendMessageStream(
      { model: 'gemini-pro' },
      parallelParts,
      'test-id',
      new AbortController().signal,
      LlmRole.MAIN,
    );
    await drain(stream);

    // Expected request layout:
    //   [0] both tool responses, cleaned
    //   [1] model acknowledgment
    //   [2] both binary parts combined into one user turn
    const sentContents = readCaptured();
    expect(sentContents).toHaveLength(3);
    const [toolTurn, , binaryTurn] = sentContents;

    expect(toolTurn.parts).toHaveLength(2);
    expect(toolTurn.parts![0].functionResponse!.response).toEqual({
      output: 'Success 1',
    });
    expect(toolTurn.parts![1].functionResponse!.response).toEqual({
      output: 'Success 2',
    });

    expect(binaryTurn.parts).toHaveLength(2);
    expect(binaryTurn.parts![0].inlineData!.mimeType).toBe('audio/mpeg');
    expect(binaryTurn.parts![1].inlineData!.mimeType).toBe('video/mp4');
  });
});
|
||||
|
||||
describe('recordCompletedToolCalls', () => {
|
||||
it('should use originalRequestName and originalRequestArgs if present', () => {
|
||||
const completedCall: CompletedToolCall = {
|
||||
|
||||
@@ -50,6 +50,7 @@ import { handleFallback } from '../fallback/handler.js';
|
||||
import { isFunctionResponse } from '../utils/messageInspectors.js';
|
||||
import { scrubHistory } from '../utils/historyHardening.js';
|
||||
import { partListUnionToString } from './geminiRequest.js';
|
||||
import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js';
|
||||
import type { ModelConfigKey } from '../services/modelConfigService.js';
|
||||
import { estimateTokenCountSync } from '../utils/tokenCalculation.js';
|
||||
import {
|
||||
@@ -336,7 +337,7 @@ export class GeminiChat {
|
||||
});
|
||||
this.sendPromise = streamDonePromise;
|
||||
|
||||
const userContent = createUserContent(message);
|
||||
let userContent = createUserContent(message);
|
||||
const { model } =
|
||||
this.context.config.modelConfigService.getResolvedConfig(modelConfigKey);
|
||||
|
||||
@@ -366,6 +367,30 @@ export class GeminiChat {
|
||||
}
|
||||
|
||||
// Add user content to history ONCE before any attempts.
|
||||
const binaryInjections = this.extractBinaryInjections(userContent.parts);
|
||||
if (binaryInjections) {
|
||||
// Turn 1: The original tool response (now cleaned)
|
||||
this.agentHistory.push(userContent);
|
||||
|
||||
// Turn 2: Synthetic Model Acknowledgment
|
||||
this.agentHistory.push({
|
||||
role: 'model',
|
||||
parts: [
|
||||
{
|
||||
text: 'Binary content received. Proceeding with analysis.',
|
||||
thought: true,
|
||||
thoughtSignature: SYNTHETIC_THOUGHT_SIGNATURE,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
// Turn 3: The actual binary data (becomes the current request message)
|
||||
userContent = {
|
||||
role: 'user',
|
||||
parts: binaryInjections,
|
||||
};
|
||||
}
|
||||
|
||||
this.agentHistory.push(userContent);
|
||||
const requestContents = this.getHistory(true);
|
||||
|
||||
@@ -510,6 +535,32 @@ export class GeminiChat {
|
||||
return streamWithRetries.call(this);
|
||||
}
|
||||
|
||||
/**
 * Gathers the binary parts that were attached to function responses under
 * BINARY_INJECTION_KEY and strips that key from each response.
 *
 * The function-response objects reachable from `parts` are modified in
 * place: after this call none of them carries BINARY_INJECTION_KEY.
 *
 * @param parts Parts to scan; may be undefined.
 * @returns The attached parts in the order they were encountered, or
 *   `undefined` when `parts` is missing or nothing was attached.
 */
private extractBinaryInjections(
  parts: Part[] | undefined,
): Part[] | undefined {
  if (!parts) {
    return undefined;
  }

  const collected: Part[] = [];

  for (const candidate of parts) {
    const payload = candidate.functionResponse?.response;
    if (!payload || !(BINARY_INJECTION_KEY in payload)) {
      continue;
    }

    // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
    const attached = payload[BINARY_INJECTION_KEY] as Part[];
    // The key is removed whether or not its value turns out to be an array.
    delete payload[BINARY_INJECTION_KEY];

    if (Array.isArray(attached)) {
      collected.push(...attached);
    }
  }

  return collected.length === 0 ? undefined : collected;
}
|
||||
|
||||
private async makeApiCallAndProcessStream(
|
||||
modelConfigKey: ModelConfigKey,
|
||||
requestContents: readonly Content[],
|
||||
|
||||
@@ -158,6 +158,57 @@ describe('generateContentResponseUtilities', () => {
|
||||
]);
|
||||
});
|
||||
|
||||
it('should filter out audio/video MIME types and add a minimal system note (generic tool)', () => {
  // Output of a tool other than read_file / read_many_files: text plus audio.
  const toolOutput: PartListUnion = [
    { text: 'Some text' },
    { inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' } },
  ];

  const responseParts = convertToFunctionResponse(
    'other_tool',
    callId,
    toolOutput,
    PREVIEW_GEMINI_MODEL,
  );

  const functionResponsePart = responseParts.find(
    (candidate) => candidate.functionResponse,
  );
  // Shallow copy of the response payload; stays empty when there is none.
  const payload: Record<string, unknown> = {
    ...(functionResponsePart?.functionResponse?.response ?? {}),
  };
  const output = payload['output'] as string;

  expect(output).toContain(
    '[SYSTEM: Binary content (audio/mpeg) stripped from response due to protocol limitations.]',
  );
  expect(output).not.toContain('__binary_injection__');
});
|
||||
|
||||
it('should use the __binary_injection__ flag for read_file and read_many_files tools', () => {
  const toolOutput: PartListUnion = [
    { text: 'Reading audio' },
    { inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' } },
  ];

  // Both file-reading tools are expected to behave the same way.
  ['read_file', 'read_many_files'].forEach((toolName) => {
    const responseParts = convertToFunctionResponse(
      toolName,
      callId,
      toolOutput,
      PREVIEW_GEMINI_MODEL,
    );

    const functionResponsePart = responseParts.find(
      (candidate) => candidate.functionResponse,
    );
    // Shallow copy of the response payload; stays empty when there is none.
    const payload: Record<string, unknown> = {
      ...(functionResponsePart?.functionResponse?.response ?? {}),
    };

    expect(payload['output']).toContain('read successfully');
    expect(payload['__binary_injection__']).toBeDefined();

    const injectedParts = payload['__binary_injection__'] as Part[];
    expect(injectedParts[0].inlineData?.mimeType).toBe('audio/mpeg');
  });
});
|
||||
|
||||
it('should handle llmContent with fileData for Gemini 3 model (should be siblings)', () => {
|
||||
const llmContent: Part = {
|
||||
fileData: { mimeType: 'application/pdf', fileUri: 'gs://...' },
|
||||
|
||||
@@ -15,6 +15,8 @@ import { supportsMultimodalFunctionResponse } from '../config/models.js';
|
||||
import { debugLogger } from './debugLogger.js';
|
||||
import type { Config } from '../config/config.js';
|
||||
|
||||
/**
 * Key on a function response's `response` object under which
 * `convertToFunctionResponse` attaches the audio/video inlineData parts it
 * filtered out, for the `read_file` and `read_many_files` tools only.
 * `GeminiChat` removes this key from the response and collects the parts
 * it held.
 */
export const BINARY_INJECTION_KEY = '__binary_injection__';
|
||||
|
||||
/**
|
||||
* Formats tool output for a Gemini FunctionResponse.
|
||||
*/
|
||||
@@ -89,6 +91,43 @@ export function convertToFunctionResponse(
|
||||
// Ignore other part types
|
||||
}
|
||||
|
||||
// build a list of unsupported MIME types for function responses
|
||||
const filteredInlineDataParts: Part[] = [];
|
||||
const unsupportedInlineDataParts: Part[] = [];
|
||||
|
||||
for (const part of inlineDataParts) {
|
||||
const mimeType = part.inlineData?.mimeType;
|
||||
if (
|
||||
mimeType &&
|
||||
(mimeType.startsWith('audio/') || mimeType.startsWith('video/'))
|
||||
) {
|
||||
unsupportedInlineDataParts.push(part);
|
||||
} else {
|
||||
filteredInlineDataParts.push(part);
|
||||
}
|
||||
}
|
||||
|
||||
if (unsupportedInlineDataParts.length > 0) {
|
||||
const uniqueMimes = Array.from(
|
||||
new Set(
|
||||
unsupportedInlineDataParts.map((p) => p.inlineData?.mimeType ?? ''),
|
||||
),
|
||||
).join(', ');
|
||||
|
||||
const isReadFileTool =
|
||||
toolName === 'read_file' || toolName === 'read_many_files';
|
||||
|
||||
if (isReadFileTool) {
|
||||
textParts.unshift(
|
||||
`Binary content (${uniqueMimes}) read successfully. Content will be injected for analysis in the next sequence.`,
|
||||
);
|
||||
} else {
|
||||
textParts.unshift(
|
||||
`[SYSTEM: Binary content (${uniqueMimes}) stripped from response due to protocol limitations.]`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Build the primary response part
|
||||
const part: Part = {
|
||||
functionResponse: {
|
||||
@@ -98,30 +137,40 @@ export function convertToFunctionResponse(
|
||||
},
|
||||
};
|
||||
|
||||
const isReadFileTool =
|
||||
toolName === 'read_file' || toolName === 'read_many_files';
|
||||
|
||||
if (unsupportedInlineDataParts.length > 0 && isReadFileTool) {
|
||||
if (part.functionResponse) {
|
||||
Object.assign(part.functionResponse.response!, {
|
||||
[BINARY_INJECTION_KEY]: unsupportedInlineDataParts,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
const isMultimodalFRSupported = supportsMultimodalFunctionResponse(
|
||||
model,
|
||||
config,
|
||||
);
|
||||
const siblingParts: Part[] = [...fileDataParts];
|
||||
|
||||
if (inlineDataParts.length > 0) {
|
||||
if (filteredInlineDataParts.length > 0) {
|
||||
if (isMultimodalFRSupported) {
|
||||
// Nest inlineData if supported by the model
|
||||
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
|
||||
(part.functionResponse as unknown as { parts: Part[] }).parts =
|
||||
inlineDataParts;
|
||||
Object.assign(part.functionResponse!, { parts: filteredInlineDataParts });
|
||||
} else {
|
||||
// Otherwise treat as siblings
|
||||
siblingParts.push(...inlineDataParts);
|
||||
siblingParts.push(...filteredInlineDataParts);
|
||||
}
|
||||
}
|
||||
|
||||
// Add descriptive text if the response object is empty but we have binary content
|
||||
if (
|
||||
textParts.length === 0 &&
|
||||
(inlineDataParts.length > 0 || fileDataParts.length > 0)
|
||||
(filteredInlineDataParts.length > 0 || fileDataParts.length > 0)
|
||||
) {
|
||||
const totalBinaryItems = inlineDataParts.length + fileDataParts.length;
|
||||
const totalBinaryItems =
|
||||
filteredInlineDataParts.length + fileDataParts.length;
|
||||
part.functionResponse!.response = {
|
||||
output: `Binary content provided (${totalBinaryItems} item(s)).`,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user