fix(core): filter unsupported multimodal types from tool responses (#26352)

2026-05-14 05:42:54 -07:00 · 2026-05-04 16:31:20 -04:00
parent b6fc583b0c
commit 4d1ca92a19
4 changed files with 307 additions and 8 deletions
@@ -38,6 +38,7 @@ import * as policyHelpers from '../availability/policyHelpers.js';
 import { makeResolvedModelConfig } from '../services/modelConfigServiceTestUtils.js';
 import type { HookSystem } from '../hooks/hookSystem.js';
 import { LlmRole } from '../telemetry/types.js';
+import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js';

 // Mock fs module to prevent actual file system operations during tests
 const mockFileSystem = new Map<string, string>();
@@ -2575,6 +2576,153 @@ describe('GeminiChat', () => {
    });
  });

+  describe('automated binary injection', () => {
+    it('should expand history with synthetic turns when __binary_injection__ is detected', async () => {
+      const audioParts = [
+        {
+          functionResponse: {
+            id: 'call-123',
+            name: 'read_file',
+            response: {
+              output: 'Success',
+              [BINARY_INJECTION_KEY]: [
+                { inlineData: { mimeType: 'audio/mpeg', data: 'base64' } },
+              ],
+            },
+          },
+        },
+      ];
+
+      // Stub the generator to capture the contents sent with the request
+      let capturedContents: Content[] = [];
+      vi.mocked(mockContentGenerator.generateContentStream).mockImplementation(
+        async (req) => {
+          capturedContents = req.contents as Content[];
+          return (async function* () {
+            yield {
+              candidates: [
+                {
+                  content: { parts: [{ text: 'Analysis done' }] },
+                  finishReason: 'STOP',
+                },
+              ],
+            } as unknown as GenerateContentResponse;
+          })();
+        },
+      );
+
+      const stream = await chat.sendMessageStream(
+        { model: 'gemini-pro' },
+        audioParts,
+        'test-id',
+        new AbortController().signal,
+        LlmRole.MAIN,
+      );
+
+      for await (const _ of stream) {
+        // Drain the stream; the yielded events are not inspected here.
+      }
+
+      // The single tool response must reach the API as three turns:
+      // Turn 1 (user): the function response with the injection key removed
+      // Turn 2 (model): synthetic acknowledgment with the synthetic signature
+      // Turn 3 (user): the extracted inlineData part
+      expect(capturedContents).toHaveLength(3);
+      expect(capturedContents[0].role).toBe('user');
+      expect(capturedContents[0].parts![0].functionResponse!.response).toEqual({
+        output: 'Success',
+      });
+      expect(capturedContents[1].role).toBe('model');
+      expect(capturedContents[1].parts![0].text).toContain(
+        'Binary content received',
+      );
+      expect(capturedContents[1].parts![0].thoughtSignature).toBe(
+        SYNTHETIC_THOUGHT_SIGNATURE,
+      );
+      expect(capturedContents[2].role).toBe('user');
+      expect(capturedContents[2].parts![0].inlineData!.mimeType).toBe(
+        'audio/mpeg',
+      );
+    });
+
+    it('should handle multiple parallel binary injections', async () => {
+      const parallelParts = [
+        {
+          functionResponse: {
+            id: 'call-1',
+            name: 'read_file',
+            response: {
+              output: 'Success 1',
+              [BINARY_INJECTION_KEY]: [
+                { inlineData: { mimeType: 'audio/mpeg', data: 'audio1' } },
+              ],
+            },
+          },
+        },
+        {
+          functionResponse: {
+            id: 'call-2',
+            name: 'read_file',
+            response: {
+              output: 'Success 2',
+              [BINARY_INJECTION_KEY]: [
+                { inlineData: { mimeType: 'video/mp4', data: 'video2' } },
+              ],
+            },
+          },
+        },
+      ];
+
+      let capturedContents: Content[] = [];
+      vi.mocked(mockContentGenerator.generateContentStream).mockImplementation(
+        async (req) => {
+          capturedContents = req.contents as Content[];
+          return (async function* () {
+            yield {
+              candidates: [
+                {
+                  content: { parts: [{ text: 'Done' }] },
+                  finishReason: 'STOP',
+                },
+              ],
+            } as unknown as GenerateContentResponse;
+          })();
+        },
+      );
+
+      const stream = await chat.sendMessageStream(
+        { model: 'gemini-pro' },
+        parallelParts,
+        'test-id',
+        new AbortController().signal,
+        LlmRole.MAIN,
+      );
+
+      for await (const _ of stream) {
+        // Drain the stream; the yielded events are not inspected here.
+      }
+
+      // Turn 1 (user): both function responses, each without the injection key
+      // Turn 2 (model): acknowledgment turn (only its presence is implied here)
+      // Turn 3 (user): both inlineData parts merged, in call order
+      expect(capturedContents).toHaveLength(3);
+      expect(capturedContents[0].parts).toHaveLength(2);
+      expect(capturedContents[0].parts![0].functionResponse!.response).toEqual({
+        output: 'Success 1',
+      });
+      expect(capturedContents[0].parts![1].functionResponse!.response).toEqual({
+        output: 'Success 2',
+      });
+      expect(capturedContents[2].parts).toHaveLength(2);
+      expect(capturedContents[2].parts![0].inlineData!.mimeType).toBe(
+        'audio/mpeg',
+      );
+      expect(capturedContents[2].parts![1].inlineData!.mimeType).toBe(
+        'video/mp4',
+      );
+    });
+  });
+
  describe('recordCompletedToolCalls', () => {
    it('should use originalRequestName and originalRequestArgs if present', () => {
      const completedCall: CompletedToolCall = {
@@ -50,6 +50,7 @@ import { handleFallback } from '../fallback/handler.js';
 import { isFunctionResponse } from '../utils/messageInspectors.js';
 import { scrubHistory } from '../utils/historyHardening.js';
 import { partListUnionToString } from './geminiRequest.js';
+import { BINARY_INJECTION_KEY } from '../utils/generateContentResponseUtilities.js';
 import type { ModelConfigKey } from '../services/modelConfigService.js';
 import { estimateTokenCountSync } from '../utils/tokenCalculation.js';
 import {
@@ -336,7 +337,7 @@ export class GeminiChat {
    });
    this.sendPromise = streamDonePromise;

-    const userContent = createUserContent(message);
+    let userContent = createUserContent(message);
    const { model } =
      this.context.config.modelConfigService.getResolvedConfig(modelConfigKey);

@@ -366,6 +367,30 @@ export class GeminiChat {
    }

    // Add user content to history ONCE before any attempts.
+    const binaryInjections = this.extractBinaryInjections(userContent.parts);
+    if (binaryInjections) {
+      // Turn 1: The original tool response (now cleaned)
+      this.agentHistory.push(userContent);
+
+      // Turn 2: Synthetic Model Acknowledgment
+      this.agentHistory.push({
+        role: 'model',
+        parts: [
+          {
+            text: 'Binary content received. Proceeding with analysis.',
+            thought: true,
+            thoughtSignature: SYNTHETIC_THOUGHT_SIGNATURE,
+          },
+        ],
+      });
+
+      // Turn 3: The actual binary data (becomes the current request message)
+      userContent = {
+        role: 'user',
+        parts: binaryInjections,
+      };
+    }
+
    this.agentHistory.push(userContent);
    const requestContents = this.getHistory(true);

@@ -510,6 +535,32 @@ export class GeminiChat {
    return streamWithRetries.call(this);
  }

+  /**
+   * Detaches every BINARY_INJECTION_KEY payload from the function responses
+   * in `parts` (mutating them in place). Returns the collected parts in
+   * order, or `undefined` when `parts` is missing or nothing was collected.
+   */
+  private extractBinaryInjections(
+    parts: Part[] | undefined,
+  ): Part[] | undefined {
+    const collected: Part[] = [];
+
+    for (const { functionResponse } of parts ?? []) {
+      const payload = functionResponse?.response;
+      if (!payload || !(BINARY_INJECTION_KEY in payload)) {
+        continue;
+      }
+      const injected = payload[BINARY_INJECTION_KEY];
+      delete payload[BINARY_INJECTION_KEY]; // strip even if malformed
+      if (Array.isArray(injected)) {
+        // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
+        collected.push(...(injected as Part[]));
+      }
+    }
+
+    return collected.length === 0 ? undefined : collected;
+  }
+
  private async makeApiCallAndProcessStream(
    modelConfigKey: ModelConfigKey,
    requestContents: readonly Content[],
@@ -158,6 +158,57 @@ describe('generateContentResponseUtilities', () => {
      ]);
    });

+    it('should filter out audio/video MIME types and add a minimal system note (generic tool)', () => {
+      const llmContent: PartListUnion = [
+        { text: 'Some text' },
+        { inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' } },
+      ];
+
+      const result = convertToFunctionResponse(
+        'other_tool',
+        callId,
+        llmContent,
+        PREVIEW_GEMINI_MODEL,
+      );
+
+      const frPart = result.find((p) => p.functionResponse);
+      // Shallow copy; stays {} when no function response part was produced.
+      const response = { ...frPart?.functionResponse?.response };
+      const output = response['output'] as string;
+      expect(output).toContain(
+        '[SYSTEM: Binary content (audio/mpeg) stripped from response due to protocol limitations.]',
+      );
+      expect(output).not.toContain('__binary_injection__');
+      // Generic tools must not carry the injection payload on the response.
+      expect(response['__binary_injection__']).toBeUndefined();
+    });
+
+    it('should use the __binary_injection__ flag for read_file and read_many_files tools', () => {
+      const llmContent: PartListUnion = [
+        { text: 'Reading audio' },
+        { inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' } },
+      ];
+
+      // Both file-reading tool names should yield the same response shape.
+      ['read_file', 'read_many_files'].forEach((toolName) => {
+        const parts = convertToFunctionResponse(
+          toolName,
+          callId,
+          llmContent,
+          PREVIEW_GEMINI_MODEL,
+        );
+
+        const holder = parts.find((p) => p.functionResponse);
+        // Shallow copy; stays {} when no function response part exists.
+        const payload = { ...holder?.functionResponse?.response };
+        expect(payload['output']).toContain('read successfully');
+        expect(payload['__binary_injection__']).toBeDefined();
+        // The audio part travels under the injection key instead of inline.
+        const deferred = payload['__binary_injection__'] as Part[];
+        expect(deferred[0].inlineData?.mimeType).toBe('audio/mpeg');
+      });
+    });
+
    it('should handle llmContent with fileData for Gemini 3 model (should be siblings)', () => {
      const llmContent: Part = {
        fileData: { mimeType: 'application/pdf', fileUri: 'gs://...' },
@@ -15,6 +15,8 @@ import { supportsMultimodalFunctionResponse } from '../config/models.js';
 import { debugLogger } from './debugLogger.js';
 import type { Config } from '../config/config.js';

+export const BINARY_INJECTION_KEY = '__binary_injection__'; // response key under which read_file/read_many_files audio/video parts are attached
+
 /**
 * Formats tool output for a Gemini FunctionResponse.
 */
@@ -89,6 +91,43 @@ export function convertToFunctionResponse(
    // Ignore other part types
  }

+  // build a list of unsupported MIME types for function responses
+  const filteredInlineDataParts: Part[] = [];
+  const unsupportedInlineDataParts: Part[] = [];
+
+  for (const part of inlineDataParts) {
+    const mimeType = part.inlineData?.mimeType;
+    if (
+      mimeType &&
+      (mimeType.startsWith('audio/') || mimeType.startsWith('video/'))
+    ) {
+      unsupportedInlineDataParts.push(part);
+    } else {
+      filteredInlineDataParts.push(part);
+    }
+  }
+
+  if (unsupportedInlineDataParts.length > 0) {
+    const uniqueMimes = Array.from(
+      new Set(
+        unsupportedInlineDataParts.map((p) => p.inlineData?.mimeType ?? ''),
+      ),
+    ).join(', ');
+
+    const isReadFileTool =
+      toolName === 'read_file' || toolName === 'read_many_files';
+
+    if (isReadFileTool) {
+      textParts.unshift(
+        `Binary content (${uniqueMimes}) read successfully. Content will be injected for analysis in the next sequence.`,
+      );
+    } else {
+      textParts.unshift(
+        `[SYSTEM: Binary content (${uniqueMimes}) stripped from response due to protocol limitations.]`,
+      );
+    }
+  }
+
  // Build the primary response part
  const part: Part = {
    functionResponse: {
@@ -98,30 +137,40 @@ export function convertToFunctionResponse(
    },
  };

+  const isReadFileTool =
+    toolName === 'read_file' || toolName === 'read_many_files';
+
+  if (unsupportedInlineDataParts.length > 0 && isReadFileTool) {
+    if (part.functionResponse) {
+      Object.assign(part.functionResponse.response!, {
+        [BINARY_INJECTION_KEY]: unsupportedInlineDataParts,
+      });
+    }
+  }
+
  const isMultimodalFRSupported = supportsMultimodalFunctionResponse(
    model,
    config,
  );
  const siblingParts: Part[] = [...fileDataParts];

-  if (inlineDataParts.length > 0) {
+  if (filteredInlineDataParts.length > 0) {
    if (isMultimodalFRSupported) {
      // Nest inlineData if supported by the model
-      // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
-      (part.functionResponse as unknown as { parts: Part[] }).parts =
-        inlineDataParts;
+      Object.assign(part.functionResponse!, { parts: filteredInlineDataParts });
    } else {
      // Otherwise treat as siblings
-      siblingParts.push(...inlineDataParts);
+      siblingParts.push(...filteredInlineDataParts);
    }
  }

  // Add descriptive text if the response object is empty but we have binary content
  if (
    textParts.length === 0 &&
-    (inlineDataParts.length > 0 || fileDataParts.length > 0)
+    (filteredInlineDataParts.length > 0 || fileDataParts.length > 0)
  ) {
-    const totalBinaryItems = inlineDataParts.length + fileDataParts.length;
+    const totalBinaryItems =
+      filteredInlineDataParts.length + fileDataParts.length;
    part.functionResponse!.response = {
      output: `Binary content provided (${totalBinaryItems} item(s)).`,
    };