From 05930d5e25e664418ad32ae3e49c34b1e884be54 Mon Sep 17 00:00:00 2001 From: Abhi <43648792+abhipatel12@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:16:24 -0400 Subject: [PATCH] fix(web-fetch): respect Content-Type header in fallback mechanism (#11284) --- packages/core/src/tools/web-fetch.test.ts | 115 ++++++++++++++++++++++ packages/core/src/tools/web-fetch.ts | 31 ++++-- 2 files changed, 138 insertions(+), 8 deletions(-) diff --git a/packages/core/src/tools/web-fetch.test.ts b/packages/core/src/tools/web-fetch.test.ts index 8dc4d3ae52..2ae9d62425 100644 --- a/packages/core/src/tools/web-fetch.test.ts +++ b/packages/core/src/tools/web-fetch.test.ts @@ -22,12 +22,17 @@ import { logWebFetchFallbackAttempt, WebFetchFallbackAttemptEvent, } from '../telemetry/index.js'; +import { convert } from 'html-to-text'; const mockGenerateContent = vi.fn(); const mockGetGeminiClient = vi.fn(() => ({ generateContent: mockGenerateContent, })); +vi.mock('html-to-text', () => ({ + convert: vi.fn((text) => `Converted: ${text}`), +})); + vi.mock('../telemetry/index.js', () => ({ logWebFetchFallbackAttempt: vi.fn(), WebFetchFallbackAttemptEvent: vi.fn(), @@ -246,6 +251,116 @@ describe('WebFetchTool', () => { }); }); + describe('execute (fallback)', () => { + beforeEach(() => { + // Force fallback by mocking primary fetch to fail + vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(false); + mockGenerateContent.mockResolvedValueOnce({ + candidates: [], + }); + }); + + it('should convert HTML content using html-to-text', async () => { + const htmlContent = '
No header
'; + vi.spyOn(fetchUtils, 'fetchWithTimeout').mockResolvedValue({ + ok: true, + headers: new Headers(), + text: () => Promise.resolve(content), + } as Response); + + // Mock fallback LLM call to return the content passed to it + mockGenerateContent.mockImplementationOnce(async (req) => ({ + candidates: [{ content: { parts: [{ text: req[0].parts[0].text }] } }], + })); + + const tool = new WebFetchTool(mockConfig); + const params = { prompt: 'fetch https://example.com' }; + const invocation = tool.build(params); + const result = await invocation.execute(new AbortController().signal); + + expect(convert).toHaveBeenCalledWith(content, { + wordwrap: false, + selectors: [ + { selector: 'a', options: { ignoreHref: true } }, + { selector: 'img', format: 'skip' }, + ], + }); + expect(result.llmContent).toContain(`Converted: ${content}`); + }); + }); + describe('shouldConfirmExecute', () => { it('should return confirmation details with the correct prompt and parsed urls', async () => { const tool = new WebFetchTool(mockConfig); diff --git a/packages/core/src/tools/web-fetch.ts b/packages/core/src/tools/web-fetch.ts index c7077ed440..3d631af9b1 100644 --- a/packages/core/src/tools/web-fetch.ts +++ b/packages/core/src/tools/web-fetch.ts @@ -133,14 +133,29 @@ class WebFetchToolInvocation extends BaseToolInvocation< `Request failed with status code ${response.status} ${response.statusText}`, ); } - const html = await response.text(); - const textContent = convert(html, { - wordwrap: false, - selectors: [ - { selector: 'a', options: { ignoreHref: true } }, - { selector: 'img', format: 'skip' }, - ], - }).substring(0, MAX_CONTENT_LENGTH); + + const rawContent = await response.text(); + const contentType = response.headers.get('content-type') || ''; + let textContent: string; + + // Only use html-to-text if content type is HTML, or if no content type is provided (assume HTML) + if ( + contentType.toLowerCase().includes('text/html') || + contentType === '' + ) { + textContent = convert(rawContent, { + wordwrap: false, + selectors: [ + { selector: 'a', options: { ignoreHref: true } }, + { selector: 'img', format: 'skip' }, + ], + }); + } else { + // For other content types (text/plain, application/json, etc.), use raw text + textContent = rawContent; + } + + textContent = textContent.substring(0, MAX_CONTENT_LENGTH); const geminiClient = this.config.getGeminiClient(); const fallbackPrompt = `The user requested the following: "${this.params.prompt}".