From 70336e73b1f8a836bba5320296de46ff49a2aa7e Mon Sep 17 00:00:00 2001 From: Michael Bleigh Date: Mon, 23 Feb 2026 11:50:14 -0800 Subject: [PATCH] feat(core): implement experimental direct web fetch (#19557) --- docs/cli/settings.md | 1 + docs/reference/configuration.md | 5 + packages/cli/src/config/config.test.ts | 34 ++ packages/cli/src/config/config.ts | 3 +- packages/cli/src/config/settingsSchema.ts | 10 + .../oauth-credential-storage.test.ts | 5 + packages/core/src/config/config.ts | 7 + .../core/recordingContentGenerator.test.ts | 5 + .../src/tools/confirmation-policy.test.ts | 1 + packages/core/src/tools/memoryTool.test.ts | 5 + packages/core/src/tools/web-fetch.test.ts | 405 ++++++++++++++++-- packages/core/src/tools/web-fetch.ts | 305 +++++++++++-- packages/core/src/utils/fetch.ts | 16 +- schemas/settings.schema.json | 7 + 14 files changed, 744 insertions(+), 65 deletions(-) diff --git a/docs/cli/settings.md b/docs/cli/settings.md index 5011f55b2c..111728ea59 100644 --- a/docs/cli/settings.md +++ b/docs/cli/settings.md @@ -135,6 +135,7 @@ they appear in the UI. | Use OSC 52 Copy | `experimental.useOSC52Copy` | Use OSC 52 for copying. This may be more robust than the default system when using remote terminal sessions (if your terminal is configured to allow it). | `false` | | Plan | `experimental.plan` | Enable planning features (Plan Mode and tools). | `false` | | Model Steering | `experimental.modelSteering` | Enable model steering (user hints) to guide the model during tool execution. | `false` | +| Direct Web Fetch | `experimental.directWebFetch` | Enable web fetch behavior that bypasses LLM summarization. | `false` | ### Skills diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index b9874e017b..b069b03fc2 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -969,6 +969,11 @@ their corresponding top-level category object in your `settings.json` file. during tool execution. 
- **Default:** `false` +- **`experimental.directWebFetch`** (boolean): + - **Description:** Enable web fetch behavior that bypasses LLM summarization. + - **Default:** `false` + - **Requires restart:** Yes + #### `skills` - **`skills.enabled`** (boolean): diff --git a/packages/cli/src/config/config.test.ts b/packages/cli/src/config/config.test.ts index 809b31cd82..75812e4442 100644 --- a/packages/cli/src/config/config.test.ts +++ b/packages/cli/src/config/config.test.ts @@ -2016,6 +2016,40 @@ describe('loadCliConfig useRipgrep', () => { }); }); +describe('loadCliConfig directWebFetch', () => { + beforeEach(() => { + vi.resetAllMocks(); + vi.mocked(os.homedir).mockReturnValue('/mock/home/user'); + vi.stubEnv('GEMINI_API_KEY', 'test-api-key'); + vi.spyOn(ExtensionManager.prototype, 'getExtensions').mockReturnValue([]); + }); + + afterEach(() => { + vi.unstubAllEnvs(); + vi.restoreAllMocks(); + }); + + it('should be false by default when directWebFetch is not set in settings', async () => { + process.argv = ['node', 'script.js']; + const argv = await parseArguments(createTestMergedSettings()); + const settings = createTestMergedSettings(); + const config = await loadCliConfig(settings, 'test-session', argv); + expect(config.getDirectWebFetch()).toBe(false); + }); + + it('should be true when directWebFetch is set to true in settings', async () => { + process.argv = ['node', 'script.js']; + const argv = await parseArguments(createTestMergedSettings()); + const settings = createTestMergedSettings({ + experimental: { + directWebFetch: true, + }, + }); + const config = await loadCliConfig(settings, 'test-session', argv); + expect(config.getDirectWebFetch()).toBe(true); + }); +}); + describe('screenReader configuration', () => { beforeEach(() => { vi.resetAllMocks(); diff --git a/packages/cli/src/config/config.ts b/packages/cli/src/config/config.ts index 38ab62ac22..50e0c2059d 100755 --- a/packages/cli/src/config/config.ts +++ b/packages/cli/src/config/config.ts @@ -826,7 
+826,8 @@ export async function loadCliConfig( enableExtensionReloading: settings.experimental?.extensionReloading, enableAgents: settings.experimental?.enableAgents, plan: settings.experimental?.plan, - planSettings: settings.general.plan, + directWebFetch: settings.experimental?.directWebFetch, + planSettings: settings.general?.plan, enableEventDrivenScheduler: true, skillsSupport: settings.skills?.enabled ?? true, disabledSkills: settings.skills?.disabled, diff --git a/packages/cli/src/config/settingsSchema.ts b/packages/cli/src/config/settingsSchema.ts index cbe2df5f30..17c51d4e21 100644 --- a/packages/cli/src/config/settingsSchema.ts +++ b/packages/cli/src/config/settingsSchema.ts @@ -1693,6 +1693,16 @@ const SETTINGS_SCHEMA = { 'Enable model steering (user hints) to guide the model during tool execution.', showInDialog: true, }, + directWebFetch: { + type: 'boolean', + label: 'Direct Web Fetch', + category: 'Experimental', + requiresRestart: true, + default: false, + description: + 'Enable web fetch behavior that bypasses LLM summarization.', + showInDialog: true, + }, }, }, diff --git a/packages/core/src/code_assist/oauth-credential-storage.test.ts b/packages/core/src/code_assist/oauth-credential-storage.test.ts index fdde49662a..b1cb460368 100644 --- a/packages/core/src/code_assist/oauth-credential-storage.test.ts +++ b/packages/core/src/code_assist/oauth-credential-storage.test.ts @@ -28,6 +28,11 @@ vi.mock('node:fs', () => ({ readFile: vi.fn(), rm: vi.fn(), }, + createWriteStream: vi.fn(() => ({ + on: vi.fn(), + write: vi.fn(), + end: vi.fn(), + })), })); vi.mock('node:os'); vi.mock('node:path'); diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts index 42f8508697..0f03c03db0 100644 --- a/packages/core/src/config/config.ts +++ b/packages/core/src/config/config.ts @@ -470,6 +470,7 @@ export interface ConfigParameters { eventEmitter?: EventEmitter; useWriteTodos?: boolean; policyEngineConfig?: PolicyEngineConfig; + 
directWebFetch?: boolean; policyUpdateConfirmationRequest?: PolicyUpdateConfirmationRequest; output?: OutputSettings; disableModelRouterForAuth?: AuthType[]; @@ -633,6 +634,7 @@ export class Config { readonly interactive: boolean; private readonly ptyInfo: string; private readonly trustedFolder: boolean | undefined; + private readonly directWebFetch: boolean; private readonly useRipgrep: boolean; private readonly enableInteractiveShell: boolean; private readonly skipNextSpeakerCheck: boolean; @@ -826,6 +828,7 @@ export class Config { this.interactive = params.interactive ?? false; this.ptyInfo = params.ptyInfo ?? 'child_process'; this.trustedFolder = params.trustedFolder; + this.directWebFetch = params.directWebFetch ?? false; this.useRipgrep = params.useRipgrep ?? true; this.useBackgroundColor = params.useBackgroundColor ?? true; this.enableInteractiveShell = params.enableInteractiveShell ?? false; @@ -2085,6 +2088,10 @@ export class Config { return this.approvedPlanPath; } + getDirectWebFetch(): boolean { + return this.directWebFetch; + } + setApprovedPlanPath(path: string | undefined): void { this.approvedPlanPath = path; } diff --git a/packages/core/src/core/recordingContentGenerator.test.ts b/packages/core/src/core/recordingContentGenerator.test.ts index cbdb239ecf..518e8585c3 100644 --- a/packages/core/src/core/recordingContentGenerator.test.ts +++ b/packages/core/src/core/recordingContentGenerator.test.ts @@ -22,6 +22,11 @@ import { LlmRole } from '../telemetry/types.js'; vi.mock('node:fs', () => ({ appendFileSync: vi.fn(), + createWriteStream: vi.fn(() => ({ + on: vi.fn(), + write: vi.fn(), + end: vi.fn(), + })), })); describe('RecordingContentGenerator', () => { diff --git a/packages/core/src/tools/confirmation-policy.test.ts b/packages/core/src/tools/confirmation-policy.test.ts index 72b6e11e21..c6ad1f5e94 100644 --- a/packages/core/src/tools/confirmation-policy.test.ts +++ b/packages/core/src/tools/confirmation-policy.test.ts @@ -71,6 +71,7 @@ 
describe('Tool Confirmation Policy Updates', () => { isPathWithinWorkspace: () => true, getDirectories: () => [rootDir], }), + getDirectWebFetch: () => false, storage: { getProjectTempDir: () => path.join(os.tmpdir(), 'gemini-cli-temp'), }, diff --git a/packages/core/src/tools/memoryTool.test.ts b/packages/core/src/tools/memoryTool.test.ts index 654b5943c4..12cb8baa2e 100644 --- a/packages/core/src/tools/memoryTool.test.ts +++ b/packages/core/src/tools/memoryTool.test.ts @@ -37,6 +37,11 @@ vi.mock('node:fs/promises', async (importOriginal) => { vi.mock('fs', () => ({ mkdirSync: vi.fn(), + createWriteStream: vi.fn(() => ({ + on: vi.fn(), + write: vi.fn(), + end: vi.fn(), + })), })); vi.mock('os'); diff --git a/packages/core/src/tools/web-fetch.test.ts b/packages/core/src/tools/web-fetch.test.ts index 2e06a46ee5..92ba4076b2 100644 --- a/packages/core/src/tools/web-fetch.test.ts +++ b/packages/core/src/tools/web-fetch.test.ts @@ -5,7 +5,11 @@ */ import { describe, it, expect, vi, beforeEach, type Mock } from 'vitest'; -import { WebFetchTool, parsePrompt } from './web-fetch.js'; +import { + WebFetchTool, + parsePrompt, + convertGithubUrlToRaw, +} from './web-fetch.js'; import type { Config } from '../config/config.js'; import { ApprovalMode } from '../policy/types.js'; import { ToolConfirmationOutcome } from './tools.js'; @@ -55,6 +59,72 @@ vi.mock('node:crypto', () => ({ randomUUID: vi.fn(), })); +/** + * Helper to mock fetchWithTimeout with URL matching. 
+ */ +const mockFetch = (url: string, response: Partial | Error) => + vi + .spyOn(fetchUtils, 'fetchWithTimeout') + .mockImplementation(async (actualUrl) => { + if (actualUrl !== url) { + throw new Error( + `Unexpected fetch URL: expected "${url}", got "${actualUrl}"`, + ); + } + if (response instanceof Error) { + throw response; + } + + const headers = response.headers || new Headers(); + + // If we have text/arrayBuffer but no body, create a body mock + let body = response.body; + if (!body) { + let content: Uint8Array | undefined; + if (response.text) { + const text = await response.text(); + content = new TextEncoder().encode(text); + } else if (response.arrayBuffer) { + const ab = await response.arrayBuffer(); + content = new Uint8Array(ab); + } + + if (content) { + body = { + getReader: () => { + let sent = false; + return { + read: async () => { + if (sent) return { done: true, value: undefined }; + sent = true; + return { done: false, value: content }; + }, + releaseLock: () => {}, + cancel: async () => {}, + }; + }, + } as unknown as ReadableStream; + } + } + + return { + ok: response.status ? 
response.status < 400 : true, + status: 200, + headers, + text: response.text || (() => Promise.resolve('')), + arrayBuffer: + response.arrayBuffer || (() => Promise.resolve(new ArrayBuffer(0))), + body: body || { + getReader: () => ({ + read: async () => ({ done: true, value: undefined }), + releaseLock: () => {}, + cancel: async () => {}, + }), + }, + ...response, + } as unknown as Response; + }); + describe('parsePrompt', () => { it('should extract valid URLs separated by whitespace', () => { const prompt = 'Go to https://example.com and http://google.com'; @@ -128,6 +198,42 @@ describe('parsePrompt', () => { }); }); +describe('convertGithubUrlToRaw', () => { + it('should convert valid github blob urls', () => { + expect( + convertGithubUrlToRaw('https://github.com/user/repo/blob/main/README.md'), + ).toBe('https://raw.githubusercontent.com/user/repo/main/README.md'); + }); + + it('should not convert non-blob github urls', () => { + expect(convertGithubUrlToRaw('https://github.com/user/repo')).toBe( + 'https://github.com/user/repo', + ); + }); + + it('should not convert urls with similar domain names', () => { + expect( + convertGithubUrlToRaw('https://mygithub.com/user/repo/blob/main'), + ).toBe('https://mygithub.com/user/repo/blob/main'); + }); + + it('should only replace the /blob/ that separates repo from branch', () => { + expect( + convertGithubUrlToRaw('https://github.com/blob/repo/blob/main/test.ts'), + ).toBe('https://raw.githubusercontent.com/blob/repo/main/test.ts'); + }); + + it('should not convert urls if blob is not in path', () => { + expect( + convertGithubUrlToRaw('https://github.com/user/repo/tree/main'), + ).toBe('https://github.com/user/repo/tree/main'); + }); + + it('should handle invalid urls gracefully', () => { + expect(convertGithubUrlToRaw('not-a-url')).toBe('not-a-url'); + }); +}); + describe('WebFetchTool', () => { let mockConfig: Config; let bus: MessageBus; @@ -142,6 +248,7 @@ describe('WebFetchTool', () => { getProxy: vi.fn(), 
getGeminiClient: mockGetGeminiClient, getRetryFetchErrors: vi.fn().mockReturnValue(false), + getDirectWebFetch: vi.fn().mockReturnValue(false), modelConfigService: { getResolvedConfig: vi.fn().mockImplementation(({ model }) => ({ model, @@ -153,32 +260,79 @@ describe('WebFetchTool', () => { }); describe('validateToolParamValues', () => { - it.each([ - { - name: 'empty prompt', - prompt: '', - expectedError: "The 'prompt' parameter cannot be empty", - }, - { - name: 'prompt with no URLs', - prompt: 'hello world', - expectedError: "The 'prompt' must contain at least one valid URL", - }, - { - name: 'prompt with malformed URLs', - prompt: 'fetch httpshttps://example.com', - expectedError: 'Error(s) in prompt URLs:', - }, - ])('should throw if $name', ({ prompt, expectedError }) => { - const tool = new WebFetchTool(mockConfig, bus); - expect(() => tool.build({ prompt })).toThrow(expectedError); + describe('standard mode', () => { + it.each([ + { + name: 'empty prompt', + prompt: '', + expectedError: "The 'prompt' parameter cannot be empty", + }, + { + name: 'prompt with no URLs', + prompt: 'hello world', + expectedError: "The 'prompt' must contain at least one valid URL", + }, + { + name: 'prompt with malformed URLs', + prompt: 'fetch httpshttps://example.com', + expectedError: 'Error(s) in prompt URLs:', + }, + ])('should throw if $name', ({ prompt, expectedError }) => { + const tool = new WebFetchTool(mockConfig, bus); + expect(() => tool.build({ prompt })).toThrow(expectedError); + }); + + it('should pass if prompt contains at least one valid URL', () => { + const tool = new WebFetchTool(mockConfig, bus); + expect(() => + tool.build({ prompt: 'fetch https://example.com' }), + ).not.toThrow(); + }); }); - it('should pass if prompt contains at least one valid URL', () => { + describe('experimental mode', () => { + beforeEach(() => { + vi.spyOn(mockConfig, 'getDirectWebFetch').mockReturnValue(true); + }); + + it('should throw if url is missing', () => { + const tool = 
new WebFetchTool(mockConfig, bus); + expect(() => tool.build({ prompt: 'foo' })).toThrow( + "params must have required property 'url'", + ); + }); + + it('should throw if url is invalid', () => { + const tool = new WebFetchTool(mockConfig, bus); + expect(() => tool.build({ url: 'not-a-url' })).toThrow( + 'Invalid URL: "not-a-url"', + ); + }); + + it('should pass if url is valid', () => { + const tool = new WebFetchTool(mockConfig, bus); + expect(() => tool.build({ url: 'https://example.com' })).not.toThrow(); + }); + }); + }); + + describe('getSchema', () => { + it('should return standard schema by default', () => { const tool = new WebFetchTool(mockConfig, bus); - expect(() => - tool.build({ prompt: 'fetch https://example.com' }), - ).not.toThrow(); + const schema = tool.getSchema(); + expect(schema.parametersJsonSchema).toHaveProperty('properties.prompt'); + expect(schema.parametersJsonSchema).not.toHaveProperty('properties.url'); + }); + + it('should return experimental schema when enabled', () => { + vi.spyOn(mockConfig, 'getDirectWebFetch').mockReturnValue(true); + const tool = new WebFetchTool(mockConfig, bus); + const schema = tool.getSchema(); + expect(schema.parametersJsonSchema).toHaveProperty('properties.url'); + expect(schema.parametersJsonSchema).not.toHaveProperty( + 'properties.prompt', + ); + expect(schema.parametersJsonSchema).toHaveProperty('required', ['url']); }); }); @@ -205,9 +359,7 @@ describe('WebFetchTool', () => { it('should return WEB_FETCH_FALLBACK_FAILED on fallback fetch failure', async () => { vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(true); - vi.spyOn(fetchUtils, 'fetchWithTimeout').mockRejectedValue( - new Error('fetch failed'), - ); + mockFetch('https://private.ip/', new Error('fetch failed')); const tool = new WebFetchTool(mockConfig, bus); const params = { prompt: 'fetch https://private.ip' }; const invocation = tool.build(params); @@ -228,10 +380,9 @@ describe('WebFetchTool', () => { it('should log telemetry when 
falling back due to private IP', async () => { vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(true); // Mock fetchWithTimeout to succeed so fallback proceeds - vi.spyOn(fetchUtils, 'fetchWithTimeout').mockResolvedValue({ - ok: true, + mockFetch('https://private.ip/', { text: () => Promise.resolve('some content'), - } as Response); + }); mockGenerateContent.mockResolvedValue({ candidates: [{ content: { parts: [{ text: 'fallback response' }] } }], }); @@ -255,10 +406,9 @@ describe('WebFetchTool', () => { candidates: [], }); // Mock fetchWithTimeout to succeed so fallback proceeds - vi.spyOn(fetchUtils, 'fetchWithTimeout').mockResolvedValue({ - ok: true, + mockFetch('https://public.ip/', { text: () => Promise.resolve('some content'), - } as Response); + }); // Mock fallback LLM call mockGenerateContent.mockResolvedValueOnce({ candidates: [{ content: { parts: [{ text: 'fallback response' }] } }], @@ -320,11 +470,10 @@ describe('WebFetchTool', () => { ? new Headers({ 'content-type': contentType }) : new Headers(); - vi.spyOn(fetchUtils, 'fetchWithTimeout').mockResolvedValue({ - ok: true, + mockFetch('https://example.com/', { headers, text: () => Promise.resolve(content), - } as Response); + }); // Mock fallback LLM call to return the content passed to it mockGenerateContent.mockImplementationOnce(async (_, req) => ({ @@ -373,6 +522,24 @@ describe('WebFetchTool', () => { }); }); + it('should handle URL param in confirmation details', async () => { + vi.spyOn(mockConfig, 'getDirectWebFetch').mockReturnValue(true); + const tool = new WebFetchTool(mockConfig, bus); + const params = { url: 'https://example.com' }; + const invocation = tool.build(params); + const confirmationDetails = await invocation.shouldConfirmExecute( + new AbortController().signal, + ); + + expect(confirmationDetails).toEqual({ + type: 'info', + title: 'Confirm Web Fetch', + prompt: 'Fetch https://example.com', + urls: ['https://example.com'], + onConfirm: expect.any(Function), + }); + }); + 
it('should convert github urls to raw format', async () => { const tool = new WebFetchTool(mockConfig, bus); const params = { @@ -601,4 +768,170 @@ describe('WebFetchTool', () => { expect(result.llmContent).toContain('Fetched content'); }); }); + + describe('execute (experimental)', () => { + beforeEach(() => { + vi.spyOn(mockConfig, 'getDirectWebFetch').mockReturnValue(true); + vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(false); + }); + + it('should perform direct fetch and return text for plain text content', async () => { + const content = 'Plain text content'; + mockFetch('https://example.com/', { + status: 200, + headers: new Headers({ 'content-type': 'text/plain' }), + text: () => Promise.resolve(content), + }); + + const tool = new WebFetchTool(mockConfig, bus); + const params = { url: 'https://example.com' }; + const invocation = tool.build(params); + const result = await invocation.execute(new AbortController().signal); + + expect(result.llmContent).toBe(content); + expect(result.returnDisplay).toContain('Fetched text/plain content'); + expect(fetchUtils.fetchWithTimeout).toHaveBeenCalledWith( + 'https://example.com/', + expect.any(Number), + expect.objectContaining({ + headers: expect.objectContaining({ + Accept: expect.stringContaining('text/plain'), + }), + }), + ); + }); + + it('should use html-to-text and preserve links for HTML content', async () => { + const content = + 'Link'; + mockFetch('https://example.com/', { + status: 200, + headers: new Headers({ 'content-type': 'text/html' }), + text: () => Promise.resolve(content), + }); + + const tool = new WebFetchTool(mockConfig, bus); + const params = { url: 'https://example.com' }; + const invocation = tool.build(params); + await invocation.execute(new AbortController().signal); + + expect(convert).toHaveBeenCalledWith( + content, + expect.objectContaining({ + selectors: [ + expect.objectContaining({ + selector: 'a', + options: { ignoreHref: false, baseUrl: 'https://example.com/' }, + }), + ], 
+ }), + ); + }); + + it('should return base64 for image content', async () => { + const buffer = Buffer.from('fake-image-data'); + mockFetch('https://example.com/image.png', { + status: 200, + headers: new Headers({ 'content-type': 'image/png' }), + arrayBuffer: () => + Promise.resolve( + buffer.buffer.slice( + buffer.byteOffset, + buffer.byteOffset + buffer.byteLength, + ), + ), + }); + + const tool = new WebFetchTool(mockConfig, bus); + const params = { url: 'https://example.com/image.png' }; + const invocation = tool.build(params); + const result = await invocation.execute(new AbortController().signal); + + expect(result.llmContent).toEqual({ + inlineData: { + data: buffer.toString('base64'), + mimeType: 'image/png', + }, + }); + }); + + it('should return raw response info for 4xx/5xx errors', async () => { + const errorBody = 'Not Found'; + mockFetch('https://example.com/404', { + status: 404, + headers: new Headers({ 'x-test': 'val' }), + text: () => Promise.resolve(errorBody), + }); + + const tool = new WebFetchTool(mockConfig, bus); + const params = { url: 'https://example.com/404' }; + const invocation = tool.build(params); + const result = await invocation.execute(new AbortController().signal); + + expect(result.llmContent).toContain('Request failed with status 404'); + expect(result.llmContent).toContain('val'); + expect(result.llmContent).toContain(errorBody); + expect(result.returnDisplay).toContain('Failed to fetch'); + }); + + it('should throw error if Content-Length exceeds limit', async () => { + mockFetch('https://example.com/large', { + headers: new Headers({ + 'content-length': (11 * 1024 * 1024).toString(), + }), + }); + + const tool = new WebFetchTool(mockConfig, bus); + const invocation = tool.build({ url: 'https://example.com/large' }); + const result = await invocation.execute(new AbortController().signal); + + expect(result.llmContent).toContain('Error'); + expect(result.llmContent).toContain('exceeds size limit'); + }); + + it('should 
throw error if stream exceeds limit', async () => { + const largeChunk = new Uint8Array(11 * 1024 * 1024); + mockFetch('https://example.com/large-stream', { + body: { + getReader: () => ({ + read: vi + .fn() + .mockResolvedValueOnce({ done: false, value: largeChunk }) + .mockResolvedValueOnce({ done: true }), + releaseLock: vi.fn(), + cancel: vi.fn().mockResolvedValue(undefined), + }), + } as unknown as ReadableStream, + }); + + const tool = new WebFetchTool(mockConfig, bus); + const invocation = tool.build({ + url: 'https://example.com/large-stream', + }); + const result = await invocation.execute(new AbortController().signal); + + expect(result.llmContent).toContain('Error'); + expect(result.llmContent).toContain('exceeds size limit'); + }); + + it('should return error if url is missing (experimental)', async () => { + const tool = new WebFetchTool(mockConfig, bus); + // Manually bypass build() validation to test executeExperimental safety check + const invocation = tool['createInvocation']({}, bus); + const result = await invocation.execute(new AbortController().signal); + + expect(result.llmContent).toContain('Error: No URL provided.'); + expect(result.error?.type).toBe(ToolErrorType.INVALID_TOOL_PARAMS); + }); + + it('should return error if url is invalid (experimental)', async () => { + const tool = new WebFetchTool(mockConfig, bus); + // Manually bypass build() validation to test executeExperimental safety check + const invocation = tool['createInvocation']({ url: 'not-a-url' }, bus); + const result = await invocation.execute(new AbortController().signal); + + expect(result.llmContent).toContain('Error: Invalid URL "not-a-url"'); + expect(result.error?.type).toBe(ToolErrorType.INVALID_TOOL_PARAMS); + }); + }); }); diff --git a/packages/core/src/tools/web-fetch.ts b/packages/core/src/tools/web-fetch.ts index 3521ad935b..55d2474c1c 100644 --- a/packages/core/src/tools/web-fetch.ts +++ b/packages/core/src/tools/web-fetch.ts @@ -18,6 +18,7 @@ import type { 
Config } from '../config/config.js';
 import { ApprovalMode } from '../policy/types.js';
 import { getResponseText } from '../utils/partUtils.js';
 import { fetchWithTimeout, isPrivateIp } from '../utils/fetch.js';
+import { truncateString } from '../utils/textUtils.js';
 import { convert } from 'html-to-text';
 import {
   logWebFetchFallbackAttempt,
@@ -33,6 +34,10 @@ import { LRUCache } from 'mnemonist';
 
 const URL_FETCH_TIMEOUT_MS = 10000;
 const MAX_CONTENT_LENGTH = 100000;
+const MAX_EXPERIMENTAL_FETCH_SIZE = 10 * 1024 * 1024; // 10MB
+const USER_AGENT =
+  'Mozilla/5.0 (compatible; Google-Gemini-CLI/1.0; +https://github.com/google-gemini/gemini-cli)';
+const TRUNCATION_WARNING = '\n\n... [Content truncated due to size limit] ...';
 
 // Rate limiting configuration
 const RATE_LIMIT_WINDOW_MS = 60000; // 1 minute
@@ -107,6 +112,34 @@ export function parsePrompt(text: string): {
   return { validUrls, errors };
 }
 
+/**
+ * Converts a GitHub blob URL to the matching raw-content URL.
+ *
+ * Only URLs on `github.com` whose path has the form
+ * `/<owner>/<repo>/blob/<rest>` are rewritten. Any other input, including a
+ * string that cannot be parsed as a URL, is returned unchanged.
+ *
+ * @param urlStr The URL to convert.
+ * @returns The `raw.githubusercontent.com` URL, or `urlStr` unchanged.
+ */
+export function convertGithubUrlToRaw(urlStr: string): string {
+  try {
+    const url = new URL(urlStr);
+    // Rewrite only when `/blob/` directly follows `/<owner>/<repo>`. Checking
+    // for `/blob/` anywhere in the path would switch the host for paths such
+    // as `/owner/repo/tree/main/blob/x` while leaving the path untouched.
+    const rawPath = url.pathname.replace(/^\/([^/]+\/[^/]+)\/blob\//, '/$1/');
+    if (url.hostname === 'github.com' && rawPath !== url.pathname) {
+      url.hostname = 'raw.githubusercontent.com';
+      url.pathname = rawPath;
+      return url.href;
+    }
+  } catch {
+    // Not a parseable URL; fall through and return the input unchanged.
+  }
+  return urlStr;
+}
+
 // Interfaces for grounding metadata (similar to web-search.ts)
 interface GroundingChunkWeb {
   uri?: string;
@@ -135,7 +168,11 @@ export interface WebFetchToolParams {
   /**
    * The prompt containing URL(s) (up to 20) and instructions for processing their content.
    */
-  prompt: string;
+  prompt?: string;
+  /**
+   * Direct URL to fetch (experimental mode).
+ */ + url?: string; } interface ErrorWithStatus extends Error { @@ -157,21 +183,22 @@ class WebFetchToolInvocation extends BaseToolInvocation< } private async executeFallback(signal: AbortSignal): Promise { - const { validUrls: urls } = parsePrompt(this.params.prompt); + const { validUrls: urls } = parsePrompt(this.params.prompt!); // For now, we only support one URL for fallback let url = urls[0]; // Convert GitHub blob URL to raw URL - if (url.includes('github.com') && url.includes('/blob/')) { - url = url - .replace('github.com', 'raw.githubusercontent.com') - .replace('/blob/', '/'); - } + url = convertGithubUrlToRaw(url); try { const response = await retryWithBackoff( async () => { - const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS); + const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS, { + signal, + headers: { + 'User-Agent': USER_AGENT, + }, + }); if (!res.ok) { const error = new Error( `Request failed with status code ${res.status} ${res.statusText}`, @@ -186,7 +213,11 @@ class WebFetchToolInvocation extends BaseToolInvocation< }, ); - const rawContent = await response.text(); + const bodyBuffer = await this.readResponseWithLimit( + response, + MAX_EXPERIMENTAL_FETCH_SIZE, + ); + const rawContent = bodyBuffer.toString('utf8'); const contentType = response.headers.get('content-type') || ''; let textContent: string; @@ -207,7 +238,11 @@ class WebFetchToolInvocation extends BaseToolInvocation< textContent = rawContent; } - textContent = textContent.substring(0, MAX_CONTENT_LENGTH); + textContent = truncateString( + textContent, + MAX_CONTENT_LENGTH, + TRUNCATION_WARNING, + ); const geminiClient = this.config.getGeminiClient(); const fallbackPrompt = `The user requested the following: "${this.params.prompt}". 
@@ -245,10 +280,12 @@ ${textContent} } getDescription(): string { + if (this.params.url) { + return `Fetching content from: ${this.params.url}`; + } + const prompt = this.params.prompt || ''; const displayPrompt = - this.params.prompt.length > 100 - ? this.params.prompt.substring(0, 97) + '...' - : this.params.prompt; + prompt.length > 100 ? prompt.substring(0, 97) + '...' : prompt; return `Processing URLs and instructions from prompt: "${displayPrompt}"`; } @@ -261,22 +298,24 @@ ${textContent} return false; } - // Perform GitHub URL conversion here to differentiate between user-provided - // URL and the actual URL to be fetched. - const { validUrls } = parsePrompt(this.params.prompt); - const urls = validUrls.map((url) => { - if (url.includes('github.com') && url.includes('/blob/')) { - return url - .replace('github.com', 'raw.githubusercontent.com') - .replace('/blob/', '/'); - } - return url; - }); + let urls: string[] = []; + let prompt = this.params.prompt || ''; + + if (this.params.url) { + urls = [this.params.url]; + prompt = `Fetch ${this.params.url}`; + } else if (this.params.prompt) { + const { validUrls } = parsePrompt(this.params.prompt); + urls = validUrls; + } + + // Perform GitHub URL conversion here + urls = urls.map((url) => convertGithubUrlToRaw(url)); const confirmationDetails: ToolCallConfirmationDetails = { type: 'info', title: `Confirm Web Fetch`, - prompt: this.params.prompt, + prompt, urls, onConfirm: async (_outcome: ToolConfirmationOutcome) => { // Mode transitions (e.g. 
AUTO_EDIT) and policy updates are now @@ -286,8 +325,189 @@ ${textContent} return confirmationDetails; } + private async readResponseWithLimit( + response: Response, + limit: number, + ): Promise { + const contentLength = response.headers.get('content-length'); + if (contentLength && parseInt(contentLength, 10) > limit) { + throw new Error(`Content exceeds size limit of ${limit} bytes`); + } + + if (!response.body) { + return Buffer.alloc(0); + } + + const reader = response.body.getReader(); + const chunks: Uint8Array[] = []; + let totalLength = 0; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + totalLength += value.length; + if (totalLength > limit) { + // Attempt to cancel the reader to stop the stream + await reader.cancel().catch(() => {}); + throw new Error(`Content exceeds size limit of ${limit} bytes`); + } + chunks.push(value); + } + } finally { + reader.releaseLock(); + } + return Buffer.concat(chunks); + } + + private async executeExperimental(signal: AbortSignal): Promise { + if (!this.params.url) { + return { + llmContent: 'Error: No URL provided.', + returnDisplay: 'Error: No URL provided.', + error: { + message: 'No URL provided.', + type: ToolErrorType.INVALID_TOOL_PARAMS, + }, + }; + } + + let url: string; + try { + url = new URL(this.params.url).href; + } catch { + return { + llmContent: `Error: Invalid URL "${this.params.url}"`, + returnDisplay: `Error: Invalid URL "${this.params.url}"`, + error: { + message: `Invalid URL "${this.params.url}"`, + type: ToolErrorType.INVALID_TOOL_PARAMS, + }, + }; + } + + // Convert GitHub blob URL to raw URL + url = convertGithubUrlToRaw(url); + + try { + const response = await retryWithBackoff( + async () => { + const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS, { + signal, + headers: { + Accept: + 'text/markdown, text/plain;q=0.9, application/json;q=0.9, text/html;q=0.8, application/pdf;q=0.7, video/*;q=0.7, */*;q=0.5', + 'User-Agent': USER_AGENT, + }, + 
}); + return res; + }, + { + retryFetchErrors: this.config.getRetryFetchErrors(), + }, + ); + + const contentType = response.headers.get('content-type') || ''; + const status = response.status; + const bodyBuffer = await this.readResponseWithLimit( + response, + MAX_EXPERIMENTAL_FETCH_SIZE, + ); + + if (status >= 400) { + const rawResponseText = bodyBuffer.toString('utf8'); + const headers: Record = {}; + response.headers.forEach((value, key) => { + headers[key] = value; + }); + const errorContent = `Request failed with status ${status} +Headers: ${JSON.stringify(headers, null, 2)} +Response: ${truncateString(rawResponseText, 10000, '\n\n... [Error response truncated] ...')}`; + return { + llmContent: errorContent, + returnDisplay: `Failed to fetch ${url} (Status: ${status})`, + }; + } + + const lowContentType = contentType.toLowerCase(); + if ( + lowContentType.includes('text/markdown') || + lowContentType.includes('text/plain') || + lowContentType.includes('application/json') + ) { + const text = truncateString( + bodyBuffer.toString('utf8'), + MAX_CONTENT_LENGTH, + TRUNCATION_WARNING, + ); + return { + llmContent: text, + returnDisplay: `Fetched ${contentType} content from ${url}`, + }; + } + + if (lowContentType.includes('text/html')) { + const html = bodyBuffer.toString('utf8'); + const textContent = truncateString( + convert(html, { + wordwrap: false, + selectors: [ + { selector: 'a', options: { ignoreHref: false, baseUrl: url } }, + ], + }), + MAX_CONTENT_LENGTH, + TRUNCATION_WARNING, + ); + return { + llmContent: textContent, + returnDisplay: `Fetched and converted HTML content from ${url}`, + }; + } + + if ( + lowContentType.startsWith('image/') || + lowContentType.startsWith('video/') || + lowContentType === 'application/pdf' + ) { + const base64Data = bodyBuffer.toString('base64'); + return { + llmContent: { + inlineData: { + data: base64Data, + mimeType: contentType.split(';')[0], + }, + }, + returnDisplay: `Fetched ${contentType} from ${url}`, + }; + } 
+ + // Fallback for unknown types - try as text + const text = truncateString( + bodyBuffer.toString('utf8'), + MAX_CONTENT_LENGTH, + TRUNCATION_WARNING, + ); + return { + llmContent: text, + returnDisplay: `Fetched ${contentType || 'unknown'} content from ${url}`, + }; + } catch (e) { + const errorMessage = `Error during experimental fetch for ${url}: ${getErrorMessage(e)}`; + return { + llmContent: `Error: ${errorMessage}`, + returnDisplay: `Error: ${errorMessage}`, + error: { + message: errorMessage, + type: ToolErrorType.WEB_FETCH_FALLBACK_FAILED, + }, + }; + } + } + async execute(signal: AbortSignal): Promise { - const userPrompt = this.params.prompt; + if (this.config.getDirectWebFetch()) { + return this.executeExperimental(signal); + } + const userPrompt = this.params.prompt!; const { validUrls: urls } = parsePrompt(userPrompt); const url = urls[0]; @@ -475,6 +695,18 @@ export class WebFetchTool extends BaseDeclarativeTool< protected override validateToolParamValues( params: WebFetchToolParams, ): string | null { + if (this.config.getDirectWebFetch()) { + if (!params.url) { + return "The 'url' parameter is required."; + } + try { + new URL(params.url); + } catch { + return `Invalid URL: "${params.url}"`; + } + return null; + } + if (!params.prompt || params.prompt.trim() === '') { return "The 'prompt' parameter cannot be empty and must contain URL(s) and instructions."; } @@ -508,6 +740,25 @@ export class WebFetchTool extends BaseDeclarativeTool< } override getSchema(modelId?: string) { - return resolveToolDeclaration(WEB_FETCH_DEFINITION, modelId); + const schema = resolveToolDeclaration(WEB_FETCH_DEFINITION, modelId); + if (this.config.getDirectWebFetch()) { + return { + ...schema, + description: + 'Fetch content from a URL directly. Send multiple requests for this tool if multiple URL fetches are needed.', + parametersJsonSchema: { + type: 'object', + properties: { + url: { + type: 'string', + description: + 'The URL to fetch. 
Must be a valid http or https URL.', + }, + }, + required: ['url'], + }, + }; + } + return schema; } } diff --git a/packages/core/src/utils/fetch.ts b/packages/core/src/utils/fetch.ts index 3c59b2ef31..30d583e99f 100644 --- a/packages/core/src/utils/fetch.ts +++ b/packages/core/src/utils/fetch.ts @@ -41,12 +41,26 @@ export function isPrivateIp(url: string): boolean { export async function fetchWithTimeout( url: string, timeout: number, + options?: RequestInit, ): Promise { const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), timeout); + if (options?.signal) { + if (options.signal.aborted) { + controller.abort(); + } else { + options.signal.addEventListener('abort', () => controller.abort(), { + once: true, + }); + } + } + try { - const response = await fetch(url, { signal: controller.signal }); + const response = await fetch(url, { + ...options, + signal: controller.signal, + }); return response; } catch (error) { if (isNodeError(error) && error.code === 'ABORT_ERR') { diff --git a/schemas/settings.schema.json b/schemas/settings.schema.json index 7ef861d882..9cc8f1f71b 100644 --- a/schemas/settings.schema.json +++ b/schemas/settings.schema.json @@ -1629,6 +1629,13 @@ "markdownDescription": "Enable model steering (user hints) to guide the model during tool execution.\n\n- Category: `Experimental`\n- Requires restart: `no`\n- Default: `false`", "default": false, "type": "boolean" + }, + "directWebFetch": { + "title": "Direct Web Fetch", + "description": "Enable web fetch behavior that bypasses LLM summarization.", + "markdownDescription": "Enable web fetch behavior that bypasses LLM summarization.\n\n- Category: `Experimental`\n- Requires restart: `yes`\n- Default: `false`", + "default": false, + "type": "boolean" } }, "additionalProperties": false