feat(core): implement experimental direct web fetch (#19557)

This commit is contained in:
Michael Bleigh
2026-02-23 11:50:14 -08:00
committed by GitHub
parent 7cfbb6fb71
commit 70336e73b1
14 changed files with 744 additions and 65 deletions
+278 -27
View File
@@ -18,6 +18,7 @@ import type { Config } from '../config/config.js';
import { ApprovalMode } from '../policy/types.js';
import { getResponseText } from '../utils/partUtils.js';
import { fetchWithTimeout, isPrivateIp } from '../utils/fetch.js';
import { truncateString } from '../utils/textUtils.js';
import { convert } from 'html-to-text';
import {
logWebFetchFallbackAttempt,
@@ -33,6 +34,10 @@ import { LRUCache } from 'mnemonist';
const URL_FETCH_TIMEOUT_MS = 10000;
const MAX_CONTENT_LENGTH = 100000;
const MAX_EXPERIMENTAL_FETCH_SIZE = 10 * 1024 * 1024; // 10MB
const USER_AGENT =
'Mozilla/5.0 (compatible; Google-Gemini-CLI/1.0; +https://github.com/google-gemini/gemini-cli)';
const TRUNCATION_WARNING = '\n\n... [Content truncated due to size limit] ...';
// Rate limiting configuration
const RATE_LIMIT_WINDOW_MS = 60000; // 1 minute
@@ -107,6 +112,23 @@ export function parsePrompt(text: string): {
return { validUrls, errors };
}
/**
* Safely converts a GitHub blob URL to a raw content URL.
*/
export function convertGithubUrlToRaw(urlStr: string): string {
try {
const url = new URL(urlStr);
if (url.hostname === 'github.com' && url.pathname.includes('/blob/')) {
url.hostname = 'raw.githubusercontent.com';
url.pathname = url.pathname.replace(/^\/([^/]+\/[^/]+)\/blob\//, '/$1/');
return url.href;
}
} catch {
// Ignore invalid URLs
}
return urlStr;
}
// Interfaces for grounding metadata (similar to web-search.ts)
interface GroundingChunkWeb {
uri?: string;
@@ -135,7 +157,11 @@ export interface WebFetchToolParams {
/**
* The prompt containing URL(s) (up to 20) and instructions for processing their content.
*/
prompt: string;
prompt?: string;
/**
* Direct URL to fetch (experimental mode).
*/
url?: string;
}
interface ErrorWithStatus extends Error {
@@ -157,21 +183,22 @@ class WebFetchToolInvocation extends BaseToolInvocation<
}
private async executeFallback(signal: AbortSignal): Promise<ToolResult> {
const { validUrls: urls } = parsePrompt(this.params.prompt);
const { validUrls: urls } = parsePrompt(this.params.prompt!);
// For now, we only support one URL for fallback
let url = urls[0];
// Convert GitHub blob URL to raw URL
if (url.includes('github.com') && url.includes('/blob/')) {
url = url
.replace('github.com', 'raw.githubusercontent.com')
.replace('/blob/', '/');
}
url = convertGithubUrlToRaw(url);
try {
const response = await retryWithBackoff(
async () => {
const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS);
const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS, {
signal,
headers: {
'User-Agent': USER_AGENT,
},
});
if (!res.ok) {
const error = new Error(
`Request failed with status code ${res.status} ${res.statusText}`,
@@ -186,7 +213,11 @@ class WebFetchToolInvocation extends BaseToolInvocation<
},
);
const rawContent = await response.text();
const bodyBuffer = await this.readResponseWithLimit(
response,
MAX_EXPERIMENTAL_FETCH_SIZE,
);
const rawContent = bodyBuffer.toString('utf8');
const contentType = response.headers.get('content-type') || '';
let textContent: string;
@@ -207,7 +238,11 @@ class WebFetchToolInvocation extends BaseToolInvocation<
textContent = rawContent;
}
textContent = textContent.substring(0, MAX_CONTENT_LENGTH);
textContent = truncateString(
textContent,
MAX_CONTENT_LENGTH,
TRUNCATION_WARNING,
);
const geminiClient = this.config.getGeminiClient();
const fallbackPrompt = `The user requested the following: "${this.params.prompt}".
@@ -245,10 +280,12 @@ ${textContent}
}
getDescription(): string {
if (this.params.url) {
return `Fetching content from: ${this.params.url}`;
}
const prompt = this.params.prompt || '';
const displayPrompt =
this.params.prompt.length > 100
? this.params.prompt.substring(0, 97) + '...'
: this.params.prompt;
prompt.length > 100 ? prompt.substring(0, 97) + '...' : prompt;
return `Processing URLs and instructions from prompt: "${displayPrompt}"`;
}
@@ -261,22 +298,24 @@ ${textContent}
return false;
}
// Perform GitHub URL conversion here to differentiate between user-provided
// URL and the actual URL to be fetched.
const { validUrls } = parsePrompt(this.params.prompt);
const urls = validUrls.map((url) => {
if (url.includes('github.com') && url.includes('/blob/')) {
return url
.replace('github.com', 'raw.githubusercontent.com')
.replace('/blob/', '/');
}
return url;
});
let urls: string[] = [];
let prompt = this.params.prompt || '';
if (this.params.url) {
urls = [this.params.url];
prompt = `Fetch ${this.params.url}`;
} else if (this.params.prompt) {
const { validUrls } = parsePrompt(this.params.prompt);
urls = validUrls;
}
// Perform GitHub URL conversion here
urls = urls.map((url) => convertGithubUrlToRaw(url));
const confirmationDetails: ToolCallConfirmationDetails = {
type: 'info',
title: `Confirm Web Fetch`,
prompt: this.params.prompt,
prompt,
urls,
onConfirm: async (_outcome: ToolConfirmationOutcome) => {
// Mode transitions (e.g. AUTO_EDIT) and policy updates are now
@@ -286,8 +325,189 @@ ${textContent}
return confirmationDetails;
}
private async readResponseWithLimit(
response: Response,
limit: number,
): Promise<Buffer> {
const contentLength = response.headers.get('content-length');
if (contentLength && parseInt(contentLength, 10) > limit) {
throw new Error(`Content exceeds size limit of ${limit} bytes`);
}
if (!response.body) {
return Buffer.alloc(0);
}
const reader = response.body.getReader();
const chunks: Uint8Array[] = [];
let totalLength = 0;
try {
while (true) {
const { done, value } = await reader.read();
if (done) break;
totalLength += value.length;
if (totalLength > limit) {
// Attempt to cancel the reader to stop the stream
await reader.cancel().catch(() => {});
throw new Error(`Content exceeds size limit of ${limit} bytes`);
}
chunks.push(value);
}
} finally {
reader.releaseLock();
}
return Buffer.concat(chunks);
}
private async executeExperimental(signal: AbortSignal): Promise<ToolResult> {
if (!this.params.url) {
return {
llmContent: 'Error: No URL provided.',
returnDisplay: 'Error: No URL provided.',
error: {
message: 'No URL provided.',
type: ToolErrorType.INVALID_TOOL_PARAMS,
},
};
}
let url: string;
try {
url = new URL(this.params.url).href;
} catch {
return {
llmContent: `Error: Invalid URL "${this.params.url}"`,
returnDisplay: `Error: Invalid URL "${this.params.url}"`,
error: {
message: `Invalid URL "${this.params.url}"`,
type: ToolErrorType.INVALID_TOOL_PARAMS,
},
};
}
// Convert GitHub blob URL to raw URL
url = convertGithubUrlToRaw(url);
try {
const response = await retryWithBackoff(
async () => {
const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS, {
signal,
headers: {
Accept:
'text/markdown, text/plain;q=0.9, application/json;q=0.9, text/html;q=0.8, application/pdf;q=0.7, video/*;q=0.7, */*;q=0.5',
'User-Agent': USER_AGENT,
},
});
return res;
},
{
retryFetchErrors: this.config.getRetryFetchErrors(),
},
);
const contentType = response.headers.get('content-type') || '';
const status = response.status;
const bodyBuffer = await this.readResponseWithLimit(
response,
MAX_EXPERIMENTAL_FETCH_SIZE,
);
if (status >= 400) {
const rawResponseText = bodyBuffer.toString('utf8');
const headers: Record<string, string> = {};
response.headers.forEach((value, key) => {
headers[key] = value;
});
const errorContent = `Request failed with status ${status}
Headers: ${JSON.stringify(headers, null, 2)}
Response: ${truncateString(rawResponseText, 10000, '\n\n... [Error response truncated] ...')}`;
return {
llmContent: errorContent,
returnDisplay: `Failed to fetch ${url} (Status: ${status})`,
};
}
const lowContentType = contentType.toLowerCase();
if (
lowContentType.includes('text/markdown') ||
lowContentType.includes('text/plain') ||
lowContentType.includes('application/json')
) {
const text = truncateString(
bodyBuffer.toString('utf8'),
MAX_CONTENT_LENGTH,
TRUNCATION_WARNING,
);
return {
llmContent: text,
returnDisplay: `Fetched ${contentType} content from ${url}`,
};
}
if (lowContentType.includes('text/html')) {
const html = bodyBuffer.toString('utf8');
const textContent = truncateString(
convert(html, {
wordwrap: false,
selectors: [
{ selector: 'a', options: { ignoreHref: false, baseUrl: url } },
],
}),
MAX_CONTENT_LENGTH,
TRUNCATION_WARNING,
);
return {
llmContent: textContent,
returnDisplay: `Fetched and converted HTML content from ${url}`,
};
}
if (
lowContentType.startsWith('image/') ||
lowContentType.startsWith('video/') ||
lowContentType === 'application/pdf'
) {
const base64Data = bodyBuffer.toString('base64');
return {
llmContent: {
inlineData: {
data: base64Data,
mimeType: contentType.split(';')[0],
},
},
returnDisplay: `Fetched ${contentType} from ${url}`,
};
}
// Fallback for unknown types - try as text
const text = truncateString(
bodyBuffer.toString('utf8'),
MAX_CONTENT_LENGTH,
TRUNCATION_WARNING,
);
return {
llmContent: text,
returnDisplay: `Fetched ${contentType || 'unknown'} content from ${url}`,
};
} catch (e) {
const errorMessage = `Error during experimental fetch for ${url}: ${getErrorMessage(e)}`;
return {
llmContent: `Error: ${errorMessage}`,
returnDisplay: `Error: ${errorMessage}`,
error: {
message: errorMessage,
type: ToolErrorType.WEB_FETCH_FALLBACK_FAILED,
},
};
}
}
async execute(signal: AbortSignal): Promise<ToolResult> {
const userPrompt = this.params.prompt;
if (this.config.getDirectWebFetch()) {
return this.executeExperimental(signal);
}
const userPrompt = this.params.prompt!;
const { validUrls: urls } = parsePrompt(userPrompt);
const url = urls[0];
@@ -475,6 +695,18 @@ export class WebFetchTool extends BaseDeclarativeTool<
protected override validateToolParamValues(
params: WebFetchToolParams,
): string | null {
if (this.config.getDirectWebFetch()) {
if (!params.url) {
return "The 'url' parameter is required.";
}
try {
new URL(params.url);
} catch {
return `Invalid URL: "${params.url}"`;
}
return null;
}
if (!params.prompt || params.prompt.trim() === '') {
return "The 'prompt' parameter cannot be empty and must contain URL(s) and instructions.";
}
@@ -508,6 +740,25 @@ export class WebFetchTool extends BaseDeclarativeTool<
}
override getSchema(modelId?: string) {
return resolveToolDeclaration(WEB_FETCH_DEFINITION, modelId);
const schema = resolveToolDeclaration(WEB_FETCH_DEFINITION, modelId);
if (this.config.getDirectWebFetch()) {
return {
...schema,
description:
'Fetch content from a URL directly. Send multiple requests for this tool if multiple URL fetches are needed.',
parametersJsonSchema: {
type: 'object',
properties: {
url: {
type: 'string',
description:
'The URL to fetch. Must be a valid http or https URL.',
},
},
required: ['url'],
},
};
}
return schema;
}
}