fix(core): process all URLs in web_fetch instead of only the first

The web_fetch tool accepts up to 20 URLs but processed only urls[0]
in both the execute and fallback paths. execute() now iterates over
all URLs for rate-limit checks and private-IP validation, and fallback
mode fetches every URL via a new executeFallbackForUrl() helper.

Each URL receives a fair share of the content budget
(MAX_CONTENT_LENGTH / urls.length) rather than the full limit.
Abort signal is now propagated to retry logic in fallback mode.
This commit is contained in:
Bryan Morgan
2026-03-12 15:42:20 -04:00
parent 333475c41f
commit b98de3fc5a
+118 -81
View File
@@ -214,79 +214,112 @@ class WebFetchToolInvocation extends BaseToolInvocation<
);
}
private async executeFallback(signal: AbortSignal): Promise<ToolResult> {
const { validUrls: urls } = parsePrompt(this.params.prompt!);
// For now, we only support one URL for fallback
let url = urls[0];
private async executeFallbackForUrl(
url: string,
perUrlContentBudget: number,
signal: AbortSignal,
): Promise<string> {
// Convert GitHub blob URL to raw URL
url = convertGithubUrlToRaw(url);
try {
const response = await retryWithBackoff(
async () => {
const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS, {
signal,
headers: {
'User-Agent': USER_AGENT,
},
});
if (!res.ok) {
const error = new Error(
`Request failed with status code ${res.status} ${res.statusText}`,
);
(error as ErrorWithStatus).status = res.status;
throw error;
}
return res;
},
{
retryFetchErrors: this.config.getRetryFetchErrors(),
onRetry: (attempt, error, delayMs) =>
this.handleRetry(attempt, error, delayMs),
},
);
const bodyBuffer = await this.readResponseWithLimit(
response,
MAX_EXPERIMENTAL_FETCH_SIZE,
);
const rawContent = bodyBuffer.toString('utf8');
const contentType = response.headers.get('content-type') || '';
let textContent: string;
// Only use html-to-text if content type is HTML, or if no content type is provided (assume HTML)
if (
contentType.toLowerCase().includes('text/html') ||
contentType === ''
) {
textContent = convert(rawContent, {
wordwrap: false,
selectors: [
{ selector: 'a', options: { ignoreHref: true } },
{ selector: 'img', format: 'skip' },
],
const response = await retryWithBackoff(
async () => {
const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS, {
signal,
headers: {
'User-Agent': USER_AGENT,
},
});
} else {
// For other content types (text/plain, application/json, etc.), use raw text
textContent = rawContent;
if (!res.ok) {
const error = new Error(
`Request failed with status code ${res.status} ${res.statusText}`,
);
(error as ErrorWithStatus).status = res.status;
throw error;
}
return res;
},
{
retryFetchErrors: this.config.getRetryFetchErrors(),
onRetry: (attempt, error, delayMs) =>
this.handleRetry(attempt, error, delayMs),
signal,
},
);
const bodyBuffer = await this.readResponseWithLimit(
response,
MAX_EXPERIMENTAL_FETCH_SIZE,
);
const rawContent = bodyBuffer.toString('utf8');
const contentType = response.headers.get('content-type') || '';
let textContent: string;
// Only use html-to-text if content type is HTML, or if no content type is provided (assume HTML)
if (contentType.toLowerCase().includes('text/html') || contentType === '') {
textContent = convert(rawContent, {
wordwrap: false,
selectors: [
{ selector: 'a', options: { ignoreHref: true } },
{ selector: 'img', format: 'skip' },
],
});
} else {
// For other content types (text/plain, application/json, etc.), use raw text
textContent = rawContent;
}
return truncateString(textContent, perUrlContentBudget, TRUNCATION_WARNING);
}
private async executeFallback(signal: AbortSignal): Promise<ToolResult> {
const { validUrls: urls } = parsePrompt(this.params.prompt!);
const perUrlContentBudget = Math.floor(
MAX_CONTENT_LENGTH / Math.max(urls.length, 1),
);
const contentParts: string[] = [];
const fetchedUrls: string[] = [];
const errors: string[] = [];
for (const rawUrl of urls) {
try {
const textContent = await this.executeFallbackForUrl(
rawUrl,
perUrlContentBudget,
signal,
);
contentParts.push(
`--- Content from ${rawUrl} ---\n${textContent}\n---`,
);
fetchedUrls.push(rawUrl);
} catch (e) {
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
const error = e as Error;
errors.push(`Error fetching ${rawUrl}: ${error.message}`);
}
}
textContent = truncateString(
textContent,
MAX_CONTENT_LENGTH,
TRUNCATION_WARNING,
);
if (fetchedUrls.length === 0) {
const errorMessage = `Error during fallback fetch: ${errors.join('; ')}`;
return {
llmContent: `Error: ${errorMessage}`,
returnDisplay: `Error: ${errorMessage}`,
error: {
message: errorMessage,
type: ToolErrorType.WEB_FETCH_FALLBACK_FAILED,
},
};
}
try {
const geminiClient = this.config.getGeminiClient();
const fallbackPrompt = `The user requested the following: "${this.params.prompt}".
I was unable to access the URL directly. Instead, I have fetched the raw content of the page. Please use the following content to answer the request. Do not attempt to access the URL again.
I was unable to access the URL(s) directly. Instead, I have fetched the raw content. Please use the following content to answer the request. Do not attempt to access the URLs again.
---
${textContent}
---
`;
${contentParts.join('\n\n')}
${errors.length > 0 ? `\nNote: Some URLs could not be fetched: ${errors.join('; ')}` : ''}`;
const result = await geminiClient.generateContent(
{ model: 'web-fetch-fallback' },
[{ role: 'user', parts: [{ text: fallbackPrompt }] }],
@@ -296,12 +329,12 @@ ${textContent}
const resultText = getResponseText(result) || '';
return {
llmContent: resultText,
returnDisplay: `Content for ${url} processed using fallback fetch.`,
returnDisplay: `Content for ${fetchedUrls.join(', ')} processed using fallback fetch.`,
};
} catch (e) {
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
const error = e as Error;
const errorMessage = `Error during fallback fetch for ${url}: ${error.message}`;
const errorMessage = `Error during fallback fetch: ${error.message}`;
return {
llmContent: `Error: ${errorMessage}`,
returnDisplay: `Error: ${errorMessage}`,
@@ -560,27 +593,31 @@ Response: ${truncateString(rawResponseText, 10000, '\n\n... [Error response trun
}
const userPrompt = this.params.prompt!;
const { validUrls: urls } = parsePrompt(userPrompt);
const url = urls[0];
// Enforce rate limiting
const rateLimitResult = checkRateLimit(url);
if (!rateLimitResult.allowed) {
const waitTimeSecs = Math.ceil((rateLimitResult.waitTimeMs || 0) / 1000);
const errorMessage = `Rate limit exceeded for host. Please wait ${waitTimeSecs} seconds before trying again.`;
debugLogger.warn(`[WebFetchTool] Rate limit exceeded for ${url}`);
return {
llmContent: `Error: ${errorMessage}`,
returnDisplay: `Error: ${errorMessage}`,
error: {
message: errorMessage,
type: ToolErrorType.WEB_FETCH_PROCESSING_ERROR,
},
};
// Enforce rate limiting for all URLs
for (const url of urls) {
const rateLimitResult = checkRateLimit(url);
if (!rateLimitResult.allowed) {
const waitTimeSecs = Math.ceil(
(rateLimitResult.waitTimeMs || 0) / 1000,
);
const errorMessage = `Rate limit exceeded for host. Please wait ${waitTimeSecs} seconds before trying again.`;
debugLogger.warn(`[WebFetchTool] Rate limit exceeded for ${url}`);
return {
llmContent: `Error: ${errorMessage}`,
returnDisplay: `Error: ${errorMessage}`,
error: {
message: errorMessage,
type: ToolErrorType.WEB_FETCH_PROCESSING_ERROR,
},
};
}
}
const isPrivate = isPrivateIp(url);
// Check for private IPs across all URLs
const hasPrivateIp = urls.some((url) => isPrivateIp(url));
if (isPrivate) {
if (hasPrivateIp) {
logWebFetchFallbackAttempt(
this.config,
new WebFetchFallbackAttemptEvent('private_ip'),