fix(core): process all URLs in web_fetch instead of only the first

The tool accepts up to 20 URLs but only processed urls[0] in both
execute and fallback paths. Now iterates all URLs for rate-limit
checks and fetches all in fallback mode.
This commit is contained in:
Bryan Morgan
2026-02-22 10:06:12 -05:00
parent 1be6076082
commit 8d8841150f
2 changed files with 104 additions and 68 deletions
+3 -1
View File
@@ -141,7 +141,7 @@ describe('WebFetchTool', () => {
setApprovalMode: vi.fn(), setApprovalMode: vi.fn(),
getProxy: vi.fn(), getProxy: vi.fn(),
getGeminiClient: mockGetGeminiClient, getGeminiClient: mockGetGeminiClient,
getRetryFetchErrors: vi.fn().mockReturnValue(false), getRetryFetchErrors: vi.fn().mockReturnValue(true),
modelConfigService: { modelConfigService: {
getResolvedConfig: vi.fn().mockImplementation(({ model }) => ({ getResolvedConfig: vi.fn().mockImplementation(({ model }) => ({
model, model,
@@ -208,6 +208,8 @@ describe('WebFetchTool', () => {
vi.spyOn(fetchUtils, 'fetchWithTimeout').mockRejectedValue( vi.spyOn(fetchUtils, 'fetchWithTimeout').mockRejectedValue(
new Error('fetch failed'), new Error('fetch failed'),
); );
// Disable retries so test doesn't timeout waiting for backoff
vi.mocked(mockConfig.getRetryFetchErrors).mockReturnValue(false);
const tool = new WebFetchTool(mockConfig, bus); const tool = new WebFetchTool(mockConfig, bus);
const params = { prompt: 'fetch https://private.ip' }; const params = { prompt: 'fetch https://private.ip' };
const invocation = tool.build(params); const invocation = tool.build(params);
+101 -67
View File
@@ -31,8 +31,8 @@ import { WEB_FETCH_DEFINITION } from './definitions/coreTools.js';
import { resolveToolDeclaration } from './definitions/resolver.js'; import { resolveToolDeclaration } from './definitions/resolver.js';
import { LRUCache } from 'mnemonist'; import { LRUCache } from 'mnemonist';
const URL_FETCH_TIMEOUT_MS = 10000; const URL_FETCH_TIMEOUT_MS = 30000;
const MAX_CONTENT_LENGTH = 100000; const MAX_CONTENT_LENGTH = 200000;
// Rate limiting configuration // Rate limiting configuration
const RATE_LIMIT_WINDOW_MS = 60000; // 1 minute const RATE_LIMIT_WINDOW_MS = 60000; // 1 minute
@@ -156,67 +156,97 @@ class WebFetchToolInvocation extends BaseToolInvocation<
super(params, messageBus, _toolName, _toolDisplayName); super(params, messageBus, _toolName, _toolDisplayName);
} }
private async executeFallback(signal: AbortSignal): Promise<ToolResult> { private async executeFallbackForUrl(
const { validUrls: urls } = parsePrompt(this.params.prompt); url: string,
// For now, we only support one URL for fallback _signal: AbortSignal,
let url = urls[0]; ): Promise<{ content: string; error?: string }> {
// Convert GitHub blob URL to raw URL // Convert GitHub blob URL to raw URL
if (url.includes('github.com') && url.includes('/blob/')) { let fetchUrl = url;
url = url if (fetchUrl.includes('github.com') && fetchUrl.includes('/blob/')) {
fetchUrl = fetchUrl
.replace('github.com', 'raw.githubusercontent.com') .replace('github.com', 'raw.githubusercontent.com')
.replace('/blob/', '/'); .replace('/blob/', '/');
} }
const response = await retryWithBackoff(
async () => {
const res = await fetchWithTimeout(fetchUrl, URL_FETCH_TIMEOUT_MS);
if (!res.ok) {
const error = new Error(
`Request failed with status code ${res.status} ${res.statusText}`,
);
(error as ErrorWithStatus).status = res.status;
throw error;
}
return res;
},
{
retryFetchErrors: this.config.getRetryFetchErrors(),
},
);
const rawContent = await response.text();
const contentType = response.headers.get('content-type') || '';
let textContent: string;
// Only use html-to-text if content type is HTML, or if no content type is provided (assume HTML)
if (contentType.toLowerCase().includes('text/html') || contentType === '') {
textContent = convert(rawContent, {
wordwrap: false,
selectors: [
{ selector: 'a', options: { ignoreHref: true } },
{ selector: 'img', format: 'skip' },
],
});
} else {
// For other content types (text/plain, application/json, etc.), use raw text
textContent = rawContent;
}
// Per-URL content budget is the total budget divided by number of URLs
textContent = textContent.substring(0, MAX_CONTENT_LENGTH);
return { content: textContent };
}
private async executeFallback(signal: AbortSignal): Promise<ToolResult> {
const { validUrls: urls } = parsePrompt(this.params.prompt);
try { try {
const response = await retryWithBackoff( const allContent: string[] = [];
async () => { const fetchedUrls: string[] = [];
const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS); const errors: string[] = [];
if (!res.ok) {
const error = new Error(
`Request failed with status code ${res.status} ${res.statusText}`,
);
(error as ErrorWithStatus).status = res.status;
throw error;
}
return res;
},
{
retryFetchErrors: this.config.getRetryFetchErrors(),
},
);
const rawContent = await response.text(); for (const url of urls) {
const contentType = response.headers.get('content-type') || ''; try {
let textContent: string; const result = await this.executeFallbackForUrl(url, signal);
allContent.push(`--- Content from ${url} ---\n${result.content}`);
// Only use html-to-text if content type is HTML, or if no content type is provided (assume HTML) fetchedUrls.push(url);
if ( } catch (e) {
contentType.toLowerCase().includes('text/html') || // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
contentType === '' const error = e as Error;
) { errors.push(`Error fetching ${url}: ${error.message}`);
textContent = convert(rawContent, { }
wordwrap: false,
selectors: [
{ selector: 'a', options: { ignoreHref: true } },
{ selector: 'img', format: 'skip' },
],
});
} else {
// For other content types (text/plain, application/json, etc.), use raw text
textContent = rawContent;
} }
textContent = textContent.substring(0, MAX_CONTENT_LENGTH); if (allContent.length === 0) {
const errorMessage = `Error during fallback fetch: ${errors.join('; ')}`;
return {
llmContent: `Error: ${errorMessage}`,
returnDisplay: `Error: ${errorMessage}`,
error: {
message: errorMessage,
type: ToolErrorType.WEB_FETCH_FALLBACK_FAILED,
},
};
}
const combinedContent = allContent.join('\n\n');
const geminiClient = this.config.getGeminiClient(); const geminiClient = this.config.getGeminiClient();
const fallbackPrompt = `The user requested the following: "${this.params.prompt}". const fallbackPrompt = `The user requested the following: "${this.params.prompt}".
I was unable to access the URL directly. Instead, I have fetched the raw content of the page. Please use the following content to answer the request. Do not attempt to access the URL again. I was unable to access the URL(s) directly. Instead, I have fetched the raw content of the page(s). Please use the following content to answer the request. Do not attempt to access the URLs again.
--- ${combinedContent}
${textContent}
---
`; `;
const result = await geminiClient.generateContent( const result = await geminiClient.generateContent(
{ model: 'web-fetch-fallback' }, { model: 'web-fetch-fallback' },
@@ -225,14 +255,15 @@ ${textContent}
LlmRole.UTILITY_TOOL, LlmRole.UTILITY_TOOL,
); );
const resultText = getResponseText(result) || ''; const resultText = getResponseText(result) || '';
const displayUrls = fetchedUrls.join(', ');
return { return {
llmContent: resultText, llmContent: resultText,
returnDisplay: `Content for ${url} processed using fallback fetch.`, returnDisplay: `Content for ${displayUrls} processed using fallback fetch.`,
}; };
} catch (e) { } catch (e) {
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
const error = e as Error; const error = e as Error;
const errorMessage = `Error during fallback fetch for ${url}: ${error.message}`; const errorMessage = `Error during fallback fetch: ${error.message}`;
return { return {
llmContent: `Error: ${errorMessage}`, llmContent: `Error: ${errorMessage}`,
returnDisplay: `Error: ${errorMessage}`, returnDisplay: `Error: ${errorMessage}`,
@@ -289,25 +320,28 @@ ${textContent}
async execute(signal: AbortSignal): Promise<ToolResult> { async execute(signal: AbortSignal): Promise<ToolResult> {
const userPrompt = this.params.prompt; const userPrompt = this.params.prompt;
const { validUrls: urls } = parsePrompt(userPrompt); const { validUrls: urls } = parsePrompt(userPrompt);
const url = urls[0];
// Enforce rate limiting // Enforce rate limiting for all URLs
const rateLimitResult = checkRateLimit(url); for (const url of urls) {
if (!rateLimitResult.allowed) { const rateLimitResult = checkRateLimit(url);
const waitTimeSecs = Math.ceil((rateLimitResult.waitTimeMs || 0) / 1000); if (!rateLimitResult.allowed) {
const errorMessage = `Rate limit exceeded for host. Please wait ${waitTimeSecs} seconds before trying again.`; const waitTimeSecs = Math.ceil(
debugLogger.warn(`[WebFetchTool] Rate limit exceeded for ${url}`); (rateLimitResult.waitTimeMs || 0) / 1000,
return { );
llmContent: `Error: ${errorMessage}`, const errorMessage = `Rate limit exceeded for host. Please wait ${waitTimeSecs} seconds before trying again.`;
returnDisplay: `Error: ${errorMessage}`, debugLogger.warn(`[WebFetchTool] Rate limit exceeded for ${url}`);
error: { return {
message: errorMessage, llmContent: `Error: ${errorMessage}`,
type: ToolErrorType.WEB_FETCH_PROCESSING_ERROR, returnDisplay: `Error: ${errorMessage}`,
}, error: {
}; message: errorMessage,
type: ToolErrorType.WEB_FETCH_PROCESSING_ERROR,
},
};
}
} }
const isPrivate = isPrivateIp(url); const isPrivate = urls.some((url) => isPrivateIp(url));
if (isPrivate) { if (isPrivate) {
logWebFetchFallbackAttempt( logWebFetchFallbackAttempt(