mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-12 12:54:07 -07:00
fix(core): process all URLs in web_fetch instead of only the first
The tool accepts up to 20 URLs but only processed urls[0] in both execute and fallback paths. Now iterates all URLs for rate-limit checks and fetches all in fallback mode.
This commit is contained in:
@@ -141,7 +141,7 @@ describe('WebFetchTool', () => {
|
|||||||
setApprovalMode: vi.fn(),
|
setApprovalMode: vi.fn(),
|
||||||
getProxy: vi.fn(),
|
getProxy: vi.fn(),
|
||||||
getGeminiClient: mockGetGeminiClient,
|
getGeminiClient: mockGetGeminiClient,
|
||||||
getRetryFetchErrors: vi.fn().mockReturnValue(false),
|
getRetryFetchErrors: vi.fn().mockReturnValue(true),
|
||||||
modelConfigService: {
|
modelConfigService: {
|
||||||
getResolvedConfig: vi.fn().mockImplementation(({ model }) => ({
|
getResolvedConfig: vi.fn().mockImplementation(({ model }) => ({
|
||||||
model,
|
model,
|
||||||
@@ -208,6 +208,8 @@ describe('WebFetchTool', () => {
|
|||||||
vi.spyOn(fetchUtils, 'fetchWithTimeout').mockRejectedValue(
|
vi.spyOn(fetchUtils, 'fetchWithTimeout').mockRejectedValue(
|
||||||
new Error('fetch failed'),
|
new Error('fetch failed'),
|
||||||
);
|
);
|
||||||
|
// Disable retries so test doesn't timeout waiting for backoff
|
||||||
|
vi.mocked(mockConfig.getRetryFetchErrors).mockReturnValue(false);
|
||||||
const tool = new WebFetchTool(mockConfig, bus);
|
const tool = new WebFetchTool(mockConfig, bus);
|
||||||
const params = { prompt: 'fetch https://private.ip' };
|
const params = { prompt: 'fetch https://private.ip' };
|
||||||
const invocation = tool.build(params);
|
const invocation = tool.build(params);
|
||||||
|
|||||||
@@ -31,8 +31,8 @@ import { WEB_FETCH_DEFINITION } from './definitions/coreTools.js';
|
|||||||
import { resolveToolDeclaration } from './definitions/resolver.js';
|
import { resolveToolDeclaration } from './definitions/resolver.js';
|
||||||
import { LRUCache } from 'mnemonist';
|
import { LRUCache } from 'mnemonist';
|
||||||
|
|
||||||
const URL_FETCH_TIMEOUT_MS = 10000;
|
const URL_FETCH_TIMEOUT_MS = 30000;
|
||||||
const MAX_CONTENT_LENGTH = 100000;
|
const MAX_CONTENT_LENGTH = 200000;
|
||||||
|
|
||||||
// Rate limiting configuration
|
// Rate limiting configuration
|
||||||
const RATE_LIMIT_WINDOW_MS = 60000; // 1 minute
|
const RATE_LIMIT_WINDOW_MS = 60000; // 1 minute
|
||||||
@@ -156,67 +156,97 @@ class WebFetchToolInvocation extends BaseToolInvocation<
|
|||||||
super(params, messageBus, _toolName, _toolDisplayName);
|
super(params, messageBus, _toolName, _toolDisplayName);
|
||||||
}
|
}
|
||||||
|
|
||||||
private async executeFallback(signal: AbortSignal): Promise<ToolResult> {
|
private async executeFallbackForUrl(
|
||||||
const { validUrls: urls } = parsePrompt(this.params.prompt);
|
url: string,
|
||||||
// For now, we only support one URL for fallback
|
_signal: AbortSignal,
|
||||||
let url = urls[0];
|
): Promise<{ content: string; error?: string }> {
|
||||||
|
|
||||||
// Convert GitHub blob URL to raw URL
|
// Convert GitHub blob URL to raw URL
|
||||||
if (url.includes('github.com') && url.includes('/blob/')) {
|
let fetchUrl = url;
|
||||||
url = url
|
if (fetchUrl.includes('github.com') && fetchUrl.includes('/blob/')) {
|
||||||
|
fetchUrl = fetchUrl
|
||||||
.replace('github.com', 'raw.githubusercontent.com')
|
.replace('github.com', 'raw.githubusercontent.com')
|
||||||
.replace('/blob/', '/');
|
.replace('/blob/', '/');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const response = await retryWithBackoff(
|
||||||
|
async () => {
|
||||||
|
const res = await fetchWithTimeout(fetchUrl, URL_FETCH_TIMEOUT_MS);
|
||||||
|
if (!res.ok) {
|
||||||
|
const error = new Error(
|
||||||
|
`Request failed with status code ${res.status} ${res.statusText}`,
|
||||||
|
);
|
||||||
|
(error as ErrorWithStatus).status = res.status;
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
},
|
||||||
|
{
|
||||||
|
retryFetchErrors: this.config.getRetryFetchErrors(),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
const rawContent = await response.text();
|
||||||
|
const contentType = response.headers.get('content-type') || '';
|
||||||
|
let textContent: string;
|
||||||
|
|
||||||
|
// Only use html-to-text if content type is HTML, or if no content type is provided (assume HTML)
|
||||||
|
if (contentType.toLowerCase().includes('text/html') || contentType === '') {
|
||||||
|
textContent = convert(rawContent, {
|
||||||
|
wordwrap: false,
|
||||||
|
selectors: [
|
||||||
|
{ selector: 'a', options: { ignoreHref: true } },
|
||||||
|
{ selector: 'img', format: 'skip' },
|
||||||
|
],
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
// For other content types (text/plain, application/json, etc.), use raw text
|
||||||
|
textContent = rawContent;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Per-URL content budget is the total budget divided by number of URLs
|
||||||
|
textContent = textContent.substring(0, MAX_CONTENT_LENGTH);
|
||||||
|
return { content: textContent };
|
||||||
|
}
|
||||||
|
|
||||||
|
private async executeFallback(signal: AbortSignal): Promise<ToolResult> {
|
||||||
|
const { validUrls: urls } = parsePrompt(this.params.prompt);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await retryWithBackoff(
|
const allContent: string[] = [];
|
||||||
async () => {
|
const fetchedUrls: string[] = [];
|
||||||
const res = await fetchWithTimeout(url, URL_FETCH_TIMEOUT_MS);
|
const errors: string[] = [];
|
||||||
if (!res.ok) {
|
|
||||||
const error = new Error(
|
|
||||||
`Request failed with status code ${res.status} ${res.statusText}`,
|
|
||||||
);
|
|
||||||
(error as ErrorWithStatus).status = res.status;
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
},
|
|
||||||
{
|
|
||||||
retryFetchErrors: this.config.getRetryFetchErrors(),
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
const rawContent = await response.text();
|
for (const url of urls) {
|
||||||
const contentType = response.headers.get('content-type') || '';
|
try {
|
||||||
let textContent: string;
|
const result = await this.executeFallbackForUrl(url, signal);
|
||||||
|
allContent.push(`--- Content from ${url} ---\n${result.content}`);
|
||||||
// Only use html-to-text if content type is HTML, or if no content type is provided (assume HTML)
|
fetchedUrls.push(url);
|
||||||
if (
|
} catch (e) {
|
||||||
contentType.toLowerCase().includes('text/html') ||
|
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
|
||||||
contentType === ''
|
const error = e as Error;
|
||||||
) {
|
errors.push(`Error fetching ${url}: ${error.message}`);
|
||||||
textContent = convert(rawContent, {
|
}
|
||||||
wordwrap: false,
|
|
||||||
selectors: [
|
|
||||||
{ selector: 'a', options: { ignoreHref: true } },
|
|
||||||
{ selector: 'img', format: 'skip' },
|
|
||||||
],
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
// For other content types (text/plain, application/json, etc.), use raw text
|
|
||||||
textContent = rawContent;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
textContent = textContent.substring(0, MAX_CONTENT_LENGTH);
|
if (allContent.length === 0) {
|
||||||
|
const errorMessage = `Error during fallback fetch: ${errors.join('; ')}`;
|
||||||
|
return {
|
||||||
|
llmContent: `Error: ${errorMessage}`,
|
||||||
|
returnDisplay: `Error: ${errorMessage}`,
|
||||||
|
error: {
|
||||||
|
message: errorMessage,
|
||||||
|
type: ToolErrorType.WEB_FETCH_FALLBACK_FAILED,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
const combinedContent = allContent.join('\n\n');
|
||||||
const geminiClient = this.config.getGeminiClient();
|
const geminiClient = this.config.getGeminiClient();
|
||||||
const fallbackPrompt = `The user requested the following: "${this.params.prompt}".
|
const fallbackPrompt = `The user requested the following: "${this.params.prompt}".
|
||||||
|
|
||||||
I was unable to access the URL directly. Instead, I have fetched the raw content of the page. Please use the following content to answer the request. Do not attempt to access the URL again.
|
I was unable to access the URL(s) directly. Instead, I have fetched the raw content of the page(s). Please use the following content to answer the request. Do not attempt to access the URLs again.
|
||||||
|
|
||||||
---
|
${combinedContent}
|
||||||
${textContent}
|
|
||||||
---
|
|
||||||
`;
|
`;
|
||||||
const result = await geminiClient.generateContent(
|
const result = await geminiClient.generateContent(
|
||||||
{ model: 'web-fetch-fallback' },
|
{ model: 'web-fetch-fallback' },
|
||||||
@@ -225,14 +255,15 @@ ${textContent}
|
|||||||
LlmRole.UTILITY_TOOL,
|
LlmRole.UTILITY_TOOL,
|
||||||
);
|
);
|
||||||
const resultText = getResponseText(result) || '';
|
const resultText = getResponseText(result) || '';
|
||||||
|
const displayUrls = fetchedUrls.join(', ');
|
||||||
return {
|
return {
|
||||||
llmContent: resultText,
|
llmContent: resultText,
|
||||||
returnDisplay: `Content for ${url} processed using fallback fetch.`,
|
returnDisplay: `Content for ${displayUrls} processed using fallback fetch.`,
|
||||||
};
|
};
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
|
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
|
||||||
const error = e as Error;
|
const error = e as Error;
|
||||||
const errorMessage = `Error during fallback fetch for ${url}: ${error.message}`;
|
const errorMessage = `Error during fallback fetch: ${error.message}`;
|
||||||
return {
|
return {
|
||||||
llmContent: `Error: ${errorMessage}`,
|
llmContent: `Error: ${errorMessage}`,
|
||||||
returnDisplay: `Error: ${errorMessage}`,
|
returnDisplay: `Error: ${errorMessage}`,
|
||||||
@@ -289,25 +320,28 @@ ${textContent}
|
|||||||
async execute(signal: AbortSignal): Promise<ToolResult> {
|
async execute(signal: AbortSignal): Promise<ToolResult> {
|
||||||
const userPrompt = this.params.prompt;
|
const userPrompt = this.params.prompt;
|
||||||
const { validUrls: urls } = parsePrompt(userPrompt);
|
const { validUrls: urls } = parsePrompt(userPrompt);
|
||||||
const url = urls[0];
|
|
||||||
|
|
||||||
// Enforce rate limiting
|
// Enforce rate limiting for all URLs
|
||||||
const rateLimitResult = checkRateLimit(url);
|
for (const url of urls) {
|
||||||
if (!rateLimitResult.allowed) {
|
const rateLimitResult = checkRateLimit(url);
|
||||||
const waitTimeSecs = Math.ceil((rateLimitResult.waitTimeMs || 0) / 1000);
|
if (!rateLimitResult.allowed) {
|
||||||
const errorMessage = `Rate limit exceeded for host. Please wait ${waitTimeSecs} seconds before trying again.`;
|
const waitTimeSecs = Math.ceil(
|
||||||
debugLogger.warn(`[WebFetchTool] Rate limit exceeded for ${url}`);
|
(rateLimitResult.waitTimeMs || 0) / 1000,
|
||||||
return {
|
);
|
||||||
llmContent: `Error: ${errorMessage}`,
|
const errorMessage = `Rate limit exceeded for host. Please wait ${waitTimeSecs} seconds before trying again.`;
|
||||||
returnDisplay: `Error: ${errorMessage}`,
|
debugLogger.warn(`[WebFetchTool] Rate limit exceeded for ${url}`);
|
||||||
error: {
|
return {
|
||||||
message: errorMessage,
|
llmContent: `Error: ${errorMessage}`,
|
||||||
type: ToolErrorType.WEB_FETCH_PROCESSING_ERROR,
|
returnDisplay: `Error: ${errorMessage}`,
|
||||||
},
|
error: {
|
||||||
};
|
message: errorMessage,
|
||||||
|
type: ToolErrorType.WEB_FETCH_PROCESSING_ERROR,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const isPrivate = isPrivateIp(url);
|
const isPrivate = urls.some((url) => isPrivateIp(url));
|
||||||
|
|
||||||
if (isPrivate) {
|
if (isPrivate) {
|
||||||
logWebFetchFallbackAttempt(
|
logWebFetchFallbackAttempt(
|
||||||
|
|||||||
Reference in New Issue
Block a user