refactor(core): simplify webfetch logic for easier review

- Streamlined execute loop with integrated rate limiting and SSRF checks.
- Simplified fallback to all-or-nothing mode.
- Inlined grounding and source list formatting for a cleaner diff.
- Resolved ESLint unsafe type assertion issues using runtime type guards.
This commit is contained in:
Aishanee Shah
2026-03-09 19:50:39 +00:00
parent 0e850622a8
commit d783ccc43b
2 changed files with 136 additions and 352 deletions

View File

@@ -9,7 +9,6 @@ import {
WebFetchTool,
parsePrompt,
convertGithubUrlToRaw,
normalizeUrl,
} from './web-fetch.js';
import type { Config } from '../config/config.js';
import { ApprovalMode } from '../policy/types.js';
@@ -199,35 +198,6 @@ describe('parsePrompt', () => {
});
});
describe('normalizeUrl', () => {
it('should lowercase hostname', () => {
expect(normalizeUrl('https://EXAMPLE.COM')).toBe('https://example.com/');
});
it('should remove trailing slashes from path', () => {
expect(normalizeUrl('https://example.com/path/')).toBe(
'https://example.com/path',
);
});
it('should not remove trailing slash from root', () => {
expect(normalizeUrl('https://example.com/')).toBe('https://example.com/');
});
it('should remove default ports', () => {
expect(normalizeUrl('http://example.com:80/')).toBe('http://example.com/');
expect(normalizeUrl('https://example.com:443/')).toBe(
'https://example.com/',
);
});
it('should keep non-default ports', () => {
expect(normalizeUrl('http://example.com:8080/')).toBe(
'http://example.com:8080/',
);
});
});
describe('convertGithubUrlToRaw', () => {
it('should convert valid github blob urls', () => {
expect(
@@ -384,7 +354,9 @@ describe('WebFetchTool', () => {
// The 11th time should fail due to rate limit
const result = await invocation.execute(new AbortController().signal);
expect(result.error?.type).toBe(ToolErrorType.WEB_FETCH_PROCESSING_ERROR);
expect(result.error?.message).toContain('Rate limit exceeded');
expect(result.error?.message).toContain(
'All requested URLs were skipped',
);
});
it('should skip rate-limited URLs but fetch others', async () => {
@@ -417,73 +389,63 @@ describe('WebFetchTool', () => {
const result = await invocation.execute(new AbortController().signal);
expect(result.llmContent).toContain('healthy response');
expect(result.llmContent).toContain(
'[Warning] The following URLs were skipped due to rate limiting: https://ratelimit-multi.com/',
'[Warning] The following URLs were skipped:',
);
expect(result.llmContent).toContain(
'[Rate Limit] https://ratelimit-multi.com/',
);
expect(result.returnDisplay).toContain('(1 URL(s) skipped)');
});
it('should rescue failed public URLs via fallback', async () => {
it('should fallback to all public URLs if primary fails', async () => {
vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(false);
// Primary fetch fails for one URL
mockGenerateContent.mockResolvedValueOnce({
candidates: [
{
content: { parts: [{ text: 'Only partial info' }] },
urlContextMetadata: {
urlMetadata: [
{
url: 'https://success.com/',
urlRetrievalStatus: 'URL_RETRIEVAL_STATUS_SUCCESS',
},
{
url: 'https://fail.com/',
urlRetrievalStatus: 'URL_RETRIEVAL_STATUS_FAILED',
},
],
},
},
],
// Primary fetch fails
mockGenerateContent.mockRejectedValueOnce(new Error('primary fail'));
// Mock fallback fetch for BOTH URLs
mockFetch('https://url1.com/', {
text: () => Promise.resolve('content 1'),
});
mockFetch('https://url2.com/', {
text: () => Promise.resolve('content 2'),
});
// Mock fallback fetch for the failed URL
mockFetch('https://fail.com/', {
text: () => Promise.resolve('rescued content'),
// Mock fallback LLM call
mockGenerateContent.mockResolvedValueOnce({
candidates: [
{ content: { parts: [{ text: 'fallback processed response' }] } },
],
});
const tool = new WebFetchTool(mockConfig, bus);
const params = {
prompt: 'fetch https://success.com and https://fail.com',
prompt: 'fetch https://url1.com and https://url2.com',
};
const invocation = tool.build(params);
const result = await invocation.execute(new AbortController().signal);
expect(result.llmContent).toContain('Only partial info');
expect(result.llmContent).toContain('--- Rescued Content ---');
expect(result.llmContent).toContain('URL: https://fail.com/');
expect(result.llmContent).toContain('rescued content');
expect(result.llmContent).toBe('fallback processed response');
expect(result.returnDisplay).toContain(
'2 URL(s) processed using fallback fetch',
);
});
it('should NOT rescue private URLs via fallback but still fetch public ones', async () => {
it('should NOT include private URLs in fallback', async () => {
vi.mocked(fetchUtils.isPrivateIp).mockImplementation(
(url) => url === 'https://private.com/',
);
// Primary fetch for public URL
// Primary fetch fails
mockGenerateContent.mockRejectedValueOnce(new Error('primary fail'));
// Mock fallback fetch only for public URL
mockFetch('https://public.com/', {
text: () => Promise.resolve('public content'),
});
// Mock fallback LLM call
mockGenerateContent.mockResolvedValueOnce({
candidates: [
{
content: { parts: [{ text: 'public content' }] },
urlContextMetadata: {
urlMetadata: [
{
url: 'https://public.com/',
urlRetrievalStatus: 'URL_RETRIEVAL_STATUS_SUCCESS',
},
],
},
},
],
candidates: [{ content: { parts: [{ text: 'fallback response' }] } }],
});
const tool = new WebFetchTool(mockConfig, bus);
@@ -493,12 +455,8 @@ describe('WebFetchTool', () => {
const invocation = tool.build(params);
const result = await invocation.execute(new AbortController().signal);
expect(result.llmContent).toContain('public content');
expect(result.llmContent).not.toContain('--- Rescued Content ---');
expect(result.llmContent).toContain(
'[Warning] The following URLs were skipped because they point to private IP addresses: https://private.com/',
);
expect(result.returnDisplay).toContain('(1 URL(s) skipped)');
expect(result.llmContent).toBe('fallback response');
// Verify private URL was NOT fetched (mockFetch would throw if it was called for private.com)
});
it('should return WEB_FETCH_FALLBACK_FAILED on fallback fetch failure', async () => {
@@ -522,23 +480,6 @@ describe('WebFetchTool', () => {
expect(result.error?.type).toBe(ToolErrorType.WEB_FETCH_FALLBACK_FAILED);
});
it('should log telemetry when falling back due to private IP', async () => {
vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(true);
const tool = new WebFetchTool(mockConfig, bus);
const params = { prompt: 'fetch https://private.ip' };
const invocation = tool.build(params);
await invocation.execute(new AbortController().signal);
expect(logWebFetchFallbackAttempt).toHaveBeenCalledWith(
mockConfig,
expect.any(WebFetchFallbackAttemptEvent),
);
expect(WebFetchFallbackAttemptEvent).toHaveBeenCalledWith(
'private_ip_skipped',
);
});
it('should log telemetry when falling back due to primary fetch failure', async () => {
vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(false);
// Mock primary fetch to return empty response, triggering fallback

View File

@@ -76,31 +76,6 @@ function checkRateLimit(url: string): {
}
}
/**
 * Canonicalizes a URL string for deduplication: lowercases the hostname,
 * strips a single trailing slash from non-root paths, and drops
 * protocol-default ports (80 for http, 443 for https).
 *
 * @param urlStr - The raw URL string to normalize.
 * @returns The normalized `URL.href`, or `urlStr` unchanged when it does
 *   not parse as a URL.
 */
export function normalizeUrl(urlStr: string): string {
  let parsed: URL;
  try {
    parsed = new URL(urlStr);
  } catch {
    // Not a parseable URL — hand the input back untouched.
    return urlStr;
  }
  parsed.hostname = parsed.hostname.toLowerCase();
  const path = parsed.pathname;
  // Trim one trailing slash unless the path is just the root '/'.
  if (path.length > 1 && path.endsWith('/')) {
    parsed.pathname = path.slice(0, -1);
  }
  // Default ports are redundant; remove them so equivalent URLs compare equal.
  const isDefaultPort =
    (parsed.protocol === 'http:' && parsed.port === '80') ||
    (parsed.protocol === 'https:' && parsed.port === '443');
  if (isDefaultPort) {
    parsed.port = '';
  }
  return parsed.href;
}
/**
* Parses a prompt to extract valid URLs and identify malformed ones.
*/
@@ -177,15 +152,6 @@ interface GroundingSupportItem {
groundingChunkIndices?: number[];
}
// One per-URL retrieval record reported by the model, e.g. status
// 'URL_RETRIEVAL_STATUS_SUCCESS' or 'URL_RETRIEVAL_STATUS_FAILED'.
interface UrlMetadata {
url?: string;
urlRetrievalStatus?: string;
}
// Wrapper holding the per-URL retrieval records for a response candidate.
interface UrlContextMetadata {
urlMetadata?: UrlMetadata[];
}
/**
* Parameters for the WebFetch tool
*/
@@ -575,69 +541,6 @@ Response: ${truncateString(rawResponseText, 10000, '\n\n... [Error response trun
}
}
/**
 * Inserts citation markers (e.g. "[1][2]") into the response text at the
 * segment end indices recorded in the grounding supports. Chunk indices
 * are rendered 1-based. Returns the text unchanged when there are no
 * supports.
 */
private applyGroundingSupport(
  responseText: string,
  groundingSupports: GroundingSupportItem[] | undefined,
): string {
  if (!groundingSupports?.length) {
    return responseText;
  }
  // Build one insertion per support that carries both a segment and
  // chunk indices; others contribute nothing.
  const insertions = groundingSupports.flatMap((support) => {
    const { segment, groundingChunkIndices } = support;
    if (!segment || !groundingChunkIndices) {
      return [];
    }
    const marker = groundingChunkIndices
      .map((chunkIndex) => `[${chunkIndex + 1}]`)
      .join('');
    return [{ index: segment.endIndex, marker }];
  });
  // Apply from the highest index down so each splice leaves the
  // remaining target offsets valid.
  insertions.sort((a, b) => b.index - a.index);
  const chars = responseText.split('');
  for (const { index, marker } of insertions) {
    chars.splice(index, 0, marker);
  }
  return chars.join('');
}
/**
 * Renders grounding sources as a numbered reference list appended after a
 * blank line ("\n\nSources:\n[1] Title (uri)"). Missing titles/URIs fall
 * back to 'Untitled' / 'Unknown URI'. Returns '' when there are no sources.
 */
private formatSourceList(sources: GroundingChunkItem[] | undefined): string {
  if (!sources?.length) {
    return '';
  }
  const entries = sources.map((source, index) => {
    const title = source.web?.title || 'Untitled';
    const uri = source.web?.uri || 'Unknown URI';
    return `[${index + 1}] ${title} (${uri})`;
  });
  return `\n\nSources:\n${entries.join('\n')}`;
}
/**
 * Directly fetches each rescue URL (deduplicated, original order kept) and
 * joins the results into a "URL: ...\nContent:\n..." report separated by
 * '---' dividers. The overall content budget is split evenly per URL.
 *
 * NOTE(review): fetches are sequential — presumably to keep per-host rate
 * limiting effective; confirm before parallelizing.
 */
private async aggregateRescuedContent(
  urls: string[],
  signal: AbortSignal,
): Promise<string> {
  const uniqueUrls = Array.from(new Set(urls));
  const perUrlBudget = Math.floor(MAX_CONTENT_LENGTH / uniqueUrls.length);
  const sections: string[] = [];
  for (const url of uniqueUrls) {
    const content = await this.executeFallbackForUrl(url, signal, perUrlBudget);
    sections.push(`URL: ${url}\nContent:\n${content}`);
  }
  return sections.join('\n\n---\n\n');
}
async execute(signal: AbortSignal): Promise<ToolResult> {
if (this.config.getDirectWebFetch()) {
return this.executeExperimental(signal);
@@ -645,23 +548,26 @@ Response: ${truncateString(rawResponseText, 10000, '\n\n... [Error response trun
const userPrompt = this.params.prompt!;
const { validUrls } = parsePrompt(userPrompt);
// Unit 1: Normalization & Deduplication
const allUrls = [...new Set(validUrls.map(normalizeUrl))];
// Unit 2: Isolated Rate Limiting
// Filter unique URLs and perform pre-flight checks (Rate Limit & Private IP)
const uniqueUrls = [...new Set(validUrls)];
const toFetch: string[] = [];
const rateLimited: string[] = [];
for (const url of allUrls) {
const rateLimitResult = checkRateLimit(url);
if (rateLimitResult.allowed) {
toFetch.push(url);
} else {
rateLimited.push(url);
const skipped: string[] = [];
for (const url of uniqueUrls) {
if (isPrivateIp(url)) {
skipped.push(`[Private IP] ${url}`);
continue;
}
if (!checkRateLimit(url).allowed) {
skipped.push(`[Rate Limit] ${url}`);
continue;
}
toFetch.push(url);
}
if (toFetch.length === 0 && rateLimited.length > 0) {
const errorMessage = `Rate limit exceeded for all requested hosts.`;
// If everything was skipped, fail early
if (toFetch.length === 0 && skipped.length > 0) {
const errorMessage = `All requested URLs were skipped: ${skipped.join(', ')}`;
return {
llmContent: `Error: ${errorMessage}`,
returnDisplay: `Error: ${errorMessage}`,
@@ -672,154 +578,91 @@ Response: ${truncateString(rawResponseText, 10000, '\n\n... [Error response trun
};
}
const publicUrls = toFetch.filter((url) => !isPrivateIp(url));
const privateUrls = toFetch.filter((url) => isPrivateIp(url));
try {
const geminiClient = this.config.getGeminiClient();
const response = await geminiClient.generateContent(
{ model: 'web-fetch' },
[{ role: 'user', parts: [{ text: userPrompt }] }],
signal,
LlmRole.UTILITY_TOOL,
);
if (privateUrls.length > 0) {
let responseText = getResponseText(response) || '';
const groundingMetadata = response.candidates?.[0]?.groundingMetadata;
// Simple primary success check: we need some text or grounding data
if (!responseText.trim() && !groundingMetadata?.groundingChunks?.length) {
throw new Error('Primary fetch returned no content');
}
// 1. Apply Grounding Supports (Citations)
const rawGroundingSupports = groundingMetadata?.groundingSupports;
const groundingSupports = Array.isArray(rawGroundingSupports)
? rawGroundingSupports.filter(
(item): item is GroundingSupportItem =>
typeof item === 'object' && item !== null,
)
: undefined;
if (groundingSupports && groundingSupports.length > 0) {
const insertions: Array<{ index: number; marker: string }> = [];
groundingSupports.forEach((support) => {
if (support.segment && support.groundingChunkIndices) {
const citationMarker = support.groundingChunkIndices
.map((chunkIndex: number) => `[${chunkIndex + 1}]`)
.join('');
insertions.push({
index: support.segment.endIndex,
marker: citationMarker,
});
}
});
insertions.sort((a, b) => b.index - a.index);
const responseChars = responseText.split('');
insertions.forEach((insertion) => {
responseChars.splice(insertion.index, 0, insertion.marker);
});
responseText = responseChars.join('');
}
// 2. Append Source List
const rawSources = groundingMetadata?.groundingChunks;
const sources = Array.isArray(rawSources)
? rawSources.filter(
(item): item is GroundingChunkItem =>
typeof item === 'object' && item !== null,
)
: undefined;
if (sources && sources.length > 0) {
const sourceListFormatted: string[] = [];
sources.forEach((source, index) => {
const title = source.web?.title || 'Untitled';
const uri = source.web?.uri || 'Unknown URI';
sourceListFormatted.push(`[${index + 1}] ${title} (${uri})`);
});
responseText += `\n\nSources:\n${sourceListFormatted.join('\n')}`;
}
// 3. Prepend Warnings for skipped URLs
if (skipped.length > 0) {
responseText = `[Warning] The following URLs were skipped:\n${skipped.join('\n')}\n\n${responseText}`;
}
return {
llmContent: responseText,
returnDisplay: `Content processed from prompt.`,
};
} catch (error: unknown) {
debugLogger.warn(
`[WebFetchTool] Primary fetch failed, falling back: ${getErrorMessage(error)}`,
);
logWebFetchFallbackAttempt(
this.config,
new WebFetchFallbackAttemptEvent('private_ip_skipped'),
new WebFetchFallbackAttemptEvent('primary_failed'),
);
// Simple All-or-Nothing Fallback
return this.executeFallback(toFetch, signal);
}
let llmContent = '';
let returnDisplay = '';
const needsRescue: string[] = [];
if (publicUrls.length > 0) {
const geminiClient = this.config.getGeminiClient();
try {
const response = await geminiClient.generateContent(
{ model: 'web-fetch' },
[{ role: 'user', parts: [{ text: userPrompt }] }],
signal,
LlmRole.UTILITY_TOOL,
);
let responseText = getResponseText(response) || '';
const rawUrlContextMeta = response.candidates?.[0]?.urlContextMetadata;
const urlContextMeta =
rawUrlContextMeta &&
typeof rawUrlContextMeta === 'object' &&
'urlMetadata' in rawUrlContextMeta &&
Array.isArray(rawUrlContextMeta.urlMetadata)
? (rawUrlContextMeta as UrlContextMetadata)
: undefined;
const groundingMetadata = response.candidates?.[0]?.groundingMetadata;
const sources = groundingMetadata?.groundingChunks as
| GroundingChunkItem[]
| undefined;
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
const groundingSupports = groundingMetadata?.groundingSupports as
| GroundingSupportItem[]
| undefined;
// Error Handling & Rescue identification
let processingError = false;
if (
urlContextMeta?.urlMetadata &&
urlContextMeta.urlMetadata.length > 0
) {
const allStatuses = urlContextMeta.urlMetadata.map(
(m) => m.urlRetrievalStatus,
);
if (allStatuses.every((s) => s !== 'URL_RETRIEVAL_STATUS_SUCCESS')) {
processingError = true;
}
// Unit 3: Identify specific URLs that need rescue
for (const meta of urlContextMeta.urlMetadata) {
if (
meta.urlRetrievalStatus !== 'URL_RETRIEVAL_STATUS_SUCCESS' &&
meta.url
) {
needsRescue.push(meta.url);
}
}
} else if (!responseText.trim() && !sources?.length) {
processingError = true;
}
if (
!processingError &&
!responseText.trim() &&
(!sources || sources.length === 0)
) {
processingError = true;
}
if (processingError) {
logWebFetchFallbackAttempt(
this.config,
new WebFetchFallbackAttemptEvent('primary_failed'),
);
// If primary failed completely, rescue all public URLs that were supposed to be fetched
needsRescue.push(...publicUrls);
} else {
// Process grounding if successful
responseText = this.applyGroundingSupport(
responseText,
groundingSupports,
);
responseText += this.formatSourceList(sources);
llmContent = responseText;
returnDisplay = `Content processed from prompt.`;
}
} catch (error: unknown) {
debugLogger.error(
`[WebFetchTool] Primary fetch failed: ${getErrorMessage(error)}`,
);
needsRescue.push(...publicUrls);
}
}
// Unit 3: Surgical Fallback ("The Rescue")
if (needsRescue.length > 0) {
const aggregatedRescuedContent = await this.aggregateRescuedContent(
needsRescue,
signal,
);
if (!llmContent) {
// If no primary content, use executeFallback logic to process all rescued content via Gemini
return this.executeFallback(needsRescue, signal);
} else {
// If we have some primary content, append the rescued content as additional information
llmContent += `\n\n--- Rescued Content ---\n${aggregatedRescuedContent}`;
returnDisplay += ` (with ${needsRescue.length} rescued URL(s))`;
}
}
// Unit 2: Append rate limiting and private IP warnings
const warnings: string[] = [];
if (rateLimited.length > 0) {
warnings.push(
`[Warning] The following URLs were skipped due to rate limiting: ${rateLimited.join(
', ',
)}`,
);
}
if (privateUrls.length > 0) {
warnings.push(
`[Warning] The following URLs were skipped because they point to private IP addresses: ${privateUrls.join(
', ',
)}`,
);
}
if (warnings.length > 0) {
const combinedWarning = warnings.join('\n');
llmContent = `${combinedWarning}\n\n${llmContent}`;
returnDisplay = `${returnDisplay} (${rateLimited.length + privateUrls.length} URL(s) skipped)`;
}
return {
llmContent,
returnDisplay,
};
}
}