refactor(core): simplify webfetch logic for easier review

- Streamlined execute loop with integrated rate limiting and SSRF checks.
- Simplified fallback to all-or-nothing mode.
- Inlined grounding and source list formatting for a cleaner diff.
- Resolved ESLint unsafe type assertion issues using runtime type guards.
This commit is contained in:
Aishanee Shah
2026-03-09 19:50:39 +00:00
parent 0e850622a8
commit d783ccc43b
2 changed files with 136 additions and 352 deletions

View File

@@ -9,7 +9,6 @@ import {
WebFetchTool,
parsePrompt,
convertGithubUrlToRaw,
normalizeUrl,
} from './web-fetch.js';
import type { Config } from '../config/config.js';
import { ApprovalMode } from '../policy/types.js';
@@ -199,35 +198,6 @@ describe('parsePrompt', () => {
});
});
describe('normalizeUrl', () => {
it('should lowercase hostname', () => {
expect(normalizeUrl('https://EXAMPLE.COM')).toBe('https://example.com/');
});
it('should remove trailing slashes from path', () => {
expect(normalizeUrl('https://example.com/path/')).toBe(
'https://example.com/path',
);
});
it('should not remove trailing slash from root', () => {
expect(normalizeUrl('https://example.com/')).toBe('https://example.com/');
});
it('should remove default ports', () => {
expect(normalizeUrl('http://example.com:80/')).toBe('http://example.com/');
expect(normalizeUrl('https://example.com:443/')).toBe(
'https://example.com/',
);
});
it('should keep non-default ports', () => {
expect(normalizeUrl('http://example.com:8080/')).toBe(
'http://example.com:8080/',
);
});
});
describe('convertGithubUrlToRaw', () => {
it('should convert valid github blob urls', () => {
expect(
@@ -384,7 +354,9 @@ describe('WebFetchTool', () => {
// The 11th time should fail due to rate limit
const result = await invocation.execute(new AbortController().signal);
expect(result.error?.type).toBe(ToolErrorType.WEB_FETCH_PROCESSING_ERROR);
expect(result.error?.message).toContain('Rate limit exceeded');
expect(result.error?.message).toContain(
'All requested URLs were skipped',
);
});
it('should skip rate-limited URLs but fetch others', async () => {
@@ -417,73 +389,63 @@ describe('WebFetchTool', () => {
const result = await invocation.execute(new AbortController().signal);
expect(result.llmContent).toContain('healthy response');
expect(result.llmContent).toContain(
'[Warning] The following URLs were skipped due to rate limiting: https://ratelimit-multi.com/',
'[Warning] The following URLs were skipped:',
);
expect(result.llmContent).toContain(
'[Rate Limit] https://ratelimit-multi.com/',
);
expect(result.returnDisplay).toContain('(1 URL(s) skipped)');
});
it('should rescue failed public URLs via fallback', async () => {
it('should fallback to all public URLs if primary fails', async () => {
vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(false);
// Primary fetch fails for one URL
mockGenerateContent.mockResolvedValueOnce({
candidates: [
{
content: { parts: [{ text: 'Only partial info' }] },
urlContextMetadata: {
urlMetadata: [
{
url: 'https://success.com/',
urlRetrievalStatus: 'URL_RETRIEVAL_STATUS_SUCCESS',
},
{
url: 'https://fail.com/',
urlRetrievalStatus: 'URL_RETRIEVAL_STATUS_FAILED',
},
],
},
},
],
// Primary fetch fails
mockGenerateContent.mockRejectedValueOnce(new Error('primary fail'));
// Mock fallback fetch for BOTH URLs
mockFetch('https://url1.com/', {
text: () => Promise.resolve('content 1'),
});
mockFetch('https://url2.com/', {
text: () => Promise.resolve('content 2'),
});
// Mock fallback fetch for the failed URL
mockFetch('https://fail.com/', {
text: () => Promise.resolve('rescued content'),
// Mock fallback LLM call
mockGenerateContent.mockResolvedValueOnce({
candidates: [
{ content: { parts: [{ text: 'fallback processed response' }] } },
],
});
const tool = new WebFetchTool(mockConfig, bus);
const params = {
prompt: 'fetch https://success.com and https://fail.com',
prompt: 'fetch https://url1.com and https://url2.com',
};
const invocation = tool.build(params);
const result = await invocation.execute(new AbortController().signal);
expect(result.llmContent).toContain('Only partial info');
expect(result.llmContent).toContain('--- Rescued Content ---');
expect(result.llmContent).toContain('URL: https://fail.com/');
expect(result.llmContent).toContain('rescued content');
expect(result.llmContent).toBe('fallback processed response');
expect(result.returnDisplay).toContain(
'2 URL(s) processed using fallback fetch',
);
});
it('should NOT rescue private URLs via fallback but still fetch public ones', async () => {
it('should NOT include private URLs in fallback', async () => {
vi.mocked(fetchUtils.isPrivateIp).mockImplementation(
(url) => url === 'https://private.com/',
);
// Primary fetch for public URL
// Primary fetch fails
mockGenerateContent.mockRejectedValueOnce(new Error('primary fail'));
// Mock fallback fetch only for public URL
mockFetch('https://public.com/', {
text: () => Promise.resolve('public content'),
});
// Mock fallback LLM call
mockGenerateContent.mockResolvedValueOnce({
candidates: [
{
content: { parts: [{ text: 'public content' }] },
urlContextMetadata: {
urlMetadata: [
{
url: 'https://public.com/',
urlRetrievalStatus: 'URL_RETRIEVAL_STATUS_SUCCESS',
},
],
},
},
],
candidates: [{ content: { parts: [{ text: 'fallback response' }] } }],
});
const tool = new WebFetchTool(mockConfig, bus);
@@ -493,12 +455,8 @@ describe('WebFetchTool', () => {
const invocation = tool.build(params);
const result = await invocation.execute(new AbortController().signal);
expect(result.llmContent).toContain('public content');
expect(result.llmContent).not.toContain('--- Rescued Content ---');
expect(result.llmContent).toContain(
'[Warning] The following URLs were skipped because they point to private IP addresses: https://private.com/',
);
expect(result.returnDisplay).toContain('(1 URL(s) skipped)');
expect(result.llmContent).toBe('fallback response');
// Verify private URL was NOT fetched (mockFetch would throw if it was called for private.com)
});
it('should return WEB_FETCH_FALLBACK_FAILED on fallback fetch failure', async () => {
@@ -522,23 +480,6 @@ describe('WebFetchTool', () => {
expect(result.error?.type).toBe(ToolErrorType.WEB_FETCH_FALLBACK_FAILED);
});
it('should log telemetry when falling back due to private IP', async () => {
vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(true);
const tool = new WebFetchTool(mockConfig, bus);
const params = { prompt: 'fetch https://private.ip' };
const invocation = tool.build(params);
await invocation.execute(new AbortController().signal);
expect(logWebFetchFallbackAttempt).toHaveBeenCalledWith(
mockConfig,
expect.any(WebFetchFallbackAttemptEvent),
);
expect(WebFetchFallbackAttemptEvent).toHaveBeenCalledWith(
'private_ip_skipped',
);
});
it('should log telemetry when falling back due to primary fetch failure', async () => {
vi.spyOn(fetchUtils, 'isPrivateIp').mockReturnValue(false);
// Mock primary fetch to return empty response, triggering fallback

View File

@@ -76,31 +76,6 @@ function checkRateLimit(url: string): {
}
}
/**
 * Canonicalizes a URL string for deduplication: lowercases the hostname,
 * strips a single trailing slash from non-root paths, and drops
 * protocol-default ports (80 for http, 443 for https).
 *
 * @param urlStr - The raw URL string to normalize.
 * @returns The normalized `URL.href`, or `urlStr` unchanged when it does
 *   not parse as a URL.
 */
export function normalizeUrl(urlStr: string): string {
  let parsed: URL;
  try {
    parsed = new URL(urlStr);
  } catch {
    // Not a parseable URL — hand the input back untouched.
    return urlStr;
  }
  parsed.hostname = parsed.hostname.toLowerCase();
  const path = parsed.pathname;
  // Trim one trailing slash unless the path is just the root '/'.
  if (path.length > 1 && path.endsWith('/')) {
    parsed.pathname = path.slice(0, -1);
  }
  // Default ports are redundant; remove them so equivalent URLs compare equal.
  const isDefaultPort =
    (parsed.protocol === 'http:' && parsed.port === '80') ||
    (parsed.protocol === 'https:' && parsed.port === '443');
  if (isDefaultPort) {
    parsed.port = '';
  }
  return parsed.href;
}
/**
* Parses a prompt to extract valid URLs and identify malformed ones.
*/
@@ -177,15 +152,6 @@ interface GroundingSupportItem {
groundingChunkIndices?: number[];
}
// One per-URL retrieval record reported by the model, e.g. status
// 'URL_RETRIEVAL_STATUS_SUCCESS' or 'URL_RETRIEVAL_STATUS_FAILED'.
interface UrlMetadata {
url?: string;
urlRetrievalStatus?: string;
}
// Wrapper holding the per-URL retrieval records for a response candidate.
interface UrlContextMetadata {
urlMetadata?: UrlMetadata[];
}
/**
* Parameters for the WebFetch tool
*/
@@ -575,69 +541,6 @@ Response: ${truncateString(rawResponseText, 10000, '\n\n... [Error response trun
}
}
/**
 * Inserts citation markers (e.g. "[1][2]") into the response text at the
 * segment end indices recorded in the grounding supports. Chunk indices
 * are rendered 1-based. Returns the text unchanged when there are no
 * supports.
 */
private applyGroundingSupport(
  responseText: string,
  groundingSupports: GroundingSupportItem[] | undefined,
): string {
  if (!groundingSupports?.length) {
    return responseText;
  }
  // Build one insertion per support that carries both a segment and
  // chunk indices; others contribute nothing.
  const insertions = groundingSupports.flatMap((support) => {
    const { segment, groundingChunkIndices } = support;
    if (!segment || !groundingChunkIndices) {
      return [];
    }
    const marker = groundingChunkIndices
      .map((chunkIndex) => `[${chunkIndex + 1}]`)
      .join('');
    return [{ index: segment.endIndex, marker }];
  });
  // Apply from the highest index down so each splice leaves the
  // remaining target offsets valid.
  insertions.sort((a, b) => b.index - a.index);
  const chars = responseText.split('');
  for (const { index, marker } of insertions) {
    chars.splice(index, 0, marker);
  }
  return chars.join('');
}
/**
 * Renders grounding sources as a numbered reference list appended after a
 * blank line ("\n\nSources:\n[1] Title (uri)"). Missing titles/URIs fall
 * back to 'Untitled' / 'Unknown URI'. Returns '' when there are no sources.
 */
private formatSourceList(sources: GroundingChunkItem[] | undefined): string {
  if (!sources?.length) {
    return '';
  }
  const entries = sources.map((source, index) => {
    const title = source.web?.title || 'Untitled';
    const uri = source.web?.uri || 'Unknown URI';
    return `[${index + 1}] ${title} (${uri})`;
  });
  return `\n\nSources:\n${entries.join('\n')}`;
}
/**
 * Directly fetches each rescue URL (deduplicated, original order kept) and
 * joins the results into a "URL: ...\nContent:\n..." report separated by
 * '---' dividers. The overall content budget is split evenly per URL.
 *
 * NOTE(review): fetches are sequential — presumably to keep per-host rate
 * limiting effective; confirm before parallelizing.
 */
private async aggregateRescuedContent(
  urls: string[],
  signal: AbortSignal,
): Promise<string> {
  const uniqueUrls = Array.from(new Set(urls));
  const perUrlBudget = Math.floor(MAX_CONTENT_LENGTH / uniqueUrls.length);
  const sections: string[] = [];
  for (const url of uniqueUrls) {
    const content = await this.executeFallbackForUrl(url, signal, perUrlBudget);
    sections.push(`URL: ${url}\nContent:\n${content}`);
  }
  return sections.join('\n\n---\n\n');
}
async execute(signal: AbortSignal): Promise<ToolResult> {
if (this.config.getDirectWebFetch()) {
return this.executeExperimental(signal);
@@ -645,23 +548,26 @@ Response: ${truncateString(rawResponseText, 10000, '\n\n... [Error response trun
const userPrompt = this.params.prompt!;
const { validUrls } = parsePrompt(userPrompt);
// Unit 1: Normalization & Deduplication
const allUrls = [...new Set(validUrls.map(normalizeUrl))];
// Unit 2: Isolated Rate Limiting
// Filter unique URLs and perform pre-flight checks (Rate Limit & Private IP)
const uniqueUrls = [...new Set(validUrls)];
const toFetch: string[] = [];
const rateLimited: string[] = [];
for (const url of allUrls) {
const rateLimitResult = checkRateLimit(url);
if (rateLimitResult.allowed) {
toFetch.push(url);
} else {
rateLimited.push(url);
const skipped: string[] = [];
for (const url of uniqueUrls) {
if (isPrivateIp(url)) {
skipped.push(`[Private IP] ${url}`);
continue;
}
if (!checkRateLimit(url).allowed) {
skipped.push(`[Rate Limit] ${url}`);
continue;
}
toFetch.push(url);
}
if (toFetch.length === 0 && rateLimited.length > 0) {
const errorMessage = `Rate limit exceeded for all requested hosts.`;
// If everything was skipped, fail early
if (toFetch.length === 0 && skipped.length > 0) {
const errorMessage = `All requested URLs were skipped: ${skipped.join(', ')}`;
return {
llmContent: `Error: ${errorMessage}`,
returnDisplay: `Error: ${errorMessage}`,
@@ -672,154 +578,91 @@ Response: ${truncateString(rawResponseText, 10000, '\n\n... [Error response trun
};
}
const publicUrls = toFetch.filter((url) => !isPrivateIp(url));
const privateUrls = toFetch.filter((url) => isPrivateIp(url));
try {
const geminiClient = this.config.getGeminiClient();
const response = await geminiClient.generateContent(
{ model: 'web-fetch' },
[{ role: 'user', parts: [{ text: userPrompt }] }],
signal,
LlmRole.UTILITY_TOOL,
);
if (privateUrls.length > 0) {
let responseText = getResponseText(response) || '';
const groundingMetadata = response.candidates?.[0]?.groundingMetadata;
// Simple primary success check: we need some text or grounding data
if (!responseText.trim() && !groundingMetadata?.groundingChunks?.length) {
throw new Error('Primary fetch returned no content');
}
// 1. Apply Grounding Supports (Citations)
const rawGroundingSupports = groundingMetadata?.groundingSupports;
const groundingSupports = Array.isArray(rawGroundingSupports)
? rawGroundingSupports.filter(
(item): item is GroundingSupportItem =>
typeof item === 'object' && item !== null,
)
: undefined;
if (groundingSupports && groundingSupports.length > 0) {
const insertions: Array<{ index: number; marker: string }> = [];
groundingSupports.forEach((support) => {
if (support.segment && support.groundingChunkIndices) {
const citationMarker = support.groundingChunkIndices
.map((chunkIndex: number) => `[${chunkIndex + 1}]`)
.join('');
insertions.push({
index: support.segment.endIndex,
marker: citationMarker,
});
}
});
insertions.sort((a, b) => b.index - a.index);
const responseChars = responseText.split('');
insertions.forEach((insertion) => {
responseChars.splice(insertion.index, 0, insertion.marker);
});
responseText = responseChars.join('');
}
// 2. Append Source List
const rawSources = groundingMetadata?.groundingChunks;
const sources = Array.isArray(rawSources)
? rawSources.filter(
(item): item is GroundingChunkItem =>
typeof item === 'object' && item !== null,
)
: undefined;
if (sources && sources.length > 0) {
const sourceListFormatted: string[] = [];
sources.forEach((source, index) => {
const title = source.web?.title || 'Untitled';
const uri = source.web?.uri || 'Unknown URI';
sourceListFormatted.push(`[${index + 1}] ${title} (${uri})`);
});
responseText += `\n\nSources:\n${sourceListFormatted.join('\n')}`;
}
// 3. Prepend Warnings for skipped URLs
if (skipped.length > 0) {
responseText = `[Warning] The following URLs were skipped:\n${skipped.join('\n')}\n\n${responseText}`;
}
return {
llmContent: responseText,
returnDisplay: `Content processed from prompt.`,
};
} catch (error: unknown) {
debugLogger.warn(
`[WebFetchTool] Primary fetch failed, falling back: ${getErrorMessage(error)}`,
);
logWebFetchFallbackAttempt(
this.config,
new WebFetchFallbackAttemptEvent('private_ip_skipped'),
new WebFetchFallbackAttemptEvent('primary_failed'),
);
// Simple All-or-Nothing Fallback
return this.executeFallback(toFetch, signal);
}
let llmContent = '';
let returnDisplay = '';
const needsRescue: string[] = [];
if (publicUrls.length > 0) {
const geminiClient = this.config.getGeminiClient();
try {
const response = await geminiClient.generateContent(
{ model: 'web-fetch' },
[{ role: 'user', parts: [{ text: userPrompt }] }],
signal,
LlmRole.UTILITY_TOOL,
);
let responseText = getResponseText(response) || '';
const rawUrlContextMeta = response.candidates?.[0]?.urlContextMetadata;
const urlContextMeta =
rawUrlContextMeta &&
typeof rawUrlContextMeta === 'object' &&
'urlMetadata' in rawUrlContextMeta &&
Array.isArray(rawUrlContextMeta.urlMetadata)
? (rawUrlContextMeta as UrlContextMetadata)
: undefined;
const groundingMetadata = response.candidates?.[0]?.groundingMetadata;
const sources = groundingMetadata?.groundingChunks as
| GroundingChunkItem[]
| undefined;
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
const groundingSupports = groundingMetadata?.groundingSupports as
| GroundingSupportItem[]
| undefined;
// Error Handling & Rescue identification
let processingError = false;
if (
urlContextMeta?.urlMetadata &&
urlContextMeta.urlMetadata.length > 0
) {
const allStatuses = urlContextMeta.urlMetadata.map(
(m) => m.urlRetrievalStatus,
);
if (allStatuses.every((s) => s !== 'URL_RETRIEVAL_STATUS_SUCCESS')) {
processingError = true;
}
// Unit 3: Identify specific URLs that need rescue
for (const meta of urlContextMeta.urlMetadata) {
if (
meta.urlRetrievalStatus !== 'URL_RETRIEVAL_STATUS_SUCCESS' &&
meta.url
) {
needsRescue.push(meta.url);
}
}
} else if (!responseText.trim() && !sources?.length) {
processingError = true;
}
if (
!processingError &&
!responseText.trim() &&
(!sources || sources.length === 0)
) {
processingError = true;
}
if (processingError) {
logWebFetchFallbackAttempt(
this.config,
new WebFetchFallbackAttemptEvent('primary_failed'),
);
// If primary failed completely, rescue all public URLs that were supposed to be fetched
needsRescue.push(...publicUrls);
} else {
// Process grounding if successful
responseText = this.applyGroundingSupport(
responseText,
groundingSupports,
);
responseText += this.formatSourceList(sources);
llmContent = responseText;
returnDisplay = `Content processed from prompt.`;
}
} catch (error: unknown) {
debugLogger.error(
`[WebFetchTool] Primary fetch failed: ${getErrorMessage(error)}`,
);
needsRescue.push(...publicUrls);
}
}
// Unit 3: Surgical Fallback ("The Rescue")
if (needsRescue.length > 0) {
const aggregatedRescuedContent = await this.aggregateRescuedContent(
needsRescue,
signal,
);
if (!llmContent) {
// If no primary content, use executeFallback logic to process all rescued content via Gemini
return this.executeFallback(needsRescue, signal);
} else {
// If we have some primary content, append the rescued content as additional information
llmContent += `\n\n--- Rescued Content ---\n${aggregatedRescuedContent}`;
returnDisplay += ` (with ${needsRescue.length} rescued URL(s))`;
}
}
// Unit 2: Append rate limiting and private IP warnings
const warnings: string[] = [];
if (rateLimited.length > 0) {
warnings.push(
`[Warning] The following URLs were skipped due to rate limiting: ${rateLimited.join(
', ',
)}`,
);
}
if (privateUrls.length > 0) {
warnings.push(
`[Warning] The following URLs were skipped because they point to private IP addresses: ${privateUrls.join(
', ',
)}`,
);
}
if (warnings.length > 0) {
const combinedWarning = warnings.join('\n');
llmContent = `${combinedWarning}\n\n${llmContent}`;
returnDisplay = `${returnDisplay} (${rateLimited.length + privateUrls.length} URL(s) skipped)`;
}
return {
llmContent,
returnDisplay,
};
}
}