fix(browser): exclude visual prompt section when vision is disabled

The system prompt always included the VISUAL IDENTIFICATION section, which tells the model about analyze_screenshot, even when visualModel was not configured. As a result, the model attempted to call the tool even though it was not registered.

- Convert BROWSER_SYSTEM_PROMPT to buildBrowserSystemPrompt(visionEnabled)
- Pass vision state from factory to definition builder
- Remove analyze_screenshot reference from click_at tool description
- Add tests for conditional prompt inclusion/exclusion
- Fix misleading test comment about tool count
2026-07-09 09:30:41 -07:00 · 2026-02-17 11:58:18 -08:00
parent 4e2856c4dd
commit 7718709f01
4 changed files with 119 additions and 13 deletions
@@ -39,11 +39,23 @@ export const BrowserTaskResultSchema = z.object({
    .describe('Optional extracted data from the task'),
 });

+const VISUAL_SECTION = `
+VISUAL IDENTIFICATION (analyze_screenshot):
+When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates:
+1. Call analyze_screenshot with a clear instruction describing what to find
+2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions
+3. Use the returned coordinates with click_at(x, y) or other tools yourself
+4. If the analysis is insufficient, call it again with a more specific instruction
+`;
+
 /**
 * System prompt for the semantic browser agent.
 * Extracted from prototype (computer_use_subagent_cdt branch).
+ *
+ * @param visionEnabled Whether to include the VISUAL IDENTIFICATION section (analyze_screenshot guidance) in the prompt.
 */
-export const BROWSER_SYSTEM_PROMPT = `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request.
+export function buildBrowserSystemPrompt(visionEnabled: boolean): string {
+  return `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request.

 IMPORTANT: You will receive an accessibility tree snapshot showing elements with uid values (e.g., uid=87_4 button "Login"). 
 Use these uid values directly with your tools:
@@ -63,13 +75,7 @@ Before interacting with page content, scan the accessibility tree for blocking o
 - Common patterns: elements with role="dialog", role="tooltip", role="alertdialog", or aria-modal="true"
 - If you see such elements, DISMISS THEM FIRST by clicking close/dismiss buttons before proceeding
 - If a click seems to have no effect, check if an overlay appeared or is blocking the target
-
-VISUAL IDENTIFICATION (analyze_screenshot):
-When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates:
-1. Call analyze_screenshot with a clear instruction describing what to find
-2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions
-3. Use the returned coordinates with click_at(x, y) or other tools yourself
-4. If the analysis is insufficient, call it again with a more specific instruction
+${visionEnabled ? VISUAL_SECTION : ''}

 COMPLEX WEB APPS (spreadsheets, rich editors, canvas apps):
 Many web apps (Google Sheets/Docs, Notion, Figma, etc.) use custom rendering rather than standard HTML inputs.
@@ -87,6 +93,7 @@ Some errors are unrecoverable and retrying will never help. When you see ANY of
 Do NOT keep retrying terminal errors. Report them with actionable remediation steps and exit immediately.

 CRITICAL: When you have fully completed the user's task, you MUST call the complete_task tool with a summary of what you accomplished. Do NOT just return text - you must explicitly call complete_task to exit the loop.`;
+}

 /**
 * Browser Agent Definition Factory.
@@ -98,6 +105,7 @@ CRITICAL: When you have fully completed the user's task, you MUST call the compl
 */
 export const BrowserAgentDefinition = (
  config: Config,
+  visionEnabled = false,
 ): LocalAgentDefinition<typeof BrowserTaskResultSchema> => {
  // Use Preview Flash model if the main model is any of the preview models.
  // If the main model is not a preview model, use the default flash model.
@@ -161,7 +169,7 @@ export const BrowserAgentDefinition = (
 </task>

 First, use new_page to open the relevant URL. Then call take_snapshot to see the page and proceed with your task.`,
-      systemPrompt: BROWSER_SYSTEM_PROMPT,
+      systemPrompt: buildBrowserSystemPrompt(visionEnabled),
    },
  };
 };
@@ -43,6 +43,11 @@ vi.mock('../../utils/debugLogger.js', () => ({
  },
 }));

+import {
+  buildBrowserSystemPrompt,
+  BROWSER_AGENT_NAME,
+} from './browserAgentDefinition.js';
+
 describe('browserAgentFactory', () => {
  let mockConfig: Config;
  let mockMessageBus: MessageBus;
@@ -100,8 +105,8 @@ describe('browserAgentFactory', () => {
        mockMessageBus,
      );

-      expect(definition.name).toBe('browser_agent');
-      // 5 MCP tools + 1 analyze_screenshot tool
+      expect(definition.name).toBe(BROWSER_AGENT_NAME);
+      // 5 MCP tools + 1 type_text composite tool (no analyze_screenshot without visualModel)
      expect(definition.toolConfig?.tools).toHaveLength(6);
    });

@@ -137,6 +142,73 @@ describe('browserAgentFactory', () => {
      expect(definition.outputConfig).toBeDefined();
      expect(definition.promptConfig).toBeDefined();
    });
+
+    it('should exclude visual prompt section when visualModel is not configured', async () => {
+      const { definition } = await createBrowserAgentDefinition(
+        mockConfig,
+        mockMessageBus,
+      );
+
+      const systemPrompt = definition.promptConfig?.systemPrompt ?? '';
+      expect(systemPrompt).not.toContain('analyze_screenshot');
+      expect(systemPrompt).not.toContain('VISUAL IDENTIFICATION');
+    });
+
+    it('should include visual prompt section when visualModel is configured', async () => {
+      const configWithVision = makeFakeConfig({
+        agents: {
+          overrides: {
+            browser_agent: {
+              enabled: true,
+              customConfig: {
+                headless: false,
+                visualModel: 'gemini-2.5-flash-preview',
+              },
+            },
+          },
+        },
+      });
+
+      const { definition } = await createBrowserAgentDefinition(
+        configWithVision,
+        mockMessageBus,
+      );
+
+      const systemPrompt = definition.promptConfig?.systemPrompt ?? '';
+      expect(systemPrompt).toContain('analyze_screenshot');
+      expect(systemPrompt).toContain('VISUAL IDENTIFICATION');
+    });
+
+    it('should include analyze_screenshot tool when visualModel is configured', async () => {
+      const configWithVision = makeFakeConfig({
+        agents: {
+          overrides: {
+            browser_agent: {
+              enabled: true,
+              customConfig: {
+                headless: false,
+                visualModel: 'gemini-2.5-flash-preview',
+              },
+            },
+          },
+        },
+      });
+
+      const { definition } = await createBrowserAgentDefinition(
+        configWithVision,
+        mockMessageBus,
+      );
+
+      // 5 MCP tools + 1 type_text + 1 analyze_screenshot
+      expect(definition.toolConfig?.tools).toHaveLength(7);
+      const toolNames =
+        definition.toolConfig?.tools
+          ?.filter(
+            (t): t is { name: string } => typeof t === 'object' && 'name' in t,
+          )
+          .map((t) => t.name) ?? [];
+      expect(toolNames).toContain('analyze_screenshot');
+    });
  });

  describe('cleanupBrowserAgent', () => {
@@ -158,3 +230,29 @@ describe('browserAgentFactory', () => {
    });
  });
 });
+
+describe('buildBrowserSystemPrompt', () => {
+  it('should include visual section when vision is enabled', () => {
+    const prompt = buildBrowserSystemPrompt(true);
+    expect(prompt).toContain('VISUAL IDENTIFICATION');
+    expect(prompt).toContain('analyze_screenshot');
+    expect(prompt).toContain('click_at');
+  });
+
+  it('should exclude visual section when vision is disabled', () => {
+    const prompt = buildBrowserSystemPrompt(false);
+    expect(prompt).not.toContain('VISUAL IDENTIFICATION');
+    expect(prompt).not.toContain('analyze_screenshot');
+  });
+
+  it('should always include core sections regardless of vision', () => {
+    for (const visionEnabled of [true, false]) {
+      const prompt = buildBrowserSystemPrompt(visionEnabled);
+      expect(prompt).toContain('PARALLEL TOOL CALLS');
+      expect(prompt).toContain('OVERLAY/POPUP HANDLING');
+      expect(prompt).toContain('COMPLEX WEB APPS');
+      expect(prompt).toContain('TERMINAL FAILURES');
+      expect(prompt).toContain('complete_task');
+    }
+  });
+});
@@ -131,7 +131,7 @@ export async function createBrowserAgentDefinition(

  // Create configured definition with tools
  // BrowserAgentDefinition is a factory function - call it with config
-  const baseDefinition = BrowserAgentDefinition(config);
+  const baseDefinition = BrowserAgentDefinition(config, !visionDisabledReason);
  const definition: LocalAgentDefinition<typeof BrowserTaskResultSchema> = {
    ...baseDefinition,
    toolConfig: {
@@ -464,7 +464,7 @@ function augmentToolDescription(toolName: string, description: string): string {
      ' Fills multiple standard HTML form fields at once. Same limitations as fill — does not work on canvas/custom widgets.',
    fill: ' Fills standard HTML form fields (<input>, <textarea>, <select>) by uid. Does NOT work on custom/canvas-based widgets (e.g., Google Sheets cells, Notion blocks). If fill times out or fails, click the element first then use press_key with individual characters instead.',
    click_at:
-      ' Clicks at exact pixel coordinates (x, y). Use with analyze_screenshot to find coordinates of visual elements.',
+      ' Clicks at exact pixel coordinates (x, y). Use when you have specific coordinates for visual elements.',
    click:
      ' Use the element uid from the accessibility tree snapshot (e.g., uid="87_4"). UIDs are invalidated after this action — call take_snapshot before using another uid.',
    hover: