diff --git a/packages/core/src/agents/browser/browserAgentDefinition.ts b/packages/core/src/agents/browser/browserAgentDefinition.ts index 84cb4fc5a9..8d0f3799c2 100644 --- a/packages/core/src/agents/browser/browserAgentDefinition.ts +++ b/packages/core/src/agents/browser/browserAgentDefinition.ts @@ -39,11 +39,23 @@ export const BrowserTaskResultSchema = z.object({ .describe('Optional extracted data from the task'), }); +const VISUAL_SECTION = ` +VISUAL IDENTIFICATION (analyze_screenshot): +When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates: +1. Call analyze_screenshot with a clear instruction describing what to find +2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions +3. Use the returned coordinates with click_at(x, y) or other tools yourself +4. If the analysis is insufficient, call it again with a more specific instruction +`; + /** * System prompt for the semantic browser agent. * Extracted from prototype (computer_use_subagent_cdt branch). + * + * @param visionEnabled Whether visual tools (analyze_screenshot, click_at) are available. */ -export const BROWSER_SYSTEM_PROMPT = `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request. +export function buildBrowserSystemPrompt(visionEnabled: boolean): string { + return `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request. IMPORTANT: You will receive an accessibility tree snapshot showing elements with uid values (e.g., uid=87_4 button "Login"). Use these uid values directly with your tools: @@ -63,13 +75,7 @@ Before interacting with page content, scan the accessibility tree for blocking o - Common patterns: elements with role="dialog", role="tooltip", role="alertdialog", or aria-modal="true" - If you see such elements, DISMISS THEM FIRST by clicking close/dismiss buttons before proceeding - If a click seems to have no effect, check if an overlay appeared or is blocking the target - -VISUAL IDENTIFICATION (analyze_screenshot): -When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates: -1. Call analyze_screenshot with a clear instruction describing what to find -2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions -3. Use the returned coordinates with click_at(x, y) or other tools yourself -4. If the analysis is insufficient, call it again with a more specific instruction +${visionEnabled ? VISUAL_SECTION : ''} COMPLEX WEB APPS (spreadsheets, rich editors, canvas apps): Many web apps (Google Sheets/Docs, Notion, Figma, etc.) use custom rendering rather than standard HTML inputs. @@ -87,6 +93,7 @@ Some errors are unrecoverable and retrying will never help. When you see ANY of Do NOT keep retrying terminal errors. Report them with actionable remediation steps and exit immediately. CRITICAL: When you have fully completed the user's task, you MUST call the complete_task tool with a summary of what you accomplished. Do NOT just return text - you must explicitly call complete_task to exit the loop.`; +} /** * Browser Agent Definition Factory. @@ -98,6 +105,7 @@ CRITICAL: When you have fully completed the user's task, you MUST call the compl */ export const BrowserAgentDefinition = ( config: Config, + visionEnabled = false, ): LocalAgentDefinition => { // Use Preview Flash model if the main model is any of the preview models. // If the main model is not a preview model, use the default flash model. @@ -161,7 +169,7 @@ export const BrowserAgentDefinition = ( First, use new_page to open the relevant URL. Then call take_snapshot to see the page and proceed with your task.`, - systemPrompt: BROWSER_SYSTEM_PROMPT, + systemPrompt: buildBrowserSystemPrompt(visionEnabled), }, }; }; diff --git a/packages/core/src/agents/browser/browserAgentFactory.test.ts b/packages/core/src/agents/browser/browserAgentFactory.test.ts index e8deeeee8d..3a128ad4c6 100644 --- a/packages/core/src/agents/browser/browserAgentFactory.test.ts +++ b/packages/core/src/agents/browser/browserAgentFactory.test.ts @@ -43,6 +43,11 @@ vi.mock('../../utils/debugLogger.js', () => ({ }, })); +import { + buildBrowserSystemPrompt, + BROWSER_AGENT_NAME, +} from './browserAgentDefinition.js'; + describe('browserAgentFactory', () => { let mockConfig: Config; let mockMessageBus: MessageBus; @@ -100,8 +105,8 @@ describe('browserAgentFactory', () => { mockMessageBus, ); - expect(definition.name).toBe('browser_agent'); - // 5 MCP tools + 1 analyze_screenshot tool + expect(definition.name).toBe(BROWSER_AGENT_NAME); + // 5 MCP tools + 1 type_text composite tool (no analyze_screenshot without visualModel) expect(definition.toolConfig?.tools).toHaveLength(6); }); @@ -137,6 +142,73 @@ describe('browserAgentFactory', () => { expect(definition.outputConfig).toBeDefined(); expect(definition.promptConfig).toBeDefined(); }); + + it('should exclude visual prompt section when visualModel is not configured', async () => { + const { definition } = await createBrowserAgentDefinition( + mockConfig, + mockMessageBus, + ); + + const systemPrompt = definition.promptConfig?.systemPrompt ?? ''; + expect(systemPrompt).not.toContain('analyze_screenshot'); + expect(systemPrompt).not.toContain('VISUAL IDENTIFICATION'); + }); + + it('should include visual prompt section when visualModel is configured', async () => { + const configWithVision = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + customConfig: { + headless: false, + visualModel: 'gemini-2.5-flash-preview', + }, + }, + }, + }, + }); + + const { definition } = await createBrowserAgentDefinition( + configWithVision, + mockMessageBus, + ); + + const systemPrompt = definition.promptConfig?.systemPrompt ?? ''; + expect(systemPrompt).toContain('analyze_screenshot'); + expect(systemPrompt).toContain('VISUAL IDENTIFICATION'); + }); + + it('should include analyze_screenshot tool when visualModel is configured', async () => { + const configWithVision = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + customConfig: { + headless: false, + visualModel: 'gemini-2.5-flash-preview', + }, + }, + }, + }, + }); + + const { definition } = await createBrowserAgentDefinition( + configWithVision, + mockMessageBus, + ); + + // 5 MCP tools + 1 type_text + 1 analyze_screenshot + expect(definition.toolConfig?.tools).toHaveLength(7); + const toolNames = + definition.toolConfig?.tools + ?.filter( + (t): t is { name: string } => typeof t === 'object' && 'name' in t, + ) + .map((t) => t.name) ?? []; + expect(toolNames).toContain('analyze_screenshot'); + }); }); describe('cleanupBrowserAgent', () => { @@ -158,3 +230,29 @@ describe('browserAgentFactory', () => { }); }); }); + +describe('buildBrowserSystemPrompt', () => { + it('should include visual section when vision is enabled', () => { + const prompt = buildBrowserSystemPrompt(true); + expect(prompt).toContain('VISUAL IDENTIFICATION'); + expect(prompt).toContain('analyze_screenshot'); + expect(prompt).toContain('click_at'); + }); + + it('should exclude visual section when vision is disabled', () => { + const prompt = buildBrowserSystemPrompt(false); + expect(prompt).not.toContain('VISUAL IDENTIFICATION'); + expect(prompt).not.toContain('analyze_screenshot'); + }); + + it('should always include core sections regardless of vision', () => { + for (const visionEnabled of [true, false]) { + const prompt = buildBrowserSystemPrompt(visionEnabled); + expect(prompt).toContain('PARALLEL TOOL CALLS'); + expect(prompt).toContain('OVERLAY/POPUP HANDLING'); + expect(prompt).toContain('COMPLEX WEB APPS'); + expect(prompt).toContain('TERMINAL FAILURES'); + expect(prompt).toContain('complete_task'); + } + }); +}); diff --git a/packages/core/src/agents/browser/browserAgentFactory.ts b/packages/core/src/agents/browser/browserAgentFactory.ts index cacc15760b..8b6fabf45f 100644 --- a/packages/core/src/agents/browser/browserAgentFactory.ts +++ b/packages/core/src/agents/browser/browserAgentFactory.ts @@ -131,7 +131,7 @@ export async function createBrowserAgentDefinition( // Create configured definition with tools // BrowserAgentDefinition is a factory function - call it with config - const baseDefinition = BrowserAgentDefinition(config); + const baseDefinition = BrowserAgentDefinition(config, !visionDisabledReason); const definition: LocalAgentDefinition = { ...baseDefinition, toolConfig: { diff --git a/packages/core/src/agents/browser/mcpToolWrapper.ts b/packages/core/src/agents/browser/mcpToolWrapper.ts index 163f42a3ff..8572ca9d21 100644 --- a/packages/core/src/agents/browser/mcpToolWrapper.ts +++ b/packages/core/src/agents/browser/mcpToolWrapper.ts @@ -464,7 +464,7 @@ function augmentToolDescription(toolName: string, description: string): string { ' Fills multiple standard HTML form fields at once. Same limitations as fill — does not work on canvas/custom widgets.', fill: ' Fills standard HTML form fields (,