fix(browser): exclude visual prompt section when vision is disabled

The system prompt always included the VISUAL IDENTIFICATION section
telling the model about analyze_screenshot, even when visualModel was
not configured. This caused the model to attempt calling the tool
despite it not being registered.

- Convert BROWSER_SYSTEM_PROMPT to buildBrowserSystemPrompt(visionEnabled)
- Pass vision state from factory to definition builder
- Remove analyze_screenshot reference from click_at tool description
- Add tests for conditional prompt inclusion/exclusion
- Fix misleading test comment about tool count
This commit is contained in:
Gaurav Ghosh
2026-02-17 11:58:18 -08:00
parent 4e2856c4dd
commit 7718709f01
4 changed files with 119 additions and 13 deletions
@@ -39,11 +39,23 @@ export const BrowserTaskResultSchema = z.object({
.describe('Optional extracted data from the task'),
});
/**
 * Prompt section that explains the analyze_screenshot workflow to the model.
 *
 * It is interpolated into the browser agent system prompt only when vision is
 * enabled; when vision is disabled an empty string is used in its place, so
 * the model is never told about a tool that may not be registered.
 */
const VISUAL_SECTION = `
VISUAL IDENTIFICATION (analyze_screenshot):
When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates:
1. Call analyze_screenshot with a clear instruction describing what to find
2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions
3. Use the returned coordinates with click_at(x, y) or other tools yourself
4. If the analysis is insufficient, call it again with a more specific instruction
`;
/**
* Builds the system prompt for the semantic browser agent.
* Extracted from prototype (computer_use_subagent_cdt branch).
*
* @param visionEnabled Whether visual tools (analyze_screenshot, click_at) are available.
*/
export const BROWSER_SYSTEM_PROMPT = `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request.
export function buildBrowserSystemPrompt(visionEnabled: boolean): string {
return `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request.
IMPORTANT: You will receive an accessibility tree snapshot showing elements with uid values (e.g., uid=87_4 button "Login").
Use these uid values directly with your tools:
@@ -63,13 +75,7 @@ Before interacting with page content, scan the accessibility tree for blocking o
- Common patterns: elements with role="dialog", role="tooltip", role="alertdialog", or aria-modal="true"
- If you see such elements, DISMISS THEM FIRST by clicking close/dismiss buttons before proceeding
- If a click seems to have no effect, check if an overlay appeared or is blocking the target
VISUAL IDENTIFICATION (analyze_screenshot):
When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates:
1. Call analyze_screenshot with a clear instruction describing what to find
2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions
3. Use the returned coordinates with click_at(x, y) or other tools yourself
4. If the analysis is insufficient, call it again with a more specific instruction
${visionEnabled ? VISUAL_SECTION : ''}
COMPLEX WEB APPS (spreadsheets, rich editors, canvas apps):
Many web apps (Google Sheets/Docs, Notion, Figma, etc.) use custom rendering rather than standard HTML inputs.
@@ -87,6 +93,7 @@ Some errors are unrecoverable and retrying will never help. When you see ANY of
Do NOT keep retrying terminal errors. Report them with actionable remediation steps and exit immediately.
CRITICAL: When you have fully completed the user's task, you MUST call the complete_task tool with a summary of what you accomplished. Do NOT just return text - you must explicitly call complete_task to exit the loop.`;
}
/**
* Browser Agent Definition Factory.
@@ -98,6 +105,7 @@ CRITICAL: When you have fully completed the user's task, you MUST call the compl
*/
export const BrowserAgentDefinition = (
config: Config,
visionEnabled = false,
): LocalAgentDefinition<typeof BrowserTaskResultSchema> => {
// Use Preview Flash model if the main model is any of the preview models.
// If the main model is not a preview model, use the default flash model.
@@ -161,7 +169,7 @@ export const BrowserAgentDefinition = (
</task>
First, use new_page to open the relevant URL. Then call take_snapshot to see the page and proceed with your task.`,
systemPrompt: BROWSER_SYSTEM_PROMPT,
systemPrompt: buildBrowserSystemPrompt(visionEnabled),
},
};
};
@@ -43,6 +43,11 @@ vi.mock('../../utils/debugLogger.js', () => ({
},
}));
import {
buildBrowserSystemPrompt,
BROWSER_AGENT_NAME,
} from './browserAgentDefinition.js';
describe('browserAgentFactory', () => {
let mockConfig: Config;
let mockMessageBus: MessageBus;
@@ -100,8 +105,8 @@ describe('browserAgentFactory', () => {
mockMessageBus,
);
expect(definition.name).toBe('browser_agent');
// 5 MCP tools + 1 analyze_screenshot tool
expect(definition.name).toBe(BROWSER_AGENT_NAME);
// 5 MCP tools + 1 type_text composite tool (no analyze_screenshot without visualModel)
expect(definition.toolConfig?.tools).toHaveLength(6);
});
@@ -137,6 +142,73 @@ describe('browserAgentFactory', () => {
expect(definition.outputConfig).toBeDefined();
expect(definition.promptConfig).toBeDefined();
});
it('should exclude visual prompt section when visualModel is not configured', async () => {
  // Uses the shared mockConfig, which is set up outside this block and is
  // expected to carry no visualModel (NOTE(review): confirm in beforeEach).
  const { definition } = await createBrowserAgentDefinition(
    mockConfig,
    mockMessageBus,
  );
  const systemPrompt = definition.promptConfig?.systemPrompt ?? '';
  // Guard against a vacuous pass: if promptConfig/systemPrompt were missing,
  // the '' fallback would trivially satisfy the negative assertions below.
  expect(systemPrompt).not.toBe('');
  expect(systemPrompt).not.toContain('analyze_screenshot');
  expect(systemPrompt).not.toContain('VISUAL IDENTIFICATION');
});
it('should include visual prompt section when visualModel is configured', async () => {
  // Supplying a visualModel in the browser agent's custom config turns vision on.
  const configWithVision = makeFakeConfig({
    agents: {
      overrides: {
        browser_agent: {
          enabled: true,
          customConfig: {
            headless: false,
            visualModel: 'gemini-2.5-flash-preview',
          },
        },
      },
    },
  });

  const created = await createBrowserAgentDefinition(
    configWithVision,
    mockMessageBus,
  );
  const prompt = created.definition.promptConfig?.systemPrompt ?? '';

  // Both the tool name and the section heading must appear in the prompt.
  for (const marker of ['analyze_screenshot', 'VISUAL IDENTIFICATION']) {
    expect(prompt).toContain(marker);
  }
});
it('should include analyze_screenshot tool when visualModel is configured', async () => {
  // Same vision-enabled configuration shape as the prompt test: a visualModel
  // is supplied through the browser agent's custom config.
  const configWithVision = makeFakeConfig({
    agents: {
      overrides: {
        browser_agent: {
          enabled: true,
          customConfig: {
            headless: false,
            visualModel: 'gemini-2.5-flash-preview',
          },
        },
      },
    },
  });

  const { definition } = await createBrowserAgentDefinition(
    configWithVision,
    mockMessageBus,
  );
  const registeredTools = definition.toolConfig?.tools;

  // 5 MCP tools + 1 type_text + 1 analyze_screenshot
  expect(registeredTools).toHaveLength(7);

  // Only object entries that expose a `name` are considered when collecting names.
  const toolNames =
    registeredTools
      ?.filter(
        (tool): tool is { name: string } =>
          typeof tool === 'object' && 'name' in tool,
      )
      .map((tool) => tool.name) ?? [];
  expect(toolNames).toContain('analyze_screenshot');
});
});
describe('cleanupBrowserAgent', () => {
@@ -158,3 +230,29 @@ describe('browserAgentFactory', () => {
});
});
});
// Unit tests for the prompt builder itself, independent of the agent factory.
describe('buildBrowserSystemPrompt', () => {
  it('should include visual section when vision is enabled', () => {
    const withVision = buildBrowserSystemPrompt(true);
    // Section heading plus the tool names mentioned inside the visual section.
    for (const expected of [
      'VISUAL IDENTIFICATION',
      'analyze_screenshot',
      'click_at',
    ]) {
      expect(withVision).toContain(expected);
    }
  });

  it('should exclude visual section when vision is disabled', () => {
    const withoutVision = buildBrowserSystemPrompt(false);
    for (const forbidden of ['VISUAL IDENTIFICATION', 'analyze_screenshot']) {
      expect(withoutVision).not.toContain(forbidden);
    }
  });

  it('should always include core sections regardless of vision', () => {
    // Markers that must be present whether or not the visual section is added.
    const coreMarkers = [
      'PARALLEL TOOL CALLS',
      'OVERLAY/POPUP HANDLING',
      'COMPLEX WEB APPS',
      'TERMINAL FAILURES',
      'complete_task',
    ];
    [true, false].forEach((visionEnabled) => {
      const prompt = buildBrowserSystemPrompt(visionEnabled);
      coreMarkers.forEach((marker) => {
        expect(prompt).toContain(marker);
      });
    });
  });
});
@@ -131,7 +131,7 @@ export async function createBrowserAgentDefinition(
// Create configured definition with tools
// BrowserAgentDefinition is a factory function - call it with config
const baseDefinition = BrowserAgentDefinition(config);
const baseDefinition = BrowserAgentDefinition(config, !visionDisabledReason);
const definition: LocalAgentDefinition<typeof BrowserTaskResultSchema> = {
...baseDefinition,
toolConfig: {
@@ -464,7 +464,7 @@ function augmentToolDescription(toolName: string, description: string): string {
' Fills multiple standard HTML form fields at once. Same limitations as fill — does not work on canvas/custom widgets.',
fill: ' Fills standard HTML form fields (<input>, <textarea>, <select>) by uid. Does NOT work on custom/canvas-based widgets (e.g., Google Sheets cells, Notion blocks). If fill times out or fails, click the element first then use press_key with individual characters instead.',
click_at:
' Clicks at exact pixel coordinates (x, y). Use with analyze_screenshot to find coordinates of visual elements.',
' Clicks at exact pixel coordinates (x, y). Use when you have specific coordinates for visual elements.',
click:
' Use the element uid from the accessibility tree snapshot (e.g., uid="87_4"). UIDs are invalidated after this action — call take_snapshot before using another uid.',
hover: