mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-24 18:52:29 -07:00
fix(browser): exclude visual prompt section when vision is disabled
The system prompt always included the VISUAL IDENTIFICATION section, which tells the model about analyze_screenshot, even when visualModel was not configured. This caused the model to attempt to call the tool even though it was not registered.

- Convert BROWSER_SYSTEM_PROMPT to buildBrowserSystemPrompt(visionEnabled)
- Pass vision state from factory to definition builder
- Remove analyze_screenshot reference from click_at tool description
- Add tests for conditional prompt inclusion/exclusion
- Fix misleading test comment about tool count
This commit is contained in:
@@ -39,11 +39,23 @@ export const BrowserTaskResultSchema = z.object({
|
||||
.describe('Optional extracted data from the task'),
|
||||
});
|
||||
|
||||
const VISUAL_SECTION = `
|
||||
VISUAL IDENTIFICATION (analyze_screenshot):
|
||||
When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates:
|
||||
1. Call analyze_screenshot with a clear instruction describing what to find
|
||||
2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions
|
||||
3. Use the returned coordinates with click_at(x, y) or other tools yourself
|
||||
4. If the analysis is insufficient, call it again with a more specific instruction
|
||||
`;
|
||||
|
||||
/**
|
||||
* System prompt for the semantic browser agent.
|
||||
* Extracted from prototype (computer_use_subagent_cdt branch).
|
||||
*
|
||||
* @param visionEnabled Whether visual tools (analyze_screenshot, click_at) are available.
|
||||
*/
|
||||
export const BROWSER_SYSTEM_PROMPT = `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request.
|
||||
export function buildBrowserSystemPrompt(visionEnabled: boolean): string {
|
||||
return `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request.
|
||||
|
||||
IMPORTANT: You will receive an accessibility tree snapshot showing elements with uid values (e.g., uid=87_4 button "Login").
|
||||
Use these uid values directly with your tools:
|
||||
@@ -63,13 +75,7 @@ Before interacting with page content, scan the accessibility tree for blocking o
|
||||
- Common patterns: elements with role="dialog", role="tooltip", role="alertdialog", or aria-modal="true"
|
||||
- If you see such elements, DISMISS THEM FIRST by clicking close/dismiss buttons before proceeding
|
||||
- If a click seems to have no effect, check if an overlay appeared or is blocking the target
|
||||
|
||||
VISUAL IDENTIFICATION (analyze_screenshot):
|
||||
When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates:
|
||||
1. Call analyze_screenshot with a clear instruction describing what to find
|
||||
2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions
|
||||
3. Use the returned coordinates with click_at(x, y) or other tools yourself
|
||||
4. If the analysis is insufficient, call it again with a more specific instruction
|
||||
${visionEnabled ? VISUAL_SECTION : ''}
|
||||
|
||||
COMPLEX WEB APPS (spreadsheets, rich editors, canvas apps):
|
||||
Many web apps (Google Sheets/Docs, Notion, Figma, etc.) use custom rendering rather than standard HTML inputs.
|
||||
@@ -87,6 +93,7 @@ Some errors are unrecoverable and retrying will never help. When you see ANY of
|
||||
Do NOT keep retrying terminal errors. Report them with actionable remediation steps and exit immediately.
|
||||
|
||||
CRITICAL: When you have fully completed the user's task, you MUST call the complete_task tool with a summary of what you accomplished. Do NOT just return text - you must explicitly call complete_task to exit the loop.`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Browser Agent Definition Factory.
|
||||
@@ -98,6 +105,7 @@ CRITICAL: When you have fully completed the user's task, you MUST call the compl
|
||||
*/
|
||||
export const BrowserAgentDefinition = (
|
||||
config: Config,
|
||||
visionEnabled = false,
|
||||
): LocalAgentDefinition<typeof BrowserTaskResultSchema> => {
|
||||
// Use Preview Flash model if the main model is any of the preview models.
|
||||
// If the main model is not a preview model, use the default flash model.
|
||||
@@ -161,7 +169,7 @@ export const BrowserAgentDefinition = (
|
||||
</task>
|
||||
|
||||
First, use new_page to open the relevant URL. Then call take_snapshot to see the page and proceed with your task.`,
|
||||
systemPrompt: BROWSER_SYSTEM_PROMPT,
|
||||
systemPrompt: buildBrowserSystemPrompt(visionEnabled),
|
||||
},
|
||||
};
|
||||
};
|
||||
|
||||
@@ -43,6 +43,11 @@ vi.mock('../../utils/debugLogger.js', () => ({
|
||||
},
|
||||
}));
|
||||
|
||||
import {
|
||||
buildBrowserSystemPrompt,
|
||||
BROWSER_AGENT_NAME,
|
||||
} from './browserAgentDefinition.js';
|
||||
|
||||
describe('browserAgentFactory', () => {
|
||||
let mockConfig: Config;
|
||||
let mockMessageBus: MessageBus;
|
||||
@@ -100,8 +105,8 @@ describe('browserAgentFactory', () => {
|
||||
mockMessageBus,
|
||||
);
|
||||
|
||||
expect(definition.name).toBe('browser_agent');
|
||||
// 5 MCP tools + 1 analyze_screenshot tool
|
||||
expect(definition.name).toBe(BROWSER_AGENT_NAME);
|
||||
// 5 MCP tools + 1 type_text composite tool (no analyze_screenshot without visualModel)
|
||||
expect(definition.toolConfig?.tools).toHaveLength(6);
|
||||
});
|
||||
|
||||
@@ -137,6 +142,73 @@ describe('browserAgentFactory', () => {
|
||||
expect(definition.outputConfig).toBeDefined();
|
||||
expect(definition.promptConfig).toBeDefined();
|
||||
});
|
||||
|
||||
it('should exclude visual prompt section when visualModel is not configured', async () => {
|
||||
const { definition } = await createBrowserAgentDefinition(
|
||||
mockConfig,
|
||||
mockMessageBus,
|
||||
);
|
||||
|
||||
const systemPrompt = definition.promptConfig?.systemPrompt ?? '';
|
||||
expect(systemPrompt).not.toContain('analyze_screenshot');
|
||||
expect(systemPrompt).not.toContain('VISUAL IDENTIFICATION');
|
||||
});
|
||||
|
||||
it('should include visual prompt section when visualModel is configured', async () => {
|
||||
const configWithVision = makeFakeConfig({
|
||||
agents: {
|
||||
overrides: {
|
||||
browser_agent: {
|
||||
enabled: true,
|
||||
customConfig: {
|
||||
headless: false,
|
||||
visualModel: 'gemini-2.5-flash-preview',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const { definition } = await createBrowserAgentDefinition(
|
||||
configWithVision,
|
||||
mockMessageBus,
|
||||
);
|
||||
|
||||
const systemPrompt = definition.promptConfig?.systemPrompt ?? '';
|
||||
expect(systemPrompt).toContain('analyze_screenshot');
|
||||
expect(systemPrompt).toContain('VISUAL IDENTIFICATION');
|
||||
});
|
||||
|
||||
it('should include analyze_screenshot tool when visualModel is configured', async () => {
|
||||
const configWithVision = makeFakeConfig({
|
||||
agents: {
|
||||
overrides: {
|
||||
browser_agent: {
|
||||
enabled: true,
|
||||
customConfig: {
|
||||
headless: false,
|
||||
visualModel: 'gemini-2.5-flash-preview',
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const { definition } = await createBrowserAgentDefinition(
|
||||
configWithVision,
|
||||
mockMessageBus,
|
||||
);
|
||||
|
||||
// 5 MCP tools + 1 type_text + 1 analyze_screenshot
|
||||
expect(definition.toolConfig?.tools).toHaveLength(7);
|
||||
const toolNames =
|
||||
definition.toolConfig?.tools
|
||||
?.filter(
|
||||
(t): t is { name: string } => typeof t === 'object' && 'name' in t,
|
||||
)
|
||||
.map((t) => t.name) ?? [];
|
||||
expect(toolNames).toContain('analyze_screenshot');
|
||||
});
|
||||
});
|
||||
|
||||
describe('cleanupBrowserAgent', () => {
|
||||
@@ -158,3 +230,29 @@ describe('browserAgentFactory', () => {
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('buildBrowserSystemPrompt', () => {
|
||||
it('should include visual section when vision is enabled', () => {
|
||||
const prompt = buildBrowserSystemPrompt(true);
|
||||
expect(prompt).toContain('VISUAL IDENTIFICATION');
|
||||
expect(prompt).toContain('analyze_screenshot');
|
||||
expect(prompt).toContain('click_at');
|
||||
});
|
||||
|
||||
it('should exclude visual section when vision is disabled', () => {
|
||||
const prompt = buildBrowserSystemPrompt(false);
|
||||
expect(prompt).not.toContain('VISUAL IDENTIFICATION');
|
||||
expect(prompt).not.toContain('analyze_screenshot');
|
||||
});
|
||||
|
||||
it('should always include core sections regardless of vision', () => {
|
||||
for (const visionEnabled of [true, false]) {
|
||||
const prompt = buildBrowserSystemPrompt(visionEnabled);
|
||||
expect(prompt).toContain('PARALLEL TOOL CALLS');
|
||||
expect(prompt).toContain('OVERLAY/POPUP HANDLING');
|
||||
expect(prompt).toContain('COMPLEX WEB APPS');
|
||||
expect(prompt).toContain('TERMINAL FAILURES');
|
||||
expect(prompt).toContain('complete_task');
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -131,7 +131,7 @@ export async function createBrowserAgentDefinition(
|
||||
|
||||
// Create configured definition with tools
|
||||
// BrowserAgentDefinition is a factory function - call it with config
|
||||
const baseDefinition = BrowserAgentDefinition(config);
|
||||
const baseDefinition = BrowserAgentDefinition(config, !visionDisabledReason);
|
||||
const definition: LocalAgentDefinition<typeof BrowserTaskResultSchema> = {
|
||||
...baseDefinition,
|
||||
toolConfig: {
|
||||
|
||||
@@ -464,7 +464,7 @@ function augmentToolDescription(toolName: string, description: string): string {
|
||||
' Fills multiple standard HTML form fields at once. Same limitations as fill — does not work on canvas/custom widgets.',
|
||||
fill: ' Fills standard HTML form fields (<input>, <textarea>, <select>) by uid. Does NOT work on custom/canvas-based widgets (e.g., Google Sheets cells, Notion blocks). If fill times out or fails, click the element first then use press_key with individual characters instead.',
|
||||
click_at:
|
||||
' Clicks at exact pixel coordinates (x, y). Use with analyze_screenshot to find coordinates of visual elements.',
|
||||
' Clicks at exact pixel coordinates (x, y). Use when you have specific coordinates for visual elements.',
|
||||
click:
|
||||
' Use the element uid from the accessibility tree snapshot (e.g., uid="87_4"). UIDs are invalidated after this action — call take_snapshot before using another uid.',
|
||||
hover:
|
||||
|
||||
Reference in New Issue
Block a user