feat(browser): implement visual agent for coordinate-based interactions

Implement the visual agent using the LocalAgentDefinition pattern:
- VisualAgentDefinition: Agent metadata for coordinate-based visual tasks
- delegateToVisualAgent.ts: Tool for semantic agent to delegate visual tasks
- Uses gemini-2.5-computer-use-preview-10-2025 model for Computer Use capability

The visual agent handles tasks requiring visual identification or precise
coordinate-based actions that cannot be done via the accessibility tree.
This commit is contained in:
Gaurav Ghosh
2026-01-08 05:14:37 +00:00
parent f4100baf6b
commit 1620c7d82f
12 changed files with 871 additions and 37 deletions
@@ -0,0 +1,246 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { createAnalyzeScreenshotTool } from './analyzeScreenshot.js';
import type { BrowserManager, McpToolCallResult } from './browserManager.js';
import type { Config } from '../../config/config.js';
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
// Minimal MessageBus test double shared by every test in this file. Only
// `waitForConfirmation` is stubbed, and it always resolves to an approved
// confirmation; the double cast is needed because the object implements
// nothing else of the MessageBus interface.
const mockMessageBus = {
waitForConfirmation: vi.fn().mockResolvedValue({ approved: true }),
} as unknown as MessageBus;
/**
 * Builds a BrowserManager test double exposing only `callTool`.
 *
 * @param callToolResult MCP result that `callTool` should resolve to. When
 *   omitted, a default [text, image] screenshot payload is used.
 * @returns An object cast to BrowserManager whose `callTool` is a vitest mock.
 */
function createMockBrowserManager(
  callToolResult?: McpToolCallResult,
): BrowserManager {
  // Default payload: a text description followed by the PNG image item.
  const defaultScreenshotResult = {
    content: [
      { type: 'text', text: 'Screenshot captured' },
      {
        type: 'image',
        data: 'base64encodeddata',
        mimeType: 'image/png',
      },
    ],
  };

  const callTool = vi
    .fn()
    .mockResolvedValue(callToolResult ?? defaultScreenshotResult);

  const manager = { callTool };
  return manager as unknown as BrowserManager;
}
/**
 * Builds a Config test double for the visual-analysis path.
 *
 * The double reports 'test-visual-model' as the configured visual model and
 * returns a content generator whose `generateContent` mock either rejects
 * with `generateContentError` or resolves with `generateContentResult`.
 *
 * @param generateContentResult Response to resolve with; defaults to a single
 *   candidate containing one text part.
 * @param generateContentError When provided, `generateContent` rejects with
 *   this error and `generateContentResult` is ignored.
 */
function createMockConfig(
  generateContentResult?: unknown,
  generateContentError?: Error,
): Config {
  const defaultResponse = {
    candidates: [
      {
        content: {
          parts: [
            {
              text: 'The blue submit button is at coordinates (250, 400).',
            },
          ],
        },
      },
    ],
  };

  const generateContent = vi.fn();
  if (generateContentError) {
    generateContent.mockRejectedValue(generateContentError);
  } else {
    generateContent.mockResolvedValue(
      generateContentResult ?? defaultResponse,
    );
  }

  const browserAgentConfig = {
    customConfig: { visualModel: 'test-visual-model' },
  };
  const configDouble = {
    getBrowserAgentConfig: vi.fn().mockReturnValue(browserAgentConfig),
    getContentGenerator: vi.fn().mockReturnValue({ generateContent }),
  };
  return configDouble as unknown as Config;
}
describe('analyzeScreenshot', () => {
  /**
   * Creates the tool from the supplied doubles, builds an invocation for
   * `instruction`, and executes it with a fresh, never-aborted signal.
   */
  const runAnalysis = (
    browserManager: BrowserManager,
    config: Config,
    instruction: string,
  ) =>
    createAnalyzeScreenshotTool(browserManager, config, mockMessageBus)
      .build({ instruction })
      .execute(new AbortController().signal);

  beforeEach(() => {
    vi.clearAllMocks();
  });

  describe('createAnalyzeScreenshotTool', () => {
    it('creates a tool with the correct name and schema', () => {
      const tool = createAnalyzeScreenshotTool(
        createMockBrowserManager(),
        createMockConfig(),
        mockMessageBus,
      );

      expect(tool.name).toBe('analyze_screenshot');
    });
  });

  describe('AnalyzeScreenshotInvocation', () => {
    it('captures a screenshot and returns visual analysis', async () => {
      const browserManager = createMockBrowserManager();
      const config = createMockConfig();

      const result = await runAnalysis(
        browserManager,
        config,
        'Find the blue submit button',
      );

      // The screenshot must come from the MCP take_screenshot tool.
      expect(browserManager.callTool).toHaveBeenCalledWith(
        'take_screenshot',
        {},
      );

      // The visual model must receive the image as inline data in a user turn.
      const expectedImagePart = expect.objectContaining({
        inlineData: {
          mimeType: 'image/png',
          data: 'base64encodeddata',
        },
      });
      const expectedUserTurn = expect.objectContaining({
        role: 'user',
        parts: expect.arrayContaining([expectedImagePart]),
      });
      const generator = config.getContentGenerator();
      expect(generator.generateContent).toHaveBeenCalledWith(
        expect.objectContaining({
          model: 'test-visual-model',
          contents: expect.arrayContaining([expectedUserTurn]),
        }),
        'visual-analysis',
      );

      // The model's text is surfaced to the caller without an error.
      expect(result.llmContent).toContain('Visual Analysis Result');
      expect(result.llmContent).toContain(
        'The blue submit button is at coordinates (250, 400).',
      );
      expect(result.error).toBeUndefined();
    });

    it('returns an error when screenshot capture fails (no image)', async () => {
      const textOnlyResult = {
        content: [{ type: 'text', text: 'No screenshot available' }],
      };
      const browserManager = createMockBrowserManager(textOnlyResult);
      const config = createMockConfig();

      const result = await runAnalysis(
        browserManager,
        config,
        'Find the button',
      );

      expect(result.error).toBeDefined();
      expect(result.llmContent).toContain('Failed to capture screenshot');

      // Without an image there is nothing to send to the visual model.
      const generator = config.getContentGenerator();
      expect(generator.generateContent).not.toHaveBeenCalled();
    });

    it('returns an error when visual model returns empty response', async () => {
      const browserManager = createMockBrowserManager();
      const emptyResponse = { candidates: [{ content: { parts: [] } }] };
      const config = createMockConfig(emptyResponse);

      const result = await runAnalysis(
        browserManager,
        config,
        'Check the layout',
      );

      expect(result.error).toBeDefined();
      expect(result.llmContent).toContain('Visual model returned no analysis');
    });

    // Both status codes must map to the same "model unavailable" fallback.
    it.each([
      ['404', 'Model not found: 404', 'Find the red error'],
      ['403', 'permission denied: 403', 'Identify the element'],
    ])(
      'returns a model-unavailability fallback for %s errors',
      async (_status, errorMessage, instruction) => {
        const browserManager = createMockBrowserManager();
        const config = createMockConfig(undefined, new Error(errorMessage));

        const result = await runAnalysis(browserManager, config, instruction);

        expect(result.error).toBeDefined();
        expect(result.llmContent).toContain(
          'Visual analysis model is not available',
        );
      },
    );

    it('returns a generic error for non-model errors', async () => {
      const browserManager = createMockBrowserManager();
      const config = createMockConfig(undefined, new Error('Network timeout'));

      const result = await runAnalysis(
        browserManager,
        config,
        'Find something',
      );

      expect(result.error).toBeDefined();
      expect(result.llmContent).toContain('Visual analysis failed');
      expect(result.llmContent).toContain('Network timeout');
    });
  });
});
@@ -0,0 +1,254 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @fileoverview Tool for visual identification via a single model call.
*
* The semantic browser agent uses this tool when it needs to identify
* elements by visual attributes not present in the accessibility tree
* (e.g., color, layout, precise coordinates).
*
* Unlike the semantic agent which works with the accessibility tree,
* this tool sends a screenshot to a computer-use model for visual analysis.
* It returns the model's analysis (coordinates, element descriptions) back
* to the browser agent, which retains full control of subsequent actions.
*/
import {
DeclarativeTool,
BaseToolInvocation,
Kind,
type ToolResult,
type ToolInvocation,
} from '../../tools/tools.js';
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
import type { BrowserManager } from './browserManager.js';
import type { Config } from '../../config/config.js';
import { getVisualAgentModel } from './modelAvailability.js';
import { debugLogger } from '../../utils/debugLogger.js';
/**
 * System prompt for the visual analysis model call.
 *
 * Frames the model as an analysis-only agent: it receives a screenshot plus
 * an instruction and must answer with viewport-relative pixel coordinates or
 * visual descriptions. It is explicitly told not to perform actions, so the
 * caller stays responsible for acting on the answer (e.g. via click_at).
 */
const VISUAL_SYSTEM_PROMPT = `You are a Visual Analysis Agent. You receive a screenshot of a browser page and an instruction.
Your job is to ANALYZE the screenshot and provide precise information that a browser automation agent can act on.
COORDINATE SYSTEM:
- Coordinates are pixel-based relative to the viewport
- (0,0) is top-left of the visible area
- Estimate element positions from the screenshot
RESPONSE FORMAT:
- For coordinate identification: provide exact (x, y) pixel coordinates
- For element identification: describe the element's visual location and appearance
- For layout analysis: describe the spatial relationships between elements
- Be concise and actionable — the browser agent will use your response to decide what action to take
IMPORTANT:
- You are NOT performing actions — you are only providing visual analysis
- Include coordinates when possible so the caller can use click_at(x, y)
- If the element is not visible in the screenshot, say so explicitly`;
/**
 * Matches an HTTP 403/404 status code that is not embedded in a longer
 * number, so messages such as "timed out after 4040ms" are not mistaken for
 * a model-availability failure.
 */
const MODEL_UNAVAILABLE_STATUS_PATTERN = /(?<!\d)(?:403|404)(?!\d)/;

/**
 * Decides whether an error message means the visual model cannot be used
 * (missing model or insufficient permission) as opposed to a transient or
 * unrelated failure.
 *
 * Text markers are compared case-insensitively so that API status strings
 * such as "PERMISSION_DENIED" / "NOT_FOUND" and capitalised messages such as
 * "Permission denied" are recognised alongside the lowercase forms.
 *
 * @param errorMsg Message extracted from the caught error.
 * @returns true when the caller should show the "model is not available"
 *   fallback instead of the generic failure message.
 */
function isModelUnavailableError(errorMsg: string): boolean {
  const normalized = errorMsg.toLowerCase();
  return (
    MODEL_UNAVAILABLE_STATUS_PATTERN.test(errorMsg) ||
    normalized.includes('not found') ||
    normalized.includes('not_found') ||
    normalized.includes('permission')
  );
}

/**
 * Invocation for the analyze_screenshot tool.
 *
 * Captures a screenshot through the browser manager's `take_screenshot` MCP
 * tool and makes a single generateContent call with it. The model's text is
 * returned to the caller; no browser action is performed here.
 */
class AnalyzeScreenshotInvocation extends BaseToolInvocation<
  Record<string, unknown>,
  ToolResult
> {
  constructor(
    private readonly browserManager: BrowserManager,
    private readonly config: Config,
    params: Record<string, unknown>,
    messageBus: MessageBus,
  ) {
    super(params, messageBus, 'analyze_screenshot', 'Analyze Screenshot');
  }

  /**
   * Short human-readable summary of this invocation, with the instruction
   * truncated to 50 characters.
   */
  getDescription(): string {
    const instruction = String(this.params['instruction'] ?? '');
    const preview =
      instruction.length > 50
        ? instruction.substring(0, 50) + '...'
        : instruction;
    return `Visual analysis: "${preview}"`;
  }

  /**
   * Runs the visual analysis.
   *
   * @param signal Forwarded to the generateContent call as its abort signal.
   * @returns A ToolResult carrying the model's analysis, or a result with
   *   `error` set and a fallback message when the screenshot is missing, the
   *   model returns no text, or any step throws. Errors are caught and
   *   reported through the result rather than rethrown.
   */
  async execute(signal: AbortSignal): Promise<ToolResult> {
    try {
      const instruction = String(this.params['instruction'] ?? '');
      debugLogger.log(`Visual analysis requested: ${instruction}`);

      // Capture screenshot via MCP tool
      const screenshotResult = await this.browserManager.callTool(
        'take_screenshot',
        {},
      );

      // The MCP response may contain several content items (e.g. a text
      // description followed by the image), so search all of them for the
      // first image item that actually carries data.
      const contentItems = Array.isArray(screenshotResult.content)
        ? screenshotResult.content
        : [];
      const imageItem = contentItems.find(
        (item) => item.type === 'image' && item.data,
      );
      const screenshotBase64 = imageItem?.data ?? '';
      const mimeType = imageItem?.mimeType ?? 'image/png';

      if (!screenshotBase64) {
        return {
          llmContent:
            'Failed to capture screenshot for visual analysis. Use accessibility tree elements instead.',
          returnDisplay: 'Screenshot capture failed',
          error: { message: 'Screenshot capture failed' },
        };
      }

      // Make a single generateContent call with the visual model
      const visualModel = getVisualAgentModel(this.config);
      const contentGenerator = this.config.getContentGenerator();
      const response = await contentGenerator.generateContent(
        {
          model: visualModel,
          config: {
            temperature: 0,
            topP: 0.95,
            systemInstruction: VISUAL_SYSTEM_PROMPT,
            abortSignal: signal,
          },
          contents: [
            {
              role: 'user',
              parts: [
                {
                  text: `Analyze this screenshot and respond to the following instruction:\n\n${instruction}`,
                },
                {
                  inlineData: {
                    mimeType,
                    data: screenshotBase64,
                  },
                },
              ],
            },
          ],
        },
        'visual-analysis',
      );

      // Join the text parts of the first candidate; non-text parts are ignored.
      const responseText =
        response.candidates?.[0]?.content?.parts
          ?.filter((p) => p.text)
          .map((p) => p.text)
          .join('\n') ?? '';

      if (!responseText) {
        return {
          llmContent:
            'Visual model returned no analysis. Use accessibility tree elements instead.',
          returnDisplay: 'Visual analysis returned empty response',
          error: { message: 'Empty visual analysis response' },
        };
      }

      debugLogger.log(
        `Visual analysis complete: ${responseText.slice(0, 100)}`,
      );

      const analysis = `Visual Analysis Result:\n${responseText}`;
      return {
        llmContent: analysis,
        returnDisplay: analysis,
      };
    } catch (error) {
      const errorMsg = error instanceof Error ? error.message : String(error);
      debugLogger.error(`Visual analysis failed: ${errorMsg}`);

      // Model-availability failures get a dedicated message that steers the
      // agent back to accessibility-tree interactions; anything else reports
      // the underlying error text.
      const fallbackMsg = isModelUnavailableError(errorMsg)
        ? 'Visual analysis model is not available. Use accessibility tree elements (uids from take_snapshot) for all interactions instead.'
        : `Visual analysis failed: ${errorMsg}. Use accessibility tree elements instead.`;

      return {
        llmContent: fallbackMsg,
        returnDisplay: fallbackMsg,
        error: { message: errorMsg },
      };
    }
  }
}
/** Model-facing description of when and how to use analyze_screenshot. */
const ANALYZE_SCREENSHOT_DESCRIPTION =
  'Analyze the current page visually using a screenshot. Use when you need to identify elements by visual attributes (color, layout, position) not available in the accessibility tree, or when you need precise pixel coordinates for click_at. Returns visual analysis — you perform the actions yourself.';

/**
 * Builds the JSON schema for the tool's parameters: a single required
 * `instruction` string. A fresh object is returned on every call.
 */
function buildAnalyzeScreenshotParameterSchema() {
  return {
    type: 'object',
    properties: {
      instruction: {
        type: 'string',
        description:
          'What to identify or analyze visually (e.g., "Find the coordinates of the blue submit button", "What is the layout of the navigation menu?").',
      },
    },
    required: ['instruction'],
  };
}

/**
 * Declarative wrapper that exposes screenshot-based visual analysis as the
 * `analyze_screenshot` tool and creates one invocation per call.
 */
class AnalyzeScreenshotTool extends DeclarativeTool<
  Record<string, unknown>,
  ToolResult
> {
  constructor(
    private readonly browserManager: BrowserManager,
    private readonly config: Config,
    messageBus: MessageBus,
  ) {
    super(
      'analyze_screenshot', // name
      'analyze_screenshot', // display name
      ANALYZE_SCREENSHOT_DESCRIPTION,
      Kind.Other,
      buildAnalyzeScreenshotParameterSchema(),
      messageBus,
      true, // isOutputMarkdown
      false, // canUpdateOutput
    );
  }

  /**
   * Creates the invocation that will run the analysis for `params`, wired to
   * this tool's browser manager, config and message bus.
   */
  build(
    params: Record<string, unknown>,
  ): ToolInvocation<Record<string, unknown>, ToolResult> {
    const invocation = new AnalyzeScreenshotInvocation(
      this.browserManager,
      this.config,
      params,
      this.messageBus,
    );
    return invocation;
  }
}
/**
 * Factory for the analyze_screenshot tool used by the browser agent.
 *
 * @param browserManager Manager used to capture the screenshot.
 * @param config Runtime configuration supplying the visual model and content
 *   generator.
 * @param messageBus Message bus handed to the tool and its invocations.
 * @returns A new AnalyzeScreenshotTool bound to the given dependencies.
 */
export function createAnalyzeScreenshotTool(
  browserManager: BrowserManager,
  config: Config,
  messageBus: MessageBus,
): AnalyzeScreenshotTool {
  const tool = new AnalyzeScreenshotTool(browserManager, config, messageBus);
  return tool;
}
@@ -65,7 +65,12 @@ Before interacting with page content, scan the accessibility tree for blocking o
- If you see such elements, DISMISS THEM FIRST by clicking close/dismiss buttons before proceeding
- If a click seems to have no effect, check if an overlay appeared or is blocking the target
For complex visual interactions (coordinate-based clicks, dragging) OR when you need to identify elements by visual attributes not present in the AX tree (e.g., "click the yellow button", "find the red error message"), use delegate_to_visual_agent with a clear instruction.
VISUAL IDENTIFICATION (analyze_screenshot):
When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates:
1. Call analyze_screenshot with a clear instruction describing what to find
2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions
3. Use the returned coordinates with click_at(x, y) or other tools yourself
4. If the analysis is insufficient, call it again with a more specific instruction
CRITICAL: When you have fully completed the user's task, you MUST call the complete_task tool with a summary of what you accomplished. Do NOT just return text - you must explicitly call complete_task to exit the loop.`;
@@ -12,15 +12,19 @@ import {
import { makeFakeConfig } from '../../test-utils/config.js';
import type { Config } from '../../config/config.js';
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
import { BrowserAgentDefinition } from './browserAgentDefinition.js';
import type { BrowserManager } from './browserManager.js';
// Create mock browser manager
const mockBrowserManager = {
ensureConnection: vi.fn().mockResolvedValue(undefined),
getDiscoveredTools: vi.fn().mockResolvedValue([
// Semantic tools
{ name: 'take_snapshot', description: 'Take snapshot' },
{ name: 'click', description: 'Click element' },
{ name: 'fill', description: 'Fill form field' },
{ name: 'navigate_page', description: 'Navigate to URL' },
// Visual tools (from --experimental-vision)
{ name: 'click_at', description: 'Click at coordinates' },
]),
callTool: vi.fn().mockResolvedValue({ content: [] }),
close: vi.fn().mockResolvedValue(undefined),
@@ -34,6 +38,7 @@ vi.mock('./browserManager.js', () => ({
vi.mock('../../utils/debugLogger.js', () => ({
debugLogger: {
log: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
},
}));
@@ -48,8 +53,13 @@ describe('browserAgentFactory', () => {
// Reset mock implementations
mockBrowserManager.ensureConnection.mockResolvedValue(undefined);
mockBrowserManager.getDiscoveredTools.mockResolvedValue([
// Semantic tools
{ name: 'take_snapshot', description: 'Take snapshot' },
{ name: 'click', description: 'Click element' },
{ name: 'fill', description: 'Fill form field' },
{ name: 'navigate_page', description: 'Navigate to URL' },
// Visual tools (from --experimental-vision)
{ name: 'click_at', description: 'Click at coordinates' },
]);
mockBrowserManager.close.mockResolvedValue(undefined);
@@ -90,8 +100,9 @@ describe('browserAgentFactory', () => {
mockMessageBus,
);
expect(definition.name).toBe(BrowserAgentDefinition.name);
expect(definition.toolConfig?.tools).toHaveLength(2);
expect(definition.name).toBe('browser_agent');
// 5 MCP tools + 1 analyze_screenshot tool
expect(definition.toolConfig?.tools).toHaveLength(6);
});
it('should return browser manager for cleanup', async () => {
@@ -16,14 +16,17 @@
*/
import type { Config } from '../../config/config.js';
import { AuthType } from '../../core/contentGenerator.js';
import type { LocalAgentDefinition } from '../types.js';
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
import type { AnyDeclarativeTool } from '../../tools/tools.js';
import { BrowserManager } from './browserManager.js';
import {
BrowserAgentDefinition,
type BrowserTaskResultSchema,
} from './browserAgentDefinition.js';
import { createMcpDeclarativeTools } from './mcpToolWrapper.js';
import { createAnalyzeScreenshotTool } from './analyzeScreenshot.js';
import { debugLogger } from '../../utils/debugLogger.js';
/**
@@ -61,17 +64,91 @@ export async function createBrowserAgentDefinition(
// Create declarative tools from dynamically discovered MCP tools
// These tools dispatch to browserManager's isolated client
const mcpTools = await createMcpDeclarativeTools(browserManager, messageBus);
const availableToolNames = mcpTools.map((t) => t.name);
// Validate required semantic tools are available
const requiredSemanticTools = [
'click',
'fill',
'navigate_page',
'take_snapshot',
];
const missingSemanticTools = requiredSemanticTools.filter(
(t) => !availableToolNames.includes(t),
);
if (missingSemanticTools.length > 0) {
debugLogger.warn(
`Semantic tools missing (${missingSemanticTools.join(', ')}). ` +
'Some browser interactions may not work correctly.',
);
}
// Only click_at is strictly required — text input can use press_key or fill.
const requiredVisualTools = ['click_at'];
const missingVisualTools = requiredVisualTools.filter(
(t) => !availableToolNames.includes(t),
);
// Check if visual agent model is available for current auth type.
// The visual agent model (computer-use) is only available via Gemini API key
// or Vertex AI, not via GCA/OAuth or Cloud Shell.
const isVisualModelAvailable = (() => {
const authType = config.getContentGeneratorConfig()?.authType;
if (
authType === AuthType.LOGIN_WITH_GOOGLE ||
authType === AuthType.LEGACY_CLOUD_SHELL ||
authType === AuthType.COMPUTE_ADC
) {
return false;
}
return true;
})();
// Create all tools - visual delegation only if visual tools are available
const allTools: AnyDeclarativeTool[] = [...mcpTools];
if (missingVisualTools.length > 0) {
debugLogger.log(
`Visual tools missing (${missingVisualTools.join(', ')}). ` +
`Visual agent delegation disabled. Ensure chrome-devtools-mcp is started with --experimental-vision.`,
);
if (printOutput) {
printOutput(
`⚠️ Visual tools unavailable - coordinate-based actions disabled.`,
);
}
} else if (!isVisualModelAvailable) {
debugLogger.log(
`Visual agent model not available for current auth type. ` +
`Visual agent delegation disabled.`,
);
if (printOutput) {
printOutput(
`⚠️ Visual agent unavailable for current auth type - coordinate-based actions disabled.`,
);
}
} else {
// Create visual analysis tool only if visual tools are available
const visualDelegationTool = createAnalyzeScreenshotTool(
browserManager,
config,
messageBus,
);
allTools.push(visualDelegationTool);
}
debugLogger.log(
`Created ${mcpTools.length} isolated MCP tools for browser agent: ` +
mcpTools.map((t) => t.name).join(', '),
`Created ${allTools.length} tools for browser agent: ` +
allTools.map((t) => t.name).join(', '),
);
// Create configured definition with tools
// BrowserAgentDefinition is a factory function - call it with config
const baseDefinition = BrowserAgentDefinition(config);
const definition: LocalAgentDefinition<typeof BrowserTaskResultSchema> = {
...BrowserAgentDefinition,
...baseDefinition,
toolConfig: {
tools: mcpTools,
tools: allTools,
},
};
@@ -25,7 +25,6 @@ import {
createBrowserAgentDefinition,
cleanupBrowserAgent,
} from './browserAgentFactory.js';
import { BrowserAgentDefinition } from './browserAgentDefinition.js';
const INPUT_PREVIEW_MAX_LENGTH = 50;
const DESCRIPTION_MAX_LENGTH = 200;
@@ -48,11 +47,12 @@ export class BrowserAgentInvocation extends BaseToolInvocation<
_toolName?: string,
_toolDisplayName?: string,
) {
// Note: BrowserAgentDefinition is a factory function, so we use hardcoded names
super(
params,
messageBus,
_toolName ?? BrowserAgentDefinition.name,
_toolDisplayName ?? BrowserAgentDefinition.displayName,
_toolName ?? 'browser_agent',
_toolDisplayName ?? 'Browser Agent',
);
}
@@ -156,7 +156,7 @@ export class BrowserManager {
data?: string;
mimeType?: string;
}) => ({
type: (item.type === 'image' ? 'image' : 'text'),
type: item.type === 'image' ? 'image' : 'text',
text: item.text,
data: item.data,
mimeType: item.mimeType,
@@ -236,15 +236,17 @@ export class BrowserManager {
);
// Build args for chrome-devtools-mcp
const browserConfig = this.config.getBrowserAgentConfig();
const sessionMode = browserConfig.customConfig.sessionMode ?? 'isolated';
const mcpArgs = [
'-y',
`chrome-devtools-mcp@${CHROME_DEVTOOLS_MCP_VERSION}`,
'--isolated',
sessionMode === 'existing' ? '--existing' : '--isolated',
'--experimental-vision',
];
// Add optional settings from config
const browserConfig = this.config.getBrowserAgentConfig();
if (browserConfig.customConfig.headless) {
mcpArgs.push('--headless');
}
@@ -256,7 +258,7 @@ export class BrowserManager {
}
debugLogger.log(
`Launching chrome-devtools-mcp with args: ${mcpArgs.join(' ')}`,
`Launching chrome-devtools-mcp (${sessionMode} mode) with args: ${mcpArgs.join(' ')}`,
);
// Create stdio transport to npx chrome-devtools-mcp
@@ -81,10 +81,11 @@ describe('mcpToolWrapper', () => {
mockMessageBus,
);
expect(tools[0].description).toBe(
// Descriptions include augmented hints, so we check they contain the original
expect(tools[0].description).toContain(
'Take a snapshot of the page accessibility tree',
);
expect(tools[1].description).toBe('Click on an element by uid');
expect(tools[1].description).toContain('Click on an element by uid');
});
it('should return tools with proper FunctionDeclaration schema', async () => {
@@ -17,6 +17,7 @@
import type { FunctionDeclaration } from '@google/genai';
import type { Tool as McpTool } from '@modelcontextprotocol/sdk/types.js';
import type { ToolConfirmationOutcome } from '../../tools/tools.js';
import {
DeclarativeTool,
BaseToolInvocation,
@@ -24,6 +25,7 @@ import {
type ToolResult,
type ToolInvocation,
type ToolCallConfirmationDetails,
type PolicyUpdateOptions,
} from '../../tools/tools.js';
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
import type { BrowserManager, McpToolCallResult } from './browserManager.js';
@@ -49,16 +51,31 @@ class McpToolInvocation extends BaseToolInvocation<
return `Calling MCP tool: ${this.toolName}`;
}
/**
* TODO: Remove this override once subagent tool confirmation is implemented
* in the framework. Currently, subagent tools auto-approve by bypassing
* the MessageBus confirmation flow. This matches how codebase_investigator
* and other subagents work.
*/
override async shouldConfirmExecute(
protected override async getConfirmationDetails(
_abortSignal: AbortSignal,
): Promise<ToolCallConfirmationDetails | false> {
return false;
if (!this.messageBus) {
return false;
}
return {
type: 'mcp',
title: `Confirm MCP Tool: ${this.toolName}`,
serverName: 'browser-agent',
toolName: this.toolName,
toolDisplayName: this.toolName,
onConfirm: async (outcome: ToolConfirmationOutcome) => {
await this.publishPolicyUpdate(outcome);
},
};
}
/**
 * Supplies the options attached to policy updates published for this tool.
 * The same 'browser-agent' MCP name is returned for every outcome.
 */
protected override getPolicyUpdateOptions(
  _outcome: ToolConfirmationOutcome,
): PolicyUpdateOptions | undefined {
  const options: PolicyUpdateOptions = { mcpName: 'browser-agent' };
  return options;
}
async execute(signal: AbortSignal): Promise<ToolResult> {
@@ -79,17 +96,23 @@ class McpToolInvocation extends BaseToolInvocation<
.join('\n');
}
// Post-process to add contextual hints for common error patterns
const processedContent = postProcessToolResult(
this.toolName,
textContent,
);
if (result.isError) {
return {
llmContent: `Error: ${textContent}`,
returnDisplay: `Error: ${textContent}`,
llmContent: `Error: ${processedContent}`,
returnDisplay: `Error: ${processedContent}`,
error: { message: textContent },
};
}
return {
llmContent: textContent || 'Tool executed successfully.',
returnDisplay: textContent || 'Tool executed successfully.',
llmContent: processedContent || 'Tool executed successfully.',
returnDisplay: processedContent || 'Tool executed successfully.',
};
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error);
@@ -166,10 +189,15 @@ export async function createMcpDeclarativeTools(
return mcpTools.map((mcpTool) => {
const schema = convertMcpToolToFunctionDeclaration(mcpTool);
// Augment description with uid-context hints
const augmentedDescription = augmentToolDescription(
mcpTool.name,
mcpTool.description ?? '',
);
return new McpDeclarativeTool(
browserManager,
mcpTool.name,
mcpTool.description ?? '',
augmentedDescription,
schema.parametersJsonSchema,
messageBus,
);
@@ -193,3 +221,97 @@ function convertMcpToolToFunctionDeclaration(
},
};
}
/**
 * Augments an MCP tool description with a short usage hint.
 *
 * Lookup is done on the lowercased tool name in two steps:
 *   1. an exact match against the hint table, then
 *   2. a partial match (the tool name contains a table key), taking the first
 *      key in table order, e.g. "click" matches "click_element".
 *
 * The exact-match step and the dedicated `click_at` entry exist so that the
 * coordinate-based `click_at` tool is not given the uid-based hint meant for
 * `click`, which a pure substring match on "click" would do.
 *
 * @param toolName Name of the MCP tool.
 * @param description Original description reported for the tool.
 * @returns The description with the matching hint appended, or the
 *   description unchanged when no hint applies.
 */
function augmentToolDescription(toolName: string, description: string): string {
  // Order matters for partial matching: more specific keys come first.
  const hints: ReadonlyArray<readonly [string, string]> = [
    [
      'click_at',
      ' Click at pixel coordinates (x, y) relative to the viewport rather than by element uid.',
    ],
    [
      'click',
      ' Use the element uid from the accessibility tree snapshot (e.g., uid="87_4" for a button).',
    ],
    [
      'fill',
      ' Use the element uid from the accessibility tree snapshot for input/select elements.',
    ],
    [
      'hover',
      ' Use the element uid from the accessibility tree snapshot to hover over elements.',
    ],
    ['type', ' Type text into the currently focused element.'],
    [
      'scroll',
      ' Scroll the page in the specified direction. Use after take_snapshot to see more content.',
    ],
    [
      'take_snapshot',
      ' Returns the accessibility tree with uid values for each element. Call this first to see available elements.',
    ],
    [
      'navigate_page',
      ' Navigate to the specified URL. Call take_snapshot after to see the new page.',
    ],
    [
      'new_page',
      ' Opens a new page/tab with the specified URL. Call take_snapshot after to see the new page.',
    ],
    [
      'press_key',
      ' Press a keyboard key. Use for Enter, Tab, Escape, arrow keys, etc.',
    ],
  ];

  const normalizedName = toolName.toLowerCase();
  const match =
    hints.find(([key]) => key === normalizedName) ??
    hints.find(([key]) => normalizedName.includes(key));

  return match ? description + match[1] : description;
}
/**
 * Post-processes the text returned by an MCP tool before it is shown to the
 * agent.
 *
 * Steps, in order:
 *   1. For every tool except `take_snapshot`, drop everything from the first
 *      "## Latest page snapshot" heading onwards to avoid token bloat.
 *   2. `take_snapshot` output is returned as-is. It is page content, so it is
 *      not scanned for error keywords: a page that merely contains words like
 *      "stale" or "detached" must not trigger a recovery hint that tells the
 *      agent to call take_snapshot again.
 *   3. For click-like tools, append an overlay hint when the text mentions a
 *      blocked / non-interactable / missing element.
 *   4. Otherwise append a stale-reference hint when the text mentions a
 *      stale or detached element.
 *
 * @param toolName Name of the MCP tool that produced the result.
 * @param result Raw text content of the tool result.
 * @returns The processed text, with at most one hint appended.
 */
export function postProcessToolResult(
  toolName: string,
  result: string,
): string {
  const snapshotHeading = '## Latest page snapshot';

  // The accessibility tree is the whole point of take_snapshot, so it is
  // neither stripped nor inspected for error patterns.
  if (toolName === 'take_snapshot') {
    return result;
  }

  let processedResult = result;
  if (result.includes(snapshotHeading)) {
    const parts = result.split(snapshotHeading);
    processedResult = parts[0].trim();
    if (parts[1]) {
      debugLogger.log('Stripped embedded snapshot from tool response');
    }
  }

  // Lowercase once; every pattern below is matched case-insensitively.
  const normalized = processedResult.toLowerCase();

  // Detect overlay/interactable issues
  const overlayPatterns = [
    'not interactable',
    'obscured',
    'intercept',
    'blocked',
    'element is not visible',
    'element not found',
  ];
  const isOverlayIssue = overlayPatterns.some((pattern) =>
    normalized.includes(pattern),
  );
  if (isOverlayIssue && toolName.includes('click')) {
    return (
      processedResult +
      '\n\n⚠️ This action may have been blocked by an overlay, popup, or tooltip. ' +
      'Look for close/dismiss buttons (×, Close, "Got it", "Accept") in the accessibility tree and click them first.'
    );
  }

  // Detect stale element references
  if (normalized.includes('stale') || normalized.includes('detached')) {
    return (
      processedResult +
      '\n\n⚠️ The element reference is stale. Call take_snapshot to get fresh element uids.'
    );
  }

  return processedResult;
}
@@ -0,0 +1,89 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { createMcpDeclarativeTools } from './mcpToolWrapper.js';
import type { BrowserManager } from './browserManager.js';
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
import { MessageBusType } from '../../confirmation-bus/types.js';
import { ToolConfirmationOutcome } from '../../tools/tools.js';
describe('mcpToolWrapper Confirmation', () => {
  let mockBrowserManager: BrowserManager;
  let mockMessageBus: MessageBus;

  /**
   * Builds an invocation of the first (and only) discovered tool with empty
   * params, using the doubles prepared in beforeEach.
   */
  const buildFirstToolInvocation = async () => {
    const tools = await createMcpDeclarativeTools(
      mockBrowserManager,
      mockMessageBus,
    );
    return tools[0].build({});
  };

  beforeEach(() => {
    const discoveredTools = [
      { name: 'test_tool', description: 'desc', inputSchema: {} },
    ];
    mockBrowserManager = {
      getDiscoveredTools: vi.fn().mockResolvedValue(discoveredTools),
      callTool: vi.fn(),
    } as unknown as BrowserManager;

    // Only the bus methods touched by the wrapper are mocked, hence the cast.
    mockMessageBus = {
      publish: vi.fn().mockResolvedValue(undefined),
      subscribe: vi.fn(),
      unsubscribe: vi.fn(),
    } as unknown as MessageBus;
  });

  it('getConfirmationDetails returns specific MCP details', async () => {
    const invocation = await buildFirstToolInvocation();

    // getConfirmationDetails is protected, so it is reached through `any`.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const details = await (invocation as any).getConfirmationDetails(
      new AbortController().signal,
    );

    const expectedDetails = expect.objectContaining({
      type: 'mcp',
      serverName: 'browser-agent',
      toolName: 'test_tool',
    });
    expect(details).toEqual(expectedDetails);

    // Confirming with ProceedAlways must publish a policy update scoped to
    // the 'browser-agent' MCP name; the test expects it to be non-persistent.
    await details.onConfirm(ToolConfirmationOutcome.ProceedAlways);

    const expectedPolicyUpdate = expect.objectContaining({
      type: MessageBusType.UPDATE_POLICY,
      mcpName: 'browser-agent',
      persist: false,
    });
    expect(mockMessageBus.publish).toHaveBeenCalledWith(expectedPolicyUpdate);
  });

  it('getPolicyUpdateOptions returns correct options', async () => {
    const invocation = await buildFirstToolInvocation();

    // getPolicyUpdateOptions is protected, so it is reached through `any`.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const options = (invocation as any).getPolicyUpdateOptions(
      ToolConfirmationOutcome.ProceedAlways,
    );

    expect(options).toEqual({
      mcpName: 'browser-agent',
    });
  });
});
@@ -0,0 +1,34 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @fileoverview Model configuration for browser agent.
*
* Provides the default visual agent model and utilities for resolving
* the configured model.
*/
import type { Config } from '../../config/config.js';
import { debugLogger } from '../../utils/debugLogger.js';
/**
 * Default model for the visual agent (Computer Use capable).
 */
export const VISUAL_AGENT_MODEL = 'gemini-2.5-computer-use-preview-10-2025';

/**
 * Gets the visual agent model from config, falling back to the default.
 *
 * The configured value is trimmed, and a missing, empty, or whitespace-only
 * value is treated as "not configured" so that a blank setting never reaches
 * the API as a model id.
 *
 * @param config Runtime configuration
 * @returns The configured visual model, or VISUAL_AGENT_MODEL when none is
 *   usable
 */
export function getVisualAgentModel(config: Config): string {
  const browserConfig = config.getBrowserAgentConfig();
  const configuredModel = browserConfig.customConfig.visualModel?.trim();
  // `||` is intentional: an empty string is not a valid model id.
  const model = configuredModel || VISUAL_AGENT_MODEL;
  debugLogger.log(`Visual agent model: ${model}`);
  return model;
}
-7
View File
@@ -12,7 +12,6 @@ import { loadAgentsFromDirectory } from './agentLoader.js';
import { CodebaseInvestigatorAgent } from './codebase-investigator.js';
import { CliHelpAgent } from './cli-help-agent.js';
import { GeneralistAgent } from './generalist-agent.js';
import { IntrospectionAgent } from './introspection-agent.js';
import { BrowserAgentDefinition } from './browser/browserAgentDefinition.js';
import { A2AClientManager } from './a2a-client-manager.js';
import { ADCHandler } from './remote-invocation.js';
@@ -204,12 +203,6 @@ export class AgentRegistry {
this.registerLocalAgent(CliHelpAgent(this.config));
this.registerLocalAgent(GeneralistAgent(this.config));
// Register the introspection agent if it's explicitly enabled.
const introspectionSettings = this.config.getIntrospectionAgentSettings();
if (introspectionSettings.enabled) {
this.registerLocalAgent(IntrospectionAgent);
}
// Register the browser agent if enabled in settings.
// Tools are configured dynamically at invocation time via browserAgentFactory.
const browserConfig = this.config.getBrowserAgentConfig();