Files
gemini-cli/packages/core/src/agents/browser/mcpToolWrapper.ts
T
Aditya Bijalwan 5caa192cfc feat: add pulsating blue border automation overlay to browser agent (#21173)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Gaurav <39389231+gsquared94@users.noreply.github.com>
2026-03-10 20:15:03 +00:00

547 lines
17 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @fileoverview Creates DeclarativeTool classes for MCP tools.
*
* These tools are ONLY registered in the browser agent's isolated ToolRegistry,
* NOT in the main agent's registry. They dispatch to the BrowserManager's
* isolated MCP client directly.
*
* Tool definitions are dynamically discovered from chrome-devtools-mcp
* at runtime, not hardcoded.
*/
import type { FunctionDeclaration } from '@google/genai';
import type { Tool as McpTool } from '@modelcontextprotocol/sdk/types.js';
import {
type ToolConfirmationOutcome,
DeclarativeTool,
BaseToolInvocation,
Kind,
type ToolResult,
type ToolInvocation,
type ToolCallConfirmationDetails,
type PolicyUpdateOptions,
} from '../../tools/tools.js';
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
import type { BrowserManager, McpToolCallResult } from './browserManager.js';
import { debugLogger } from '../../utils/debugLogger.js';
/**
* Tool invocation that dispatches to BrowserManager's isolated MCP client.
*/
class McpToolInvocation extends BaseToolInvocation<
Record<string, unknown>,
ToolResult
> {
constructor(
protected readonly browserManager: BrowserManager,
protected readonly toolName: string,
params: Record<string, unknown>,
messageBus: MessageBus,
) {
super(params, messageBus, toolName, toolName);
}
getDescription(): string {
return `Calling MCP tool: ${this.toolName}`;
}
protected override async getConfirmationDetails(
_abortSignal: AbortSignal,
): Promise<ToolCallConfirmationDetails | false> {
if (!this.messageBus) {
return false;
}
return {
type: 'mcp',
title: `Confirm MCP Tool: ${this.toolName}`,
serverName: 'browser-agent',
toolName: this.toolName,
toolDisplayName: this.toolName,
onConfirm: async (outcome: ToolConfirmationOutcome) => {
await this.publishPolicyUpdate(outcome);
},
};
}
override getPolicyUpdateOptions(
_outcome: ToolConfirmationOutcome,
): PolicyUpdateOptions | undefined {
return {
mcpName: 'browser-agent',
};
}
async execute(signal: AbortSignal): Promise<ToolResult> {
try {
const callToolPromise = this.browserManager.callTool(
this.toolName,
this.params,
signal,
);
const result: McpToolCallResult = await callToolPromise;
// Extract text content from MCP response
let textContent = '';
if (result.content && Array.isArray(result.content)) {
textContent = result.content
.filter((c) => c.type === 'text' && c.text)
.map((c) => c.text)
.join('\n');
}
// Post-process to add contextual hints for common error patterns
const processedContent = postProcessToolResult(
this.toolName,
textContent,
);
if (result.isError) {
return {
llmContent: `Error: ${processedContent}`,
returnDisplay: `Error: ${processedContent}`,
error: { message: textContent },
};
}
return {
llmContent: processedContent || 'Tool executed successfully.',
returnDisplay: processedContent || 'Tool executed successfully.',
};
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error);
// Chrome connection errors are fatal — re-throw to terminate the agent
// immediately instead of returning a result the LLM would retry.
if (errorMsg.includes('Could not connect to Chrome')) {
throw error;
}
debugLogger.error(`MCP tool ${this.toolName} failed: ${errorMsg}`);
return {
llmContent: `Error: ${errorMsg}`,
returnDisplay: `Error: ${errorMsg}`,
error: { message: errorMsg },
};
}
}
}
/**
* Composite tool invocation that types a full string by calling press_key
* for each character internally, avoiding N model round-trips.
*/
class TypeTextInvocation extends BaseToolInvocation<
Record<string, unknown>,
ToolResult
> {
constructor(
private readonly browserManager: BrowserManager,
private readonly text: string,
private readonly submitKey: string | undefined,
messageBus: MessageBus,
) {
super({ text, submitKey }, messageBus, 'type_text', 'type_text');
}
getDescription(): string {
const preview = `"${this.text.substring(0, 50)}${this.text.length > 50 ? '...' : ''}"`;
return this.submitKey
? `type_text: ${preview} + ${this.submitKey}`
: `type_text: ${preview}`;
}
protected override async getConfirmationDetails(
_abortSignal: AbortSignal,
): Promise<ToolCallConfirmationDetails | false> {
if (!this.messageBus) {
return false;
}
return {
type: 'mcp',
title: `Confirm Tool: type_text`,
serverName: 'browser-agent',
toolName: 'type_text',
toolDisplayName: 'type_text',
onConfirm: async (outcome: ToolConfirmationOutcome) => {
await this.publishPolicyUpdate(outcome);
},
};
}
override getPolicyUpdateOptions(
_outcome: ToolConfirmationOutcome,
): PolicyUpdateOptions | undefined {
return {
mcpName: 'browser-agent',
};
}
override async execute(signal: AbortSignal): Promise<ToolResult> {
try {
if (signal.aborted) {
return {
llmContent: 'Error: Operation cancelled before typing started.',
returnDisplay: 'Operation cancelled before typing started.',
error: { message: 'Operation cancelled' },
};
}
await this.typeCharByChar(signal);
// Optionally press a submit key (Enter, Tab, etc.) after typing
if (this.submitKey && !signal.aborted) {
const keyResult = await this.browserManager.callTool(
'press_key',
{ key: this.submitKey },
signal,
);
if (keyResult.isError) {
const errText = this.extractErrorText(keyResult);
debugLogger.warn(
`type_text: submitKey("${this.submitKey}") failed: ${errText}`,
);
}
}
const summary = this.submitKey
? `Successfully typed "${this.text}" and pressed ${this.submitKey}`
: `Successfully typed "${this.text}"`;
return {
llmContent: summary,
returnDisplay: summary,
};
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error);
// Chrome connection errors are fatal
if (errorMsg.includes('Could not connect to Chrome')) {
throw error;
}
debugLogger.error(`type_text failed: ${errorMsg}`);
return {
llmContent: `Error: ${errorMsg}`,
returnDisplay: `Error: ${errorMsg}`,
error: { message: errorMsg },
};
}
}
/** Types each character via individual press_key MCP calls. */
private async typeCharByChar(signal: AbortSignal): Promise<void> {
const chars = [...this.text]; // Handle Unicode correctly
for (const char of chars) {
if (signal.aborted) return;
// Map special characters to key names
const key = char === ' ' ? 'Space' : char;
const result = await this.browserManager.callTool(
'press_key',
{ key },
signal,
);
if (result.isError) {
debugLogger.warn(
`type_text: press_key("${key}") failed: ${this.extractErrorText(result)}`,
);
}
}
}
/** Extract error text from an MCP tool result. */
private extractErrorText(result: McpToolCallResult): string {
return (
result.content
?.filter(
(c: { type: string; text?: string }) => c.type === 'text' && c.text,
)
.map((c: { type: string; text?: string }) => c.text)
.join('\n') || 'Unknown error'
);
}
}
/**
* DeclarativeTool wrapper for an MCP tool.
*/
class McpDeclarativeTool extends DeclarativeTool<
Record<string, unknown>,
ToolResult
> {
constructor(
protected readonly browserManager: BrowserManager,
name: string,
description: string,
parameterSchema: unknown,
messageBus: MessageBus,
) {
super(
name,
name,
description,
Kind.Other,
parameterSchema,
messageBus,
/* isOutputMarkdown */ true,
/* canUpdateOutput */ false,
);
}
build(
params: Record<string, unknown>,
): ToolInvocation<Record<string, unknown>, ToolResult> {
return new McpToolInvocation(
this.browserManager,
this.name,
params,
this.messageBus,
);
}
}
/**
* DeclarativeTool for the custom type_text composite tool.
*/
class TypeTextDeclarativeTool extends DeclarativeTool<
Record<string, unknown>,
ToolResult
> {
constructor(
private readonly browserManager: BrowserManager,
messageBus: MessageBus,
) {
super(
'type_text',
'type_text',
'Types a full text string into the currently focused element. ' +
'Much faster than calling press_key for each character individually. ' +
'Use this to enter text into form fields, search boxes, spreadsheet cells, or any focused input. ' +
'The element must already be focused (e.g., after a click). ' +
'Use submitKey to press a key after typing (e.g., submitKey="Enter" to submit a form or confirm a value, submitKey="Tab" to move to the next field).',
Kind.Other,
{
type: 'object',
properties: {
text: {
type: 'string',
description: 'The text to type into the focused element.',
},
submitKey: {
type: 'string',
description:
'Optional key to press after typing (e.g., "Enter", "Tab", "Escape"). ' +
'Useful for submitting form fields or moving to the next cell in a spreadsheet.',
},
},
required: ['text'],
},
messageBus,
/* isOutputMarkdown */ true,
/* canUpdateOutput */ false,
);
}
build(
params: Record<string, unknown>,
): ToolInvocation<Record<string, unknown>, ToolResult> {
const submitKey =
// eslint-disable-next-line no-restricted-syntax
typeof params['submitKey'] === 'string' && params['submitKey']
? params['submitKey']
: undefined;
return new TypeTextInvocation(
this.browserManager,
String(params['text'] ?? ''),
submitKey,
this.messageBus,
);
}
}
/**
* Creates DeclarativeTool instances from dynamically discovered MCP tools,
* plus custom composite tools (like type_text).
*
* These tools are registered in the browser agent's isolated ToolRegistry,
* NOT in the main agent's registry.
*
* Tool definitions are fetched dynamically from the MCP server at runtime.
*
* @param browserManager The browser manager with isolated MCP client
* @param messageBus Message bus for tool invocations
* @returns Array of DeclarativeTools that dispatch to the isolated MCP client
*/
export async function createMcpDeclarativeTools(
browserManager: BrowserManager,
messageBus: MessageBus,
): Promise<Array<McpDeclarativeTool | TypeTextDeclarativeTool>> {
// Get dynamically discovered tools from the MCP server
const mcpTools = await browserManager.getDiscoveredTools();
debugLogger.log(
`Creating ${mcpTools.length} declarative tools for browser agent`,
);
const tools: Array<McpDeclarativeTool | TypeTextDeclarativeTool> =
mcpTools.map((mcpTool) => {
const schema = convertMcpToolToFunctionDeclaration(mcpTool);
// Augment description with uid-context hints
const augmentedDescription = augmentToolDescription(
mcpTool.name,
mcpTool.description ?? '',
);
return new McpDeclarativeTool(
browserManager,
mcpTool.name,
augmentedDescription,
schema.parametersJsonSchema,
messageBus,
);
});
// Add custom composite tools
tools.push(new TypeTextDeclarativeTool(browserManager, messageBus));
debugLogger.log(
`Total tools registered: ${tools.length} (${mcpTools.length} MCP + 1 custom)`,
);
return tools;
}
/**
* Converts MCP tool definition to Gemini FunctionDeclaration.
*/
function convertMcpToolToFunctionDeclaration(
mcpTool: McpTool,
): FunctionDeclaration {
// MCP tool inputSchema is a JSON Schema object
// We pass it directly as parametersJsonSchema
return {
name: mcpTool.name,
description: mcpTool.description ?? '',
parametersJsonSchema: mcpTool.inputSchema ?? {
type: 'object',
properties: {},
},
};
}
/**
* Augments MCP tool descriptions with usage guidance.
* Adds semantic hints and usage rules directly in tool descriptions
* so the model makes correct tool choices without system prompt overhead.
*
* Actual chrome-devtools-mcp tools:
* Input: click, drag, fill, fill_form, handle_dialog, hover, press_key, upload_file
* Navigation: close_page, list_pages, navigate_page, new_page, select_page, wait_for
* Emulation: emulate, resize_page
* Performance: performance_analyze_insight, performance_start_trace, performance_stop_trace
* Network: get_network_request, list_network_requests
* Debugging: evaluate_script, get_console_message, list_console_messages, take_screenshot, take_snapshot
* Vision (--experimental-vision): click_at, analyze_screenshot
*/
function augmentToolDescription(toolName: string, description: string): string {
// More-specific keys MUST come before shorter keys to prevent
// partial matching from short-circuiting (e.g., fill_form before fill).
const hints: Record<string, string> = {
fill_form:
' Fills multiple standard HTML form fields at once. Same limitations as fill — does not work on canvas/custom widgets.',
fill: ' Fills standard HTML form fields (<input>, <textarea>, <select>) by uid. Does NOT work on custom/canvas-based widgets (e.g., Google Sheets cells, Notion blocks). If fill times out or fails, click the element first then use press_key with individual characters instead.',
click_at:
' Clicks at exact pixel coordinates (x, y). Use when you have specific coordinates for visual elements.',
click:
' Use the element uid from the accessibility tree snapshot (e.g., uid="87_4"). UIDs are invalidated after this action — call take_snapshot before using another uid.',
hover:
' Use the element uid from the accessibility tree snapshot to hover over elements.',
take_snapshot:
' Returns the accessibility tree with uid values for each element. Call this FIRST to see available elements, and AFTER every state-changing action (click, fill, press_key) before using any uid.',
navigate_page:
' Navigate to the specified URL. Call take_snapshot after to see the new page.',
new_page:
' Opens a new page/tab with the specified URL. Call take_snapshot after to see the new page.',
press_key:
' Press a SINGLE keyboard key (e.g., "Enter", "Tab", "Escape", "ArrowDown", "a", "8"). ONLY accepts one key name — do NOT pass multi-character strings like "Hello" or "A1\\nEnter". To type text, use type_text instead of calling press_key for each character.',
};
// Check for partial matches — order matters! More-specific keys first.
for (const [key, hint] of Object.entries(hints)) {
if (toolName.toLowerCase().includes(key)) {
return description + hint;
}
}
return description;
}
/**
* Post-processes tool results to add contextual hints for common error patterns.
* This helps the agent recover from overlay blocking, element not found, etc.
* Also strips embedded snapshots to prevent token bloat.
*/
export function postProcessToolResult(
toolName: string,
result: string,
): string {
// Strip embedded snapshots to prevent token bloat (except for take_snapshot,
// whose accessibility tree the model needs for uid-based interactions).
let processedResult = result;
if (
toolName !== 'take_snapshot' &&
result.includes('## Latest page snapshot')
) {
const parts = result.split('## Latest page snapshot');
processedResult = parts[0].trim();
if (parts[1]) {
debugLogger.log('Stripped embedded snapshot from tool response');
}
}
// Detect overlay/interactable issues
const overlayPatterns = [
'not interactable',
'obscured',
'intercept',
'blocked',
'element is not visible',
'element not found',
];
const isOverlayIssue = overlayPatterns.some((pattern) =>
processedResult.toLowerCase().includes(pattern),
);
if (isOverlayIssue && (toolName === 'click' || toolName.includes('click'))) {
return (
processedResult +
'\n\n⚠️ This action may have been blocked by an overlay, popup, or tooltip. ' +
'Look for close/dismiss buttons (×, Close, "Got it", "Accept") in the accessibility tree and click them first.'
);
}
// Detect stale element references
if (
processedResult.toLowerCase().includes('stale') ||
processedResult.toLowerCase().includes('detached')
) {
return (
processedResult +
'\n\n⚠️ The element reference is stale. Call take_snapshot to get fresh element uids.'
);
}
return processedResult;
}