diff --git a/packages/core/src/agents/browser/browserAgentDefinition.ts b/packages/core/src/agents/browser/browserAgentDefinition.ts index 7c09ac9e16..0c91d81a79 100644 --- a/packages/core/src/agents/browser/browserAgentDefinition.ts +++ b/packages/core/src/agents/browser/browserAgentDefinition.ts @@ -55,7 +55,6 @@ PARALLEL TOOL CALLS - CRITICAL: - Do NOT make parallel calls for actions that change page state (click, fill, press_key, etc.) - Each action changes the DOM and invalidates UIDs from the current snapshot - Make state-changing actions ONE AT A TIME, then observe the results -- For typing text, prefer type_text tool instead of multiple press_key calls OVERLAY/POPUP HANDLING: Before interacting with page content, scan the accessibility tree for blocking overlays: @@ -72,6 +71,24 @@ When you need to identify elements by visual attributes not in the AX tree (e.g. 3. Use the returned coordinates with click_at(x, y) or other tools yourself 4. If the analysis is insufficient, call it again with a more specific instruction +COMPLEX WEB APPS (spreadsheets, rich editors, canvas apps): +Many web apps (Google Sheets/Docs, Notion, Figma, etc.) use custom rendering rather than standard HTML inputs. +- fill does NOT work on these apps. Instead, click the target element, then use type_text to enter the value. +- Navigate cells/fields using keyboard shortcuts (Tab, Enter, ArrowDown) — more reliable than clicking cell UIDs. +- For spreadsheets: click a cell → type_text("value") → press_key("Enter") to confirm and move to the next cell. +- Use the Name Box (cell reference input, usually showing "A1") to jump to specific cells. + +TERMINAL FAILURES — STOP IMMEDIATELY: +Some errors are unrecoverable and retrying will never help. When you see ANY of these, call complete_task immediately with success=false and include the specific remediation steps in your summary: +- "Could not connect to Chrome" — Include ALL of these instructions in your summary: + 1. Open Chrome (version 144+) + 2. Go to chrome://inspect/#remote-debugging and enable remote debugging + 3. Or change sessionMode to "persistent" in settings.json to let the agent launch its own browser +- "Browser closed" or "Target closed" or "Session closed" — The browser process has terminated. Tell the user to restart and try again. +- "net::ERR_" network errors on the SAME URL after 2 retries — the site is unreachable. Report the URL and error. +- Any error that appears IDENTICALLY 3+ times in a row — it will not resolve by retrying. +Do NOT keep retrying terminal errors. Report them with actionable remediation steps and exit immediately. + CRITICAL: When you have fully completed the user's task, you MUST call the complete_task tool with a summary of what you accomplished. Do NOT just return text - you must explicitly call complete_task to exit the loop.`; /** diff --git a/packages/core/src/agents/browser/mcpToolWrapper.ts b/packages/core/src/agents/browser/mcpToolWrapper.ts index ccd99559a4..10e18eb72c 100644 --- a/packages/core/src/agents/browser/mcpToolWrapper.ts +++ b/packages/core/src/agents/browser/mcpToolWrapper.ts @@ -116,6 +116,13 @@ class McpToolInvocation extends BaseToolInvocation< }; } catch (error) { const errorMsg = error instanceof Error ? error.message : String(error); + + // Chrome connection errors are fatal — re-throw to terminate the agent + // immediately instead of returning a result the LLM would retry. + if (errorMsg.includes('Could not connect to Chrome')) { + throw error; + } + debugLogger.error(`MCP tool ${this.toolName} failed: ${errorMsg}`); return { llmContent: `Error: ${errorMsg}`, @@ -126,6 +133,112 @@ class McpToolInvocation extends BaseToolInvocation< } } +/** + * Composite tool invocation that types a full string by calling press_key + * for each character internally, avoiding N model round-trips. + */ +class TypeTextInvocation extends BaseToolInvocation< + Record, + ToolResult +> { + constructor( + private readonly browserManager: BrowserManager, + private readonly text: string, + messageBus: MessageBus, + ) { + super({ text }, messageBus, 'type_text', 'type_text'); + } + + getDescription(): string { + return `type_text: "${this.text.substring(0, 50)}${this.text.length > 50 ? '...' : ''}"`; + } + + protected override async getConfirmationDetails( + _abortSignal: AbortSignal, + ): Promise { + if (!this.messageBus) { + return false; + } + + return { + type: 'mcp', + title: `Confirm Tool: type_text`, + serverName: 'browser-agent', + toolName: 'type_text', + toolDisplayName: 'type_text', + onConfirm: async (outcome: ToolConfirmationOutcome) => { + await this.publishPolicyUpdate(outcome); + }, + }; + } + + protected override getPolicyUpdateOptions( + _outcome: ToolConfirmationOutcome, + ): PolicyUpdateOptions | undefined { + return { + mcpName: 'browser-agent', + }; + } + + override async execute(): Promise { + try { + const chars = [...this.text]; // Handle Unicode correctly + let successCount = 0; + let lastError: string | undefined; + + for (const char of chars) { + // Map special characters to key names + const key = char === ' ' ? 'Space' : char; + const result: McpToolCallResult = await this.browserManager.callTool( + 'press_key', + { key }, + ); + + if (result.isError) { + const errorText = result.content + ?.filter( + (c: { type: string; text?: string }) => + c.type === 'text' && c.text, + ) + .map((c: { type: string; text?: string }) => c.text) + .join('\n'); + lastError = errorText || 'Unknown error'; + // Continue typing remaining characters on soft errors + debugLogger.warn( + `type_text: press_key("${key}") failed: ${lastError}`, + ); + } else { + successCount++; + } + } + + const summary = + successCount === chars.length + ? `Successfully typed ${chars.length} characters: "${this.text}"` + : `Typed ${successCount}/${chars.length} characters. Last error: ${lastError}`; + + return { + llmContent: summary, + returnDisplay: summary, + }; + } catch (error) { + const errorMsg = error instanceof Error ? error.message : String(error); + + // Chrome connection errors are fatal + if (errorMsg.includes('Could not connect to Chrome')) { + throw error; + } + + debugLogger.error(`type_text failed: ${errorMsg}`); + return { + llmContent: `Error: ${errorMsg}`, + returnDisplay: `Error: ${errorMsg}`, + error: { message: errorMsg }, + }; + } + } +} + /** * DeclarativeTool wrapper for an MCP tool. */ @@ -165,7 +278,56 @@ class McpDeclarativeTool extends DeclarativeTool< } /** - * Creates DeclarativeTool instances from dynamically discovered MCP tools. + * DeclarativeTool for the custom type_text composite tool. + */ +class TypeTextDeclarativeTool extends DeclarativeTool< + Record, + ToolResult +> { + constructor( + private readonly browserManager: BrowserManager, + messageBus: MessageBus, + ) { + super( + 'type_text', + 'type_text', + 'Types a full text string into the currently focused element by pressing each key in sequence. ' + + 'Much faster than calling press_key for each character individually. ' + + 'Use this to enter text into form fields, spreadsheet cells, or any focused input. ' + + 'The element must already be focused (e.g., after a click). ' + + 'Does NOT press Enter at the end — call press_key("Enter") separately if needed.', + Kind.Other, + { + type: 'object', + properties: { + text: { + type: 'string', + description: + 'The text to type. Each character will be pressed in sequence.', + }, + }, + required: ['text'], + }, + messageBus, + /* isOutputMarkdown */ true, + /* canUpdateOutput */ false, + ); + } + + build( + params: Record, + ): ToolInvocation, ToolResult> { + return new TypeTextInvocation( + this.browserManager, + String(params['text'] ?? ''), + this.messageBus, + ); + } +} + +/** + * Creates DeclarativeTool instances from dynamically discovered MCP tools, + * plus custom composite tools (like type_text). * * These tools are registered in the browser agent's isolated ToolRegistry, * NOT in the main agent's registry. @@ -179,7 +341,7 @@ class McpDeclarativeTool extends DeclarativeTool< export async function createMcpDeclarativeTools( browserManager: BrowserManager, messageBus: MessageBus, -): Promise { +): Promise> { // Get dynamically discovered tools from the MCP server const mcpTools = await browserManager.getDiscoveredTools(); @@ -187,21 +349,31 @@ export async function createMcpDeclarativeTools( `Creating ${mcpTools.length} declarative tools for browser agent`, ); - return mcpTools.map((mcpTool) => { - const schema = convertMcpToolToFunctionDeclaration(mcpTool); - // Augment description with uid-context hints - const augmentedDescription = augmentToolDescription( - mcpTool.name, - mcpTool.description ?? '', - ); - return new McpDeclarativeTool( - browserManager, - mcpTool.name, - augmentedDescription, - schema.parametersJsonSchema, - messageBus, - ); - }); + const tools: Array = + mcpTools.map((mcpTool) => { + const schema = convertMcpToolToFunctionDeclaration(mcpTool); + // Augment description with uid-context hints + const augmentedDescription = augmentToolDescription( + mcpTool.name, + mcpTool.description ?? '', + ); + return new McpDeclarativeTool( + browserManager, + mcpTool.name, + augmentedDescription, + schema.parametersJsonSchema, + messageBus, + ); + }); + + // Add custom composite tools + tools.push(new TypeTextDeclarativeTool(browserManager, messageBus)); + + debugLogger.log( + `Total tools registered: ${tools.length} (${mcpTools.length} MCP + 1 custom)`, + ); + + return tools; } /** @@ -223,31 +395,44 @@ function convertMcpToolToFunctionDeclaration( } /** - * Augments MCP tool descriptions with uid-context hints. - * Adds semantic guidance for tools that work with accessibility tree elements. + * Augments MCP tool descriptions with usage guidance. + * Adds semantic hints and usage rules directly in tool descriptions + * so the model makes correct tool choices without system prompt overhead. + * + * Actual chrome-devtools-mcp tools: + * Input: click, drag, fill, fill_form, handle_dialog, hover, press_key, upload_file + * Navigation: close_page, list_pages, navigate_page, new_page, select_page, wait_for + * Emulation: emulate, resize_page + * Performance: performance_analyze_insight, performance_start_trace, performance_stop_trace + * Network: get_network_request, list_network_requests + * Debugging: evaluate_script, get_console_message, list_console_messages, take_screenshot, take_snapshot + * Vision (--experimental-vision): click_at, analyze_screenshot */ function augmentToolDescription(toolName: string, description: string): string { - const uidHints: Record = { + // More-specific keys MUST come before shorter keys to prevent + // partial matching from short-circuiting (e.g., fill_form before fill). + const hints: Record = { + fill_form: + ' Fills multiple standard HTML form fields at once. Same limitations as fill — does not work on canvas/custom widgets.', + fill: ' Fills standard HTML form fields (,