diff --git a/docs/core/subagents.md b/docs/core/subagents.md index 3619609e95..e84f46dd8c 100644 --- a/docs/core/subagents.md +++ b/docs/core/subagents.md @@ -80,6 +80,122 @@ Gemini CLI comes with the following built-in subagents: invoked by the user. - **Configuration:** Enabled by default. No specific configuration options. +### Browser Agent (experimental) + +- **Name:** `browser_agent` +- **Purpose:** Automate web browser tasks — navigating websites, filling forms, + clicking buttons, and extracting information from web pages — using the + accessibility tree. +- **When to use:** "Go to example.com and fill out the contact form," "Extract + the pricing table from this page," "Click the login button and enter my + credentials." + +> **Note:** This is a preview feature currently under active development. + +#### Prerequisites + +The browser agent requires: + +- **Chrome** version 144 or later (any recent stable release will work). +- **Node.js** with `npx` available (used to launch the + [`chrome-devtools-mcp`](https://www.npmjs.com/package/chrome-devtools-mcp) + server). + +#### Enabling the browser agent + +The browser agent is disabled by default. Enable it in your `settings.json`: + +```json +{ + "agents": { + "overrides": { + "browser_agent": { + "enabled": true + } + } + } +} +``` + +#### Session modes + +The `sessionMode` setting controls how Chrome is launched and managed. Set it +under `agents.browser`: + +```json +{ + "agents": { + "overrides": { + "browser_agent": { + "enabled": true + } + }, + "browser": { + "sessionMode": "persistent" + } + } +} +``` + +The available modes are: + +| Mode | Description | +| :----------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `persistent` | **(Default)** Launches Chrome with a persistent profile stored at `~/.gemini/cli-browser-profile/`. 
Cookies, history, and settings are preserved between sessions. | +| `isolated` | Launches Chrome with a temporary profile that is deleted after each session. Use this for clean-state automation. | +| `existing` | Attaches to an already-running Chrome instance. You must enable remote debugging first by navigating to `chrome://inspect/#remote-debugging` in Chrome. No new browser process is launched. | + +#### Configuration reference + +All browser-specific settings go under `agents.browser` in your `settings.json`. + +| Setting | Type | Default | Description | +| :------------ | :-------- | :------------- | :---------------------------------------------------------------------------------------------- | +| `sessionMode` | `string` | `"persistent"` | How Chrome is managed: `"persistent"`, `"isolated"`, or `"existing"`. | +| `headless` | `boolean` | `false` | Run Chrome in headless mode (no visible window). | +| `profilePath` | `string` | — | Custom path to a browser profile directory. | +| `visualModel` | `string` | — | Model override for the visual agent (for example, `"gemini-2.5-computer-use-preview-10-2025"`). | + +#### Security + +The browser agent enforces the following security restrictions: + +- **Blocked URL patterns:** `file://`, `javascript:`, `data:text/html`, + `chrome://extensions`, and `chrome://settings/passwords` are always blocked. +- **Sensitive action confirmation:** Actions like form filling, file uploads, + and form submissions require user confirmation through the standard policy + engine. + +#### Visual agent + +By default, the browser agent interacts with pages through the accessibility +tree using element `uid` values. 
For tasks that require visual identification +(for example, "click the yellow button" or "find the red error message"), you +can enable the visual agent by setting a `visualModel`: + +```json +{ + "agents": { + "overrides": { + "browser_agent": { + "enabled": true + } + }, + "browser": { + "visualModel": "gemini-2.5-computer-use-preview-10-2025" + } + } +} +``` + +When enabled, the agent gains access to the `analyze_screenshot` tool, which +captures a screenshot and sends it to the vision model for analysis. The model +returns coordinates and element descriptions that the browser agent uses with +the `click_at` tool for precise, coordinate-based interactions. + +> **Note:** The visual agent requires API key or Vertex AI authentication. It is +> not available when using Google Login. + ## Creating custom subagents You can create your own subagents to automate specific workflows or enforce diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 077d8e6f66..6bf28215c1 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -646,6 +646,27 @@ their corresponding top-level category object in your `settings.json` file. - **Default:** `{}` - **Requires restart:** Yes +- **`agents.browser.sessionMode`** (enum): + - **Description:** Session mode: 'persistent', 'isolated', or 'existing'. + - **Default:** `"persistent"` + - **Values:** `"persistent"`, `"isolated"`, `"existing"` + - **Requires restart:** Yes + +- **`agents.browser.headless`** (boolean): + - **Description:** Run browser in headless mode. + - **Default:** `false` + - **Requires restart:** Yes + +- **`agents.browser.profilePath`** (string): + - **Description:** Path to browser profile directory for session persistence. + - **Default:** `undefined` + - **Requires restart:** Yes + +- **`agents.browser.visualModel`** (string): + - **Description:** Model override for the visual agent. 
+ - **Default:** `undefined` + - **Requires restart:** Yes + #### `context` - **`context.fileName`** (string | string[]): diff --git a/docs/tools/index.md b/docs/tools/index.md index f496ad591a..6bdf298fea 100644 --- a/docs/tools/index.md +++ b/docs/tools/index.md @@ -52,6 +52,9 @@ These tools help the model manage its plan and interact with you. complex plans. - **[Agent Skills](../cli/skills.md) (`activate_skill`):** Loads specialized procedural expertise when needed. +- **[Browser agent](../core/subagents.md#browser-agent-experimental) + (`browser_agent`):** Automates web browser tasks through the accessibility + tree. - **Internal docs (`get_internal_docs`):** Accesses Gemini CLI's own documentation to help answer your questions. diff --git a/packages/cli/src/config/settingsSchema.ts b/packages/cli/src/config/settingsSchema.ts index 5c04cea9b5..ee60731b5c 100644 --- a/packages/cli/src/config/settingsSchema.ts +++ b/packages/cli/src/config/settingsSchema.ts @@ -974,6 +974,60 @@ const SETTINGS_SCHEMA = { ref: 'AgentOverride', }, }, + browser: { + type: 'object', + label: 'Browser Agent', + category: 'Advanced', + requiresRestart: true, + default: {}, + description: 'Settings specific to the browser agent.', + showInDialog: false, + properties: { + sessionMode: { + type: 'enum', + label: 'Browser Session Mode', + category: 'Advanced', + requiresRestart: true, + default: 'persistent', + description: + "Session mode: 'persistent', 'isolated', or 'existing'.", + showInDialog: false, + options: [ + { value: 'persistent', label: 'Persistent' }, + { value: 'isolated', label: 'Isolated' }, + { value: 'existing', label: 'Existing' }, + ], + }, + headless: { + type: 'boolean', + label: 'Browser Headless', + category: 'Advanced', + requiresRestart: true, + default: false, + description: 'Run browser in headless mode.', + showInDialog: false, + }, + profilePath: { + type: 'string', + label: 'Browser Profile Path', + category: 'Advanced', + requiresRestart: true, + default: 
/** Message bus stub whose confirmation requests always resolve as approved. */
const mockMessageBus = {
  waitForConfirmation: vi.fn().mockResolvedValue({ approved: true }),
} as unknown as MessageBus;

/**
 * Builds a BrowserManager stub whose `callTool` resolves with the given result.
 * Without an argument it resolves with a text item followed by a PNG image item.
 */
function createMockBrowserManager(
  callToolResult?: McpToolCallResult,
): BrowserManager {
  const toolResult = callToolResult ?? {
    content: [
      { type: 'text', text: 'Screenshot captured' },
      {
        type: 'image',
        data: 'base64encodeddata',
        mimeType: 'image/png',
      },
    ],
  };
  const callTool = vi.fn().mockResolvedValue(toolResult);
  return { callTool } as unknown as BrowserManager;
}

/**
 * Builds a Config stub exposing `getBrowserAgentConfig` (visual model
 * 'test-visual-model') and `getContentGenerator`.
 *
 * @param generateContentResult Value `generateContent` resolves with; a
 *   single-candidate text response is used when omitted.
 * @param generateContentError When provided, `generateContent` rejects with it.
 */
function createMockConfig(
  generateContentResult?: unknown,
  generateContentError?: Error,
): Config {
  const generateContent = vi.fn();
  if (generateContentError) {
    generateContent.mockRejectedValue(generateContentError);
  } else {
    generateContent.mockResolvedValue(
      generateContentResult ?? {
        candidates: [
          {
            content: {
              parts: [
                {
                  text: 'The blue submit button is at coordinates (250, 400).',
                },
              ],
            },
          },
        ],
      },
    );
  }

  const browserAgentConfig = {
    customConfig: { visualModel: 'test-visual-model' },
  };
  const contentGenerator = { generateContent };

  return {
    getBrowserAgentConfig: vi.fn().mockReturnValue(browserAgentConfig),
    getContentGenerator: vi.fn().mockReturnValue(contentGenerator),
  } as unknown as Config;
}

/** Builds the tool against the given stubs and executes one invocation. */
async function runAnalysis(
  instruction: string,
  browserManager: BrowserManager,
  config: Config,
) {
  const tool = createAnalyzeScreenshotTool(
    browserManager,
    config,
    mockMessageBus,
  );
  return tool.build({ instruction }).execute(new AbortController().signal);
}

describe('analyzeScreenshot', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  describe('createAnalyzeScreenshotTool', () => {
    it('creates a tool with the correct name and schema', () => {
      const tool = createAnalyzeScreenshotTool(
        createMockBrowserManager(),
        createMockConfig(),
        mockMessageBus,
      );

      expect(tool.name).toBe('analyze_screenshot');
    });
  });

  describe('AnalyzeScreenshotInvocation', () => {
    it('captures a screenshot and returns visual analysis', async () => {
      const browserManager = createMockBrowserManager();
      const config = createMockConfig();

      const result = await runAnalysis(
        'Find the blue submit button',
        browserManager,
        config,
      );

      // The screenshot must come from the MCP take_screenshot tool.
      expect(browserManager.callTool).toHaveBeenCalledWith(
        'take_screenshot',
        {},
      );

      // The visual model must receive the captured image inline.
      const { generateContent } = config.getContentGenerator();
      expect(generateContent).toHaveBeenCalledWith(
        expect.objectContaining({
          model: 'test-visual-model',
          contents: expect.arrayContaining([
            expect.objectContaining({
              role: 'user',
              parts: expect.arrayContaining([
                expect.objectContaining({
                  inlineData: {
                    mimeType: 'image/png',
                    data: 'base64encodeddata',
                  },
                }),
              ]),
            }),
          ]),
        }),
        'visual-analysis',
        'utility_tool',
      );

      // The model's text is surfaced to the caller without an error.
      expect(result.llmContent).toContain('Visual Analysis Result');
      expect(result.llmContent).toContain(
        'The blue submit button is at coordinates (250, 400).',
      );
      expect(result.error).toBeUndefined();
    });

    it('returns an error when screenshot capture fails (no image)', async () => {
      const browserManager = createMockBrowserManager({
        content: [{ type: 'text', text: 'No screenshot available' }],
      });
      const config = createMockConfig();

      const result = await runAnalysis('Find the button', browserManager, config);

      expect(result.error).toBeDefined();
      expect(result.llmContent).toContain('Failed to capture screenshot');
      // Without an image there is nothing to send to the visual model.
      const { generateContent } = config.getContentGenerator();
      expect(generateContent).not.toHaveBeenCalled();
    });

    it('returns an error when visual model returns empty response', async () => {
      const config = createMockConfig({
        candidates: [{ content: { parts: [] } }],
      });

      const result = await runAnalysis(
        'Check the layout',
        createMockBrowserManager(),
        config,
      );

      expect(result.error).toBeDefined();
      expect(result.llmContent).toContain('Visual model returned no analysis');
    });

    const unavailableCases = [
      {
        status: '404',
        message: 'Model not found: 404',
        instruction: 'Find the red error',
      },
      {
        status: '403',
        message: 'permission denied: 403',
        instruction: 'Identify the element',
      },
    ];

    for (const { status, message, instruction } of unavailableCases) {
      it(`returns a model-unavailability fallback for ${status} errors`, async () => {
        const config = createMockConfig(undefined, new Error(message));

        const result = await runAnalysis(
          instruction,
          createMockBrowserManager(),
          config,
        );

        expect(result.error).toBeDefined();
        expect(result.llmContent).toContain(
          'Visual analysis model is not available',
        );
      });
    }

    it('returns a generic error for non-model errors', async () => {
      const config = createMockConfig(undefined, new Error('Network timeout'));

      const result = await runAnalysis(
        'Find something',
        createMockBrowserManager(),
        config,
      );

      expect(result.error).toBeDefined();
      expect(result.llmContent).toContain('Visual analysis failed');
      expect(result.llmContent).toContain('Network timeout');
    });
  });
});
+ * + * The semantic browser agent uses this tool when it needs to identify + * elements by visual attributes not present in the accessibility tree + * (e.g., color, layout, precise coordinates). + * + * Unlike the semantic agent which works with the accessibility tree, + * this tool sends a screenshot to a computer-use model for visual analysis. + * It returns the model's analysis (coordinates, element descriptions) back + * to the browser agent, which retains full control of subsequent actions. + */ + +import { + DeclarativeTool, + BaseToolInvocation, + Kind, + type ToolResult, + type ToolInvocation, +} from '../../tools/tools.js'; +import type { MessageBus } from '../../confirmation-bus/message-bus.js'; +import type { BrowserManager } from './browserManager.js'; +import type { Config } from '../../config/config.js'; +import { getVisualAgentModel } from './modelAvailability.js'; +import { debugLogger } from '../../utils/debugLogger.js'; +import { LlmRole } from '../../telemetry/llmRole.js'; + +/** + * System prompt for the visual analysis model call. + */ +const VISUAL_SYSTEM_PROMPT = `You are a Visual Analysis Agent. You receive a screenshot of a browser page and an instruction. + +Your job is to ANALYZE the screenshot and provide precise information that a browser automation agent can act on. 
/**
 * Error text that is treated as "the visual model cannot be used".
 *
 * Matches the standalone status codes 403 / 404 and, case-insensitively, the
 * phrases "not found" / "NOT_FOUND" and anything containing "permission"
 * (for example "PERMISSION_DENIED"). Word boundaries keep longer digit runs
 * such as "4040ms" from being read as a 404.
 */
const MODEL_UNAVAILABLE_PATTERN = /\b(?:403|404)\b|not[ _]found|permission/i;

/**
 * Invocation for the analyze_screenshot tool.
 *
 * Captures a screenshot through the browser manager's `take_screenshot` tool
 * and makes a single generateContent call with it. The result is text only;
 * this class performs no page interaction itself.
 */
class AnalyzeScreenshotInvocation extends BaseToolInvocation<
  Record<string, unknown>,
  ToolResult
> {
  /**
   * @param browserManager Used to call the `take_screenshot` tool.
   * @param config Source of the visual model name and the content generator.
   * @param params Tool parameters; only `instruction` is read.
   * @param messageBus Passed through to the base invocation.
   */
  constructor(
    private readonly browserManager: BrowserManager,
    private readonly config: Config,
    params: Record<string, unknown>,
    messageBus: MessageBus,
  ) {
    super(params, messageBus, 'analyze_screenshot', 'Analyze Screenshot');
  }

  /** Returns a one-line description that quotes the requested instruction. */
  getDescription(): string {
    const instruction = String(this.params['instruction'] ?? '');
    return `Visual analysis: "${instruction}"`;
  }

  /**
   * Runs the screenshot capture and the visual model call.
   *
   * Failures are never thrown: a missing image, an empty model response, and
   * any caught error are all returned as a ToolResult whose `error` is set and
   * whose `llmContent` tells the caller to fall back to the accessibility tree.
   *
   * @param signal Forwarded to the model call as `abortSignal`.
   * @returns The analysis text on success, otherwise a fallback message.
   */
  async execute(signal: AbortSignal): Promise<ToolResult> {
    try {
      const instruction = String(this.params['instruction'] ?? '');

      debugLogger.log(`Visual analysis requested: ${instruction}`);

      // Capture screenshot via MCP tool
      const screenshotResult = await this.browserManager.callTool(
        'take_screenshot',
        {},
      );

      // Extract base64 image data from MCP response.
      // Search ALL content items for image type — MCP returns [text, image]
      // where content[0] is a text description and content[1] is the actual PNG.
      let screenshotBase64 = '';
      let mimeType = 'image/png';
      if (screenshotResult.content && Array.isArray(screenshotResult.content)) {
        for (const item of screenshotResult.content) {
          if (item.type === 'image' && item.data) {
            screenshotBase64 = item.data;
            mimeType = item.mimeType ?? 'image/png';
            break;
          }
        }
      }

      if (!screenshotBase64) {
        return {
          llmContent:
            'Failed to capture screenshot for visual analysis. Use accessibility tree elements instead.',
          returnDisplay: 'Screenshot capture failed',
          error: { message: 'Screenshot capture failed' },
        };
      }

      // Make a single generateContent call with the visual model
      const visualModel = getVisualAgentModel(this.config);
      const contentGenerator = this.config.getContentGenerator();

      const response = await contentGenerator.generateContent(
        {
          model: visualModel,
          config: {
            temperature: 0,
            topP: 0.95,
            systemInstruction: VISUAL_SYSTEM_PROMPT,
            abortSignal: signal,
          },
          contents: [
            {
              role: 'user',
              parts: [
                {
                  text: `Analyze this screenshot and respond to the following instruction:\n\n${instruction}`,
                },
                {
                  inlineData: {
                    mimeType,
                    data: screenshotBase64,
                  },
                },
              ],
            },
          ],
        },
        'visual-analysis',
        LlmRole.UTILITY_TOOL,
      );

      // Join the text parts of the first candidate; non-text parts are ignored.
      const responseText =
        response.candidates?.[0]?.content?.parts
          ?.filter((p) => p.text)
          .map((p) => p.text)
          .join('\n') ?? '';

      if (!responseText) {
        return {
          llmContent:
            'Visual model returned no analysis. Use accessibility tree elements instead.',
          returnDisplay: 'Visual analysis returned empty response',
          error: { message: 'Empty visual analysis response' },
        };
      }

      debugLogger.log(`Visual analysis complete: ${responseText}`);

      return {
        llmContent: `Visual Analysis Result:\n${responseText}`,
        returnDisplay: `Visual Analysis Result:\n${responseText}`,
      };
    } catch (error) {
      const errorMsg = error instanceof Error ? error.message : String(error);
      debugLogger.error(`Visual analysis failed: ${errorMsg}`);

      // Model-unavailability gets a fixed message; every other failure
      // reports the underlying error text.
      const isModelError = MODEL_UNAVAILABLE_PATTERN.test(errorMsg);

      const fallbackMsg = isModelError
        ? 'Visual analysis model is not available. Use accessibility tree elements (uids from take_snapshot) for all interactions instead.'
        : `Visual analysis failed: ${errorMsg}. Use accessibility tree elements instead.`;

      return {
        llmContent: fallbackMsg,
        returnDisplay: fallbackMsg,
        error: { message: errorMsg },
      };
    }
  }
}

/**
 * DeclarativeTool for screenshot-based visual analysis.
 *
 * Declares the `analyze_screenshot` schema (one required string parameter,
 * `instruction`) and builds an AnalyzeScreenshotInvocation per call.
 */
class AnalyzeScreenshotTool extends DeclarativeTool<
  Record<string, unknown>,
  ToolResult
> {
  /**
   * @param browserManager Handed to each invocation for screenshot capture.
   * @param config Handed to each invocation for model access.
   * @param messageBus Passed to the base tool and to each invocation.
   */
  constructor(
    private readonly browserManager: BrowserManager,
    private readonly config: Config,
    messageBus: MessageBus,
  ) {
    super(
      'analyze_screenshot',
      'analyze_screenshot',
      'Analyze the current page visually using a screenshot. Use when you need to identify elements by visual attributes (color, layout, position) not available in the accessibility tree, or when you need precise pixel coordinates for click_at. Returns visual analysis — you perform the actions yourself.',
      Kind.Other,
      {
        type: 'object',
        properties: {
          instruction: {
            type: 'string',
            description:
              'What to identify or analyze visually (e.g., "Find the coordinates of the blue submit button", "What is the layout of the navigation menu?").',
          },
        },
        required: ['instruction'],
      },
      messageBus,
      true, // isOutputMarkdown
      false, // canUpdateOutput
    );
  }

  /** Creates the invocation that will execute with the given parameters. */
  build(
    params: Record<string, unknown>,
  ): ToolInvocation<Record<string, unknown>, ToolResult> {
    return new AnalyzeScreenshotInvocation(
      this.browserManager,
      this.config,
      params,
      this.messageBus,
    );
  }
}
/**
 * Factory for the browser agent's analyze_screenshot tool.
 *
 * @param browserManager Browser manager the tool captures screenshots through.
 * @param config Configuration the tool reads its model and generator from.
 * @param messageBus Message bus handed to the tool.
 * @returns A new AnalyzeScreenshotTool wired to the three arguments.
 */
export function createAnalyzeScreenshotTool(
  browserManager: BrowserManager,
  config: Config,
  messageBus: MessageBus,
): AnalyzeScreenshotTool {
  const tool = new AnalyzeScreenshotTool(browserManager, config, messageBus);
  return tool;
}
/**
 * Prompt section describing the analyze_screenshot workflow. It is spliced
 * into the system prompt only when vision is enabled, and carries its own
 * leading and trailing newline.
 */
const VISUAL_SECTION = `
VISUAL IDENTIFICATION (analyze_screenshot):
When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates:
1. Call analyze_screenshot with a clear instruction describing what to find
2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions
3. Use the returned coordinates with click_at(x, y) or other tools yourself
4. If the analysis is insufficient, call it again with a more specific instruction
`;

/**
 * Builds the system prompt for the semantic browser agent.
 *
 * The prompt text is fixed except for one slot after the overlay-handling
 * rules, which holds VISUAL_SECTION when vision is enabled and is empty
 * otherwise.
 *
 * @param visionEnabled Whether visual tools (analyze_screenshot, click_at) are available.
 * @returns The complete system prompt text.
 */
export function buildBrowserSystemPrompt(visionEnabled: boolean): string {
  // Resolve the optional section up front so the template has a single slot.
  const visualSection = visionEnabled ? VISUAL_SECTION : '';

  return `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request.

IMPORTANT: You will receive an accessibility tree snapshot showing elements with uid values (e.g., uid=87_4 button "Login").
Use these uid values directly with your tools:
- click(uid="87_4") to click the Login button
- fill(uid="87_2", value="john") to fill a text field
- fill_form(elements=[{uid: "87_2", value: "john"}, {uid: "87_3", value: "pass"}]) to fill multiple fields at once

PARALLEL TOOL CALLS - CRITICAL:
- Do NOT make parallel calls for actions that change page state (click, fill, press_key, etc.)
- Each action changes the DOM and invalidates UIDs from the current snapshot
- Make state-changing actions ONE AT A TIME, then observe the results

OVERLAY/POPUP HANDLING:
Before interacting with page content, scan the accessibility tree for blocking overlays:
- Tooltips, popups, modals, cookie banners, newsletter prompts, promo dialogs
- These often have: close buttons (×, X, Close, Dismiss), "Got it", "Accept", "No thanks" buttons
- Common patterns: elements with role="dialog", role="tooltip", role="alertdialog", or aria-modal="true"
- If you see such elements, DISMISS THEM FIRST by clicking close/dismiss buttons before proceeding
- If a click seems to have no effect, check if an overlay appeared or is blocking the target
${visualSection}

COMPLEX WEB APPS (spreadsheets, rich editors, canvas apps):
Many web apps (Google Sheets/Docs, Notion, Figma, etc.) use custom rendering rather than standard HTML inputs.
- fill does NOT work on these apps. Instead, click the target element, then use type_text to enter the value.
- type_text supports a submitKey parameter to press a key after typing (e.g., submitKey="Enter" to submit, submitKey="Tab" to move to the next field). This is much faster than separate press_key calls.
- Navigate cells/fields using keyboard shortcuts (Tab, Enter, ArrowDown) — more reliable than clicking UIDs.
- Use the Name Box (cell reference input, usually showing "A1") to jump to specific cells.

TERMINAL FAILURES — STOP IMMEDIATELY:
Some errors are unrecoverable and retrying will never help. When you see ANY of these, call complete_task immediately with success=false and include the EXACT error message (including any remediation steps it contains) in your summary:
- "Could not connect to Chrome" or "Failed to connect to Chrome" or "Timed out connecting to Chrome" — Include the full error message with its remediation steps in your summary verbatim. Do NOT paraphrase or omit instructions.
- "Browser closed" or "Target closed" or "Session closed" — The browser process has terminated. Include the error and tell the user to try again.
- "net::ERR_" network errors on the SAME URL after 2 retries — the site is unreachable. Report the URL and error.
- Any error that appears IDENTICALLY 3+ times in a row — it will not resolve by retrying.
Do NOT keep retrying terminal errors. Report them with actionable remediation steps and exit immediately.

CRITICAL: When you have fully completed the user's task, you MUST call the complete_task tool with a summary of what you accomplished. Do NOT just return text - you must explicitly call complete_task to exit the loop.`;
}
/**
 * Browser Agent Definition Factory.
 *
 * Returns a `kind: 'local'` agent definition whose model follows the main
 * model: the preview Flash model when `config.getModel()` is a preview model,
 * otherwise the default Flash model. `toolConfig` is left undefined here and
 * is expected to be filled in by browserAgentFactory at invocation time.
 *
 * @param config Supplies the main model name used for model selection.
 * @param visionEnabled Whether the system prompt includes the visual
 *   identification section. Defaults to false.
 * @returns The browser agent definition, with output typed by
 *   BrowserTaskResultSchema.
 */
export const BrowserAgentDefinition = (
  config: Config,
  visionEnabled = false,
  // NOTE(review): type argument assumed from outputConfig.schema — confirm
  // against the LocalAgentDefinition declaration in ../types.js.
): LocalAgentDefinition<typeof BrowserTaskResultSchema> => {
  // Use Preview Flash model if the main model is any of the preview models.
  // If the main model is not a preview model, use the default flash model.
  const model = isPreviewModel(config.getModel())
    ? PREVIEW_GEMINI_FLASH_MODEL
    : DEFAULT_GEMINI_FLASH_MODEL;

  return {
    name: BROWSER_AGENT_NAME,
    kind: 'local',
    experimental: true,
    displayName: 'Browser Agent',
    description: `Specialized autonomous agent for end-to-end web browser automation and objective-driven problem solving. Delegate complete, high-level tasks to this agent — it independently plans, executes multi-step interactions, interprets dynamic page feedback (e.g., game states, form validation errors, search results), and iterates until the goal is achieved. It perceives page structure through the Accessibility Tree, handles overlays and popups, and supports complex web apps.`,

    inputConfig: {
      inputSchema: {
        type: 'object',
        properties: {
          task: {
            type: 'string',
            description: 'The task to perform in the browser.',
          },
        },
        required: ['task'],
      },
    },

    outputConfig: {
      outputName: 'result',
      description: 'The result of the browser task.',
      schema: BrowserTaskResultSchema,
    },

    // The structured result is handed back as pretty-printed JSON text.
    processOutput: (output) => JSON.stringify(output, null, 2),

    modelConfig: {
      // Dynamic model based on whether user is using preview models
      model,
      generateContentConfig: {
        temperature: 0.1,
        topP: 0.95,
      },
    },

    runConfig: {
      maxTimeMinutes: 10,
      maxTurns: 50,
    },

    // Tools are set dynamically by browserAgentFactory after MCP connection
    // This is undefined here and will be set at invocation time
    toolConfig: undefined,

    promptConfig: {
      // The escaped \${task} placeholder is left literal here. The explicit
      // <task> delimiters separate caller-supplied text from the instructions.
      // NOTE(review): delimiters assumed — confirm they are wanted in the query.
      query: `Your task is:
<task>
\${task}
</task>

First, use new_page to open the relevant URL. Then call take_snapshot to see the page and proceed with your task.`,
      systemPrompt: buildBrowserSystemPrompt(visionEnabled),
    },
  };
};
Config; + let mockMessageBus: MessageBus; + + beforeEach(() => { + vi.clearAllMocks(); + + // Reset mock implementations + mockBrowserManager.ensureConnection.mockResolvedValue(undefined); + mockBrowserManager.getDiscoveredTools.mockResolvedValue([ + // Semantic tools + { name: 'take_snapshot', description: 'Take snapshot' }, + { name: 'click', description: 'Click element' }, + { name: 'fill', description: 'Fill form field' }, + { name: 'navigate_page', description: 'Navigate to URL' }, + // Visual tools (from --experimental-vision) + { name: 'click_at', description: 'Click at coordinates' }, + ]); + mockBrowserManager.close.mockResolvedValue(undefined); + + mockConfig = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + headless: false, + }, + }, + }); + + mockMessageBus = { + publish: vi.fn().mockResolvedValue(undefined), + subscribe: vi.fn(), + unsubscribe: vi.fn(), + } as unknown as MessageBus; + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + describe('createBrowserAgentDefinition', () => { + it('should ensure browser connection', async () => { + await createBrowserAgentDefinition(mockConfig, mockMessageBus); + + expect(mockBrowserManager.ensureConnection).toHaveBeenCalled(); + }); + + it('should return agent definition with discovered tools', async () => { + const { definition } = await createBrowserAgentDefinition( + mockConfig, + mockMessageBus, + ); + + expect(definition.name).toBe(BROWSER_AGENT_NAME); + // 5 MCP tools + 1 type_text composite tool (no analyze_screenshot without visualModel) + expect(definition.toolConfig?.tools).toHaveLength(6); + }); + + it('should return browser manager for cleanup', async () => { + const { browserManager } = await createBrowserAgentDefinition( + mockConfig, + mockMessageBus, + ); + + expect(browserManager).toBeDefined(); + }); + + it('should call printOutput when provided', async () => { + const printOutput = vi.fn(); + + await 
createBrowserAgentDefinition( + mockConfig, + mockMessageBus, + printOutput, + ); + + expect(printOutput).toHaveBeenCalled(); + }); + + it('should create definition with correct structure', async () => { + const { definition } = await createBrowserAgentDefinition( + mockConfig, + mockMessageBus, + ); + + expect(definition.kind).toBe('local'); + expect(definition.inputConfig).toBeDefined(); + expect(definition.outputConfig).toBeDefined(); + expect(definition.promptConfig).toBeDefined(); + }); + + it('should exclude visual prompt section when visualModel is not configured', async () => { + const { definition } = await createBrowserAgentDefinition( + mockConfig, + mockMessageBus, + ); + + const systemPrompt = definition.promptConfig?.systemPrompt ?? ''; + expect(systemPrompt).not.toContain('analyze_screenshot'); + expect(systemPrompt).not.toContain('VISUAL IDENTIFICATION'); + }); + + it('should include visual prompt section when visualModel is configured', async () => { + const configWithVision = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + headless: false, + visualModel: 'gemini-2.5-flash-preview', + }, + }, + }); + + const { definition } = await createBrowserAgentDefinition( + configWithVision, + mockMessageBus, + ); + + const systemPrompt = definition.promptConfig?.systemPrompt ?? 
''; + expect(systemPrompt).toContain('analyze_screenshot'); + expect(systemPrompt).toContain('VISUAL IDENTIFICATION'); + }); + + it('should include analyze_screenshot tool when visualModel is configured', async () => { + const configWithVision = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + headless: false, + visualModel: 'gemini-2.5-flash-preview', + }, + }, + }); + + const { definition } = await createBrowserAgentDefinition( + configWithVision, + mockMessageBus, + ); + + // 5 MCP tools + 1 type_text + 1 analyze_screenshot + expect(definition.toolConfig?.tools).toHaveLength(7); + const toolNames = + definition.toolConfig?.tools + ?.filter( + (t): t is { name: string } => typeof t === 'object' && 'name' in t, + ) + .map((t) => t.name) ?? []; + expect(toolNames).toContain('analyze_screenshot'); + }); + }); + + describe('cleanupBrowserAgent', () => { + it('should call close on browser manager', async () => { + await cleanupBrowserAgent( + mockBrowserManager as unknown as BrowserManager, + ); + + expect(mockBrowserManager.close).toHaveBeenCalled(); + }); + + it('should handle errors during cleanup gracefully', async () => { + const errorManager = { + close: vi.fn().mockRejectedValue(new Error('Close failed')), + } as unknown as BrowserManager; + + // Should not throw + await expect(cleanupBrowserAgent(errorManager)).resolves.toBeUndefined(); + }); + }); +}); + +describe('buildBrowserSystemPrompt', () => { + it('should include visual section when vision is enabled', () => { + const prompt = buildBrowserSystemPrompt(true); + expect(prompt).toContain('VISUAL IDENTIFICATION'); + expect(prompt).toContain('analyze_screenshot'); + expect(prompt).toContain('click_at'); + }); + + it('should exclude visual section when vision is disabled', () => { + const prompt = buildBrowserSystemPrompt(false); + expect(prompt).not.toContain('VISUAL IDENTIFICATION'); + expect(prompt).not.toContain('analyze_screenshot'); + }); + + 
it('should always include core sections regardless of vision', () => { + for (const visionEnabled of [true, false]) { + const prompt = buildBrowserSystemPrompt(visionEnabled); + expect(prompt).toContain('PARALLEL TOOL CALLS'); + expect(prompt).toContain('OVERLAY/POPUP HANDLING'); + expect(prompt).toContain('COMPLEX WEB APPS'); + expect(prompt).toContain('TERMINAL FAILURES'); + expect(prompt).toContain('complete_task'); + } + }); +}); diff --git a/packages/core/src/agents/browser/browserAgentFactory.ts b/packages/core/src/agents/browser/browserAgentFactory.ts new file mode 100644 index 0000000000..a8a3b0f338 --- /dev/null +++ b/packages/core/src/agents/browser/browserAgentFactory.ts @@ -0,0 +1,161 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @fileoverview Factory for creating browser agent definitions with configured tools. + * + * This factory is called when the browser agent is invoked via delegate_to_agent. + * It creates a BrowserManager, connects the isolated MCP client, wraps tools, + * and returns a fully configured LocalAgentDefinition. + * + * IMPORTANT: The MCP tools are ONLY available to the browser agent's isolated + * registry. They are NOT registered in the main agent's ToolRegistry. 
+ */ + +import type { Config } from '../../config/config.js'; +import { AuthType } from '../../core/contentGenerator.js'; +import type { LocalAgentDefinition } from '../types.js'; +import type { MessageBus } from '../../confirmation-bus/message-bus.js'; +import type { AnyDeclarativeTool } from '../../tools/tools.js'; +import { BrowserManager } from './browserManager.js'; +import { + BrowserAgentDefinition, + type BrowserTaskResultSchema, +} from './browserAgentDefinition.js'; +import { createMcpDeclarativeTools } from './mcpToolWrapper.js'; +import { createAnalyzeScreenshotTool } from './analyzeScreenshot.js'; +import { debugLogger } from '../../utils/debugLogger.js'; + +/** + * Creates a browser agent definition with MCP tools configured. + * + * This is called when the browser agent is invoked via delegate_to_agent. + * The MCP client is created fresh and tools are wrapped for the agent's + * isolated registry - NOT registered with the main agent. + * + * @param config Runtime configuration + * @param messageBus Message bus for tool invocations + * @param printOutput Optional callback for progress messages + * @returns Fully configured LocalAgentDefinition with MCP tools + */ +export async function createBrowserAgentDefinition( + config: Config, + messageBus: MessageBus, + printOutput?: (msg: string) => void, +): Promise<{ + definition: LocalAgentDefinition; + browserManager: BrowserManager; +}> { + debugLogger.log( + 'Creating browser agent definition with isolated MCP tools...', + ); + + // Create and initialize browser manager with isolated MCP client + const browserManager = new BrowserManager(config); + await browserManager.ensureConnection(); + + if (printOutput) { + printOutput('Browser connected with isolated MCP client.'); + } + + // Create declarative tools from dynamically discovered MCP tools + // These tools dispatch to browserManager's isolated client + const mcpTools = await createMcpDeclarativeTools(browserManager, messageBus); + const 
availableToolNames = mcpTools.map((t) => t.name); + + // Validate required semantic tools are available + const requiredSemanticTools = [ + 'click', + 'fill', + 'navigate_page', + 'take_snapshot', + ]; + const missingSemanticTools = requiredSemanticTools.filter( + (t) => !availableToolNames.includes(t), + ); + if (missingSemanticTools.length > 0) { + debugLogger.warn( + `Semantic tools missing (${missingSemanticTools.join(', ')}). ` + + 'Some browser interactions may not work correctly.', + ); + } + + // Only click_at is strictly required — text input can use press_key or fill. + const requiredVisualTools = ['click_at']; + const missingVisualTools = requiredVisualTools.filter( + (t) => !availableToolNames.includes(t), + ); + + // Check whether vision can be enabled; returns undefined if all gates pass. + function getVisionDisabledReason(): string | undefined { + const browserConfig = config.getBrowserAgentConfig(); + if (!browserConfig.customConfig.visualModel) { + return 'No visualModel configured.'; + } + if (missingVisualTools.length > 0) { + return ( + `Visual tools missing (${missingVisualTools.join(', ')}). 
` + + `The installed chrome-devtools-mcp version may be too old.` + ); + } + const authType = config.getContentGeneratorConfig()?.authType; + const blockedAuthTypes = new Set([ + AuthType.LOGIN_WITH_GOOGLE, + AuthType.LEGACY_CLOUD_SHELL, + AuthType.COMPUTE_ADC, + ]); + if (authType && blockedAuthTypes.has(authType)) { + return 'Visual agent model not available for current auth type.'; + } + return undefined; + } + + const allTools: AnyDeclarativeTool[] = [...mcpTools]; + const visionDisabledReason = getVisionDisabledReason(); + + if (visionDisabledReason) { + debugLogger.log(`Vision disabled: ${visionDisabledReason}`); + } else { + allTools.push( + createAnalyzeScreenshotTool(browserManager, config, messageBus), + ); + } + + debugLogger.log( + `Created ${allTools.length} tools for browser agent: ` + + allTools.map((t) => t.name).join(', '), + ); + + // Create configured definition with tools + // BrowserAgentDefinition is a factory function - call it with config + const baseDefinition = BrowserAgentDefinition(config, !visionDisabledReason); + const definition: LocalAgentDefinition = { + ...baseDefinition, + toolConfig: { + tools: allTools, + }, + }; + + return { definition, browserManager }; +} + +/** + * Cleans up browser resources after agent execution. + * + * @param browserManager The browser manager to clean up + */ +export async function cleanupBrowserAgent( + browserManager: BrowserManager, +): Promise { + try { + await browserManager.close(); + debugLogger.log('Browser agent cleanup complete'); + } catch (error) { + debugLogger.error( + `Error during browser cleanup: ${error instanceof Error ? 
error.message : String(error)}`, + ); + } +} diff --git a/packages/core/src/agents/browser/browserAgentInvocation.test.ts b/packages/core/src/agents/browser/browserAgentInvocation.test.ts new file mode 100644 index 0000000000..b58a9c409e --- /dev/null +++ b/packages/core/src/agents/browser/browserAgentInvocation.test.ts @@ -0,0 +1,139 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { BrowserAgentInvocation } from './browserAgentInvocation.js'; +import { makeFakeConfig } from '../../test-utils/config.js'; +import type { Config } from '../../config/config.js'; +import type { MessageBus } from '../../confirmation-bus/message-bus.js'; +import type { AgentInputs } from '../types.js'; + +// Mock dependencies before imports +vi.mock('../../utils/debugLogger.js', () => ({ + debugLogger: { + log: vi.fn(), + error: vi.fn(), + }, +})); + +describe('BrowserAgentInvocation', () => { + let mockConfig: Config; + let mockMessageBus: MessageBus; + let mockParams: AgentInputs; + + beforeEach(() => { + vi.clearAllMocks(); + + mockConfig = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + headless: false, + sessionMode: 'isolated', + }, + }, + }); + + mockMessageBus = { + publish: vi.fn().mockResolvedValue(undefined), + subscribe: vi.fn(), + unsubscribe: vi.fn(), + } as unknown as MessageBus; + + mockParams = { + task: 'Navigate to example.com and click the button', + }; + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + describe('constructor', () => { + it('should create invocation with params', () => { + const invocation = new BrowserAgentInvocation( + mockConfig, + mockParams, + mockMessageBus, + ); + + expect(invocation.params).toEqual(mockParams); + }); + + it('should use browser_agent as default tool name', () => { + const invocation = new BrowserAgentInvocation( + mockConfig, + 
mockParams, + mockMessageBus, + ); + + expect(invocation['_toolName']).toBe('browser_agent'); + }); + + it('should use custom tool name if provided', () => { + const invocation = new BrowserAgentInvocation( + mockConfig, + mockParams, + mockMessageBus, + 'custom_name', + 'Custom Display Name', + ); + + expect(invocation['_toolName']).toBe('custom_name'); + expect(invocation['_toolDisplayName']).toBe('Custom Display Name'); + }); + }); + + describe('getDescription', () => { + it('should return description with input summary', () => { + const invocation = new BrowserAgentInvocation( + mockConfig, + mockParams, + mockMessageBus, + ); + + const description = invocation.getDescription(); + + expect(description).toContain('browser agent'); + expect(description).toContain('task'); + }); + + it('should truncate long input values', () => { + const longParams = { + task: 'A'.repeat(100), + }; + + const invocation = new BrowserAgentInvocation( + mockConfig, + longParams, + mockMessageBus, + ); + + const description = invocation.getDescription(); + + // Should be truncated to max length + expect(description.length).toBeLessThanOrEqual(200); + }); + }); + + describe('toolLocations', () => { + it('should return empty array by default', () => { + const invocation = new BrowserAgentInvocation( + mockConfig, + mockParams, + mockMessageBus, + ); + + const locations = invocation.toolLocations(); + + expect(locations).toEqual([]); + }); + }); +}); diff --git a/packages/core/src/agents/browser/browserAgentInvocation.ts b/packages/core/src/agents/browser/browserAgentInvocation.ts new file mode 100644 index 0000000000..0de9564c39 --- /dev/null +++ b/packages/core/src/agents/browser/browserAgentInvocation.ts @@ -0,0 +1,171 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @fileoverview Browser agent invocation that handles async tool setup. + * + * Unlike regular LocalSubagentInvocation, this invocation: + * 1. 
Uses browserAgentFactory to create definition with MCP tools + * 2. Cleans up browser resources after execution + * + * The MCP tools are only available in the browser agent's isolated registry. + */ + +import type { Config } from '../../config/config.js'; +import { LocalAgentExecutor } from '../local-executor.js'; +import type { AnsiOutput } from '../../utils/terminalSerializer.js'; +import { BaseToolInvocation, type ToolResult } from '../../tools/tools.js'; +import { ToolErrorType } from '../../tools/tool-error.js'; +import type { AgentInputs, SubagentActivityEvent } from '../types.js'; +import type { MessageBus } from '../../confirmation-bus/message-bus.js'; +import { + createBrowserAgentDefinition, + cleanupBrowserAgent, +} from './browserAgentFactory.js'; + +const INPUT_PREVIEW_MAX_LENGTH = 50; +const DESCRIPTION_MAX_LENGTH = 200; + +/** + * Browser agent invocation with async tool setup. + * + * This invocation handles the browser agent's special requirements: + * - MCP connection and tool wrapping at invocation time + * - Browser cleanup after execution + */ +export class BrowserAgentInvocation extends BaseToolInvocation< + AgentInputs, + ToolResult +> { + constructor( + private readonly config: Config, + params: AgentInputs, + messageBus: MessageBus, + _toolName?: string, + _toolDisplayName?: string, + ) { + // Note: BrowserAgentDefinition is a factory function, so we use hardcoded names + super( + params, + messageBus, + _toolName ?? 'browser_agent', + _toolDisplayName ?? 'Browser Agent', + ); + } + + /** + * Returns a concise, human-readable description of the invocation. + */ + getDescription(): string { + const inputSummary = Object.entries(this.params) + .map( + ([key, value]) => + `${key}: ${String(value).slice(0, INPUT_PREVIEW_MAX_LENGTH)}`, + ) + .join(', '); + + const description = `Running browser agent with inputs: { ${inputSummary} }`; + return description.slice(0, DESCRIPTION_MAX_LENGTH); + } + + /** + * Executes the browser agent. 
+ * + * This method: + * 1. Creates browser manager and MCP connection + * 2. Wraps MCP tools for the isolated registry + * 3. Runs the agent via LocalAgentExecutor + * 4. Cleans up browser resources + */ + async execute( + signal: AbortSignal, + updateOutput?: (output: string | AnsiOutput) => void, + ): Promise { + let browserManager; + + try { + if (updateOutput) { + updateOutput('🌐 Starting browser agent...\n'); + } + + // Create definition with MCP tools + const printOutput = updateOutput + ? (msg: string) => updateOutput(`🌐 ${msg}\n`) + : undefined; + + const result = await createBrowserAgentDefinition( + this.config, + this.messageBus, + printOutput, + ); + const { definition } = result; + browserManager = result.browserManager; + + if (updateOutput) { + updateOutput( + `🌐 Browser connected. Tools: ${definition.toolConfig?.tools.length ?? 0}\n`, + ); + } + + // Create activity callback for streaming output + const onActivity = (activity: SubagentActivityEvent): void => { + if (!updateOutput) return; + + if ( + activity.type === 'THOUGHT_CHUNK' && + typeof activity.data['text'] === 'string' + ) { + updateOutput(`🌐💭 ${activity.data['text']}`); + } + }; + + // Create and run executor with the configured definition + const executor = await LocalAgentExecutor.create( + definition, + this.config, + onActivity, + ); + + const output = await executor.run(this.params, signal); + + const resultContent = `Browser agent finished. +Termination Reason: ${output.terminate_reason} +Result: +${output.result}`; + + const displayContent = ` +Browser Agent Finished + +Termination Reason: ${output.terminate_reason} + +Result: +${output.result} +`; + + return { + llmContent: [{ text: resultContent }], + returnDisplay: displayContent, + }; + } catch (error) { + const errorMessage = + error instanceof Error ? error.message : String(error); + + return { + llmContent: `Browser agent failed. 
Error: ${errorMessage}`, + returnDisplay: `Browser Agent Failed\nError: ${errorMessage}`, + error: { + message: errorMessage, + type: ToolErrorType.EXECUTION_FAILED, + }, + }; + } finally { + // Always cleanup browser resources + if (browserManager) { + await cleanupBrowserAgent(browserManager); + } + } + } +} diff --git a/packages/core/src/agents/browser/browserManager.test.ts b/packages/core/src/agents/browser/browserManager.test.ts new file mode 100644 index 0000000000..6c25181afe --- /dev/null +++ b/packages/core/src/agents/browser/browserManager.test.ts @@ -0,0 +1,414 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { BrowserManager } from './browserManager.js'; +import { makeFakeConfig } from '../../test-utils/config.js'; +import type { Config } from '../../config/config.js'; + +// Mock the MCP SDK +vi.mock('@modelcontextprotocol/sdk/client/index.js', () => ({ + Client: vi.fn().mockImplementation(() => ({ + connect: vi.fn().mockResolvedValue(undefined), + close: vi.fn().mockResolvedValue(undefined), + listTools: vi.fn().mockResolvedValue({ + tools: [ + { name: 'take_snapshot', description: 'Take a snapshot' }, + { name: 'click', description: 'Click an element' }, + { name: 'click_at', description: 'Click at coordinates' }, + { name: 'take_screenshot', description: 'Take a screenshot' }, + ], + }), + callTool: vi.fn().mockResolvedValue({ + content: [{ type: 'text', text: 'Tool result' }], + }), + })), +})); + +vi.mock('@modelcontextprotocol/sdk/client/stdio.js', () => ({ + StdioClientTransport: vi.fn().mockImplementation(() => ({ + close: vi.fn().mockResolvedValue(undefined), + stderr: null, + })), +})); + +vi.mock('../../utils/debugLogger.js', () => ({ + debugLogger: { + log: vi.fn(), + error: vi.fn(), + }, +})); + +import { Client } from '@modelcontextprotocol/sdk/client/index.js'; +import { StdioClientTransport } from 
'@modelcontextprotocol/sdk/client/stdio.js'; + +describe('BrowserManager', () => { + let mockConfig: Config; + + beforeEach(() => { + vi.resetAllMocks(); + + // Setup mock config + mockConfig = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + headless: false, + }, + }, + }); + + // Re-setup Client mock after reset + vi.mocked(Client).mockImplementation( + () => + ({ + connect: vi.fn().mockResolvedValue(undefined), + close: vi.fn().mockResolvedValue(undefined), + listTools: vi.fn().mockResolvedValue({ + tools: [ + { name: 'take_snapshot', description: 'Take a snapshot' }, + { name: 'click', description: 'Click an element' }, + { name: 'click_at', description: 'Click at coordinates' }, + { name: 'take_screenshot', description: 'Take a screenshot' }, + ], + }), + callTool: vi.fn().mockResolvedValue({ + content: [{ type: 'text', text: 'Tool result' }], + }), + }) as unknown as InstanceType, + ); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + describe('getRawMcpClient', () => { + it('should ensure connection and return raw MCP client', async () => { + const manager = new BrowserManager(mockConfig); + const client = await manager.getRawMcpClient(); + + expect(client).toBeDefined(); + expect(Client).toHaveBeenCalled(); + }); + + it('should return cached client if already connected', async () => { + const manager = new BrowserManager(mockConfig); + + // First call + const client1 = await manager.getRawMcpClient(); + + // Second call should use cache + const client2 = await manager.getRawMcpClient(); + + expect(client1).toBe(client2); + // Client constructor should only be called once + expect(Client).toHaveBeenCalledTimes(1); + }); + }); + + describe('getDiscoveredTools', () => { + it('should return tools discovered from MCP server including visual tools', async () => { + const manager = new BrowserManager(mockConfig); + const tools = await manager.getDiscoveredTools(); + + 
expect(tools).toHaveLength(4); + expect(tools.map((t) => t.name)).toContain('take_snapshot'); + expect(tools.map((t) => t.name)).toContain('click'); + expect(tools.map((t) => t.name)).toContain('click_at'); + expect(tools.map((t) => t.name)).toContain('take_screenshot'); + }); + }); + + describe('callTool', () => { + it('should call tool on MCP client and return result', async () => { + const manager = new BrowserManager(mockConfig); + const result = await manager.callTool('take_snapshot', { verbose: true }); + + expect(result).toEqual({ + content: [{ type: 'text', text: 'Tool result' }], + isError: false, + }); + }); + }); + + describe('MCP connection', () => { + it('should spawn npx chrome-devtools-mcp with --experimental-vision (persistent mode by default)', async () => { + const manager = new BrowserManager(mockConfig); + await manager.ensureConnection(); + + // Verify StdioClientTransport was created with correct args + expect(StdioClientTransport).toHaveBeenCalledWith( + expect.objectContaining({ + command: 'npx', + args: expect.arrayContaining([ + '-y', + expect.stringMatching(/chrome-devtools-mcp@/), + '--experimental-vision', + ]), + }), + ); + // Persistent mode should NOT include --isolated or --autoConnect + const args = vi.mocked(StdioClientTransport).mock.calls[0]?.[0] + ?.args as string[]; + expect(args).not.toContain('--isolated'); + expect(args).not.toContain('--autoConnect'); + // Persistent mode should set the default --userDataDir under ~/.gemini + expect(args).toContain('--userDataDir'); + const userDataDirIndex = args.indexOf('--userDataDir'); + expect(args[userDataDirIndex + 1]).toMatch(/cli-browser-profile$/); + }); + + it('should pass headless flag when configured', async () => { + const headlessConfig = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + headless: true, + }, + }, + }); + + const manager = new BrowserManager(headlessConfig); + await manager.ensureConnection(); + + 
expect(StdioClientTransport).toHaveBeenCalledWith( + expect.objectContaining({ + command: 'npx', + args: expect.arrayContaining(['--headless']), + }), + ); + }); + + it('should pass profilePath as --userDataDir when configured', async () => { + const profileConfig = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + profilePath: '/path/to/profile', + }, + }, + }); + + const manager = new BrowserManager(profileConfig); + await manager.ensureConnection(); + + expect(StdioClientTransport).toHaveBeenCalledWith( + expect.objectContaining({ + command: 'npx', + args: expect.arrayContaining(['--userDataDir', '/path/to/profile']), + }), + ); + }); + + it('should pass --isolated when sessionMode is isolated', async () => { + const isolatedConfig = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + sessionMode: 'isolated', + }, + }, + }); + + const manager = new BrowserManager(isolatedConfig); + await manager.ensureConnection(); + + const args = vi.mocked(StdioClientTransport).mock.calls[0]?.[0] + ?.args as string[]; + expect(args).toContain('--isolated'); + expect(args).not.toContain('--autoConnect'); + }); + + it('should pass --autoConnect when sessionMode is existing', async () => { + const existingConfig = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + sessionMode: 'existing', + }, + }, + }); + + const manager = new BrowserManager(existingConfig); + await manager.ensureConnection(); + + const args = vi.mocked(StdioClientTransport).mock.calls[0]?.[0] + ?.args as string[]; + expect(args).toContain('--autoConnect'); + expect(args).not.toContain('--isolated'); + }); + + it('should throw actionable error when existing mode connection fails', async () => { + // Make the Client mock's connect method reject + vi.mocked(Client).mockImplementation( + () => + ({ + connect: vi.fn().mockRejectedValue(new Error('Connection 
refused')), + close: vi.fn().mockResolvedValue(undefined), + listTools: vi.fn(), + callTool: vi.fn(), + }) as unknown as InstanceType, + ); + + const existingConfig = makeFakeConfig({ + agents: { + overrides: { + browser_agent: { + enabled: true, + }, + }, + browser: { + sessionMode: 'existing', + }, + }, + }); + + const manager = new BrowserManager(existingConfig); + + await expect(manager.ensureConnection()).rejects.toThrow( + /Failed to connect to existing Chrome instance/, + ); + // Create a fresh manager to verify the error message includes remediation steps + const manager2 = new BrowserManager(existingConfig); + await expect(manager2.ensureConnection()).rejects.toThrow( + /chrome:\/\/inspect\/#remote-debugging/, + ); + }); + + it('should throw profile-lock remediation when persistent mode hits "already running"', async () => { + vi.mocked(Client).mockImplementation( + () => + ({ + connect: vi + .fn() + .mockRejectedValue( + new Error( + 'Could not connect to Chrome. The browser is already running for the current profile.', + ), + ), + close: vi.fn().mockResolvedValue(undefined), + listTools: vi.fn(), + callTool: vi.fn(), + }) as unknown as InstanceType, + ); + + // Default config = persistent mode + const manager = new BrowserManager(mockConfig); + + await expect(manager.ensureConnection()).rejects.toThrow( + /Close all Chrome windows using this profile/, + ); + const manager2 = new BrowserManager(mockConfig); + await expect(manager2.ensureConnection()).rejects.toThrow( + /Set sessionMode to "isolated"/, + ); + }); + + it('should throw timeout-specific remediation for persistent mode', async () => { + vi.mocked(Client).mockImplementation( + () => + ({ + connect: vi + .fn() + .mockRejectedValue( + new Error('Timed out connecting to chrome-devtools-mcp'), + ), + close: vi.fn().mockResolvedValue(undefined), + listTools: vi.fn(), + callTool: vi.fn(), + }) as unknown as InstanceType, + ); + + const manager = new BrowserManager(mockConfig); + + await 
expect(manager.ensureConnection()).rejects.toThrow( + /Chrome is not installed/, + ); + }); + + it('should include sessionMode in generic fallback error', async () => { + vi.mocked(Client).mockImplementation( + () => + ({ + connect: vi + .fn() + .mockRejectedValue(new Error('Some unexpected error')), + close: vi.fn().mockResolvedValue(undefined), + listTools: vi.fn(), + callTool: vi.fn(), + }) as unknown as InstanceType, + ); + + const manager = new BrowserManager(mockConfig); + + await expect(manager.ensureConnection()).rejects.toThrow( + /sessionMode: persistent/, + ); + }); + }); + + describe('MCP isolation', () => { + it('should use raw MCP SDK Client, not McpClient wrapper', async () => { + const manager = new BrowserManager(mockConfig); + await manager.ensureConnection(); + + // Verify we're using the raw Client from MCP SDK + expect(Client).toHaveBeenCalledWith( + expect.objectContaining({ + name: 'gemini-cli-browser-agent', + }), + expect.any(Object), + ); + }); + + it('should not use McpClientManager from config', async () => { + // Spy on config method to verify isolation + const getMcpClientManagerSpy = vi.spyOn( + mockConfig, + 'getMcpClientManager', + ); + + const manager = new BrowserManager(mockConfig); + await manager.ensureConnection(); + + // Config's getMcpClientManager should NOT be called + // This ensures isolation from main registry + expect(getMcpClientManagerSpy).not.toHaveBeenCalled(); + }); + }); + + describe('close', () => { + it('should close MCP connections', async () => { + const manager = new BrowserManager(mockConfig); + const client = await manager.getRawMcpClient(); + + await manager.close(); + + expect(client.close).toHaveBeenCalled(); + }); + }); +}); diff --git a/packages/core/src/agents/browser/browserManager.ts b/packages/core/src/agents/browser/browserManager.ts new file mode 100644 index 0000000000..205eb11a1f --- /dev/null +++ b/packages/core/src/agents/browser/browserManager.ts @@ -0,0 +1,436 @@ +/** + * @license + * 
Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @fileoverview Manages browser lifecycle for the Browser Agent. + * + * Handles: + * - Browser management via chrome-devtools-mcp (persistent, isolated, or existing session mode) + * - CDP connection via raw MCP SDK Client (NOT registered in main registry) + * - Visual tools via --experimental-vision flag + * + * IMPORTANT: The MCP client here is ISOLATED from the main agent's tool registry. + * Tools discovered from chrome-devtools-mcp are NOT registered in the main registry. + * They are wrapped as DeclarativeTools and passed directly to the browser agent. + */ + +import { Client } from '@modelcontextprotocol/sdk/client/index.js'; +import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js'; +import type { Tool as McpTool } from '@modelcontextprotocol/sdk/types.js'; +import { debugLogger } from '../../utils/debugLogger.js'; +import type { Config } from '../../config/config.js'; +import { Storage } from '../../config/storage.js'; +import * as path from 'node:path'; + +// Pin chrome-devtools-mcp version for reproducibility. +const CHROME_DEVTOOLS_MCP_VERSION = '0.17.1'; + +// Default browser profile directory name within ~/.gemini/ +const BROWSER_PROFILE_DIR = 'cli-browser-profile'; + +// Default timeout for MCP operations +const MCP_TIMEOUT_MS = 60_000; + +/** + * Content item from an MCP tool call response. + * Can be text or image (for take_screenshot). + */ +export interface McpContentItem { + type: 'text' | 'image'; + text?: string; + /** Base64-encoded image data (for type='image') */ + data?: string; + /** MIME type of the image (e.g., 'image/png') */ + mimeType?: string; +} + +/** + * Result from an MCP tool call. + */ +export interface McpToolCallResult { + content?: McpContentItem[]; + isError?: boolean; +} + +/** + * Manages browser lifecycle and ISOLATED MCP client for the Browser Agent. + * + * The browser is launched (or attached to) by chrome-devtools-mcp according to the configured sessionMode (persistent by default).
/**
 * Manages the browser lifecycle and the ISOLATED MCP client for the Browser
 * Agent.
 *
 * Chrome itself is owned by chrome-devtools-mcp, which this class spawns via
 * `npx` and talks to over a stdio transport. How Chrome is launched depends on
 * the configured sessionMode (see {@link BrowserManager.connectMcp}). Visual
 * tools (click_at, etc.) are enabled via the --experimental-vision flag.
 *
 * Key isolation property: the MCP client here does NOT register tools in the
 * main ToolRegistry. Tools are kept local to the browser agent.
 */
export class BrowserManager {
  // Raw MCP SDK Client - NOT the wrapper McpClient
  private rawMcpClient: Client | undefined;
  private mcpTransport: StdioClientTransport | undefined;
  private discoveredTools: McpTool[] = [];

  constructor(private config: Config) {}

  /**
   * Gets the raw MCP SDK Client for direct tool calls, connecting first if
   * no client exists yet. This client is ISOLATED from the main tool registry.
   *
   * @returns The MCP SDK client held by this manager.
   * @throws Error if the connection attempt fails or leaves no client behind.
   */
  async getRawMcpClient(): Promise<Client> {
    if (this.rawMcpClient) {
      return this.rawMcpClient;
    }
    await this.ensureConnection();
    if (!this.rawMcpClient) {
      throw new Error('Failed to initialize chrome-devtools MCP client');
    }
    return this.rawMcpClient;
  }

  /**
   * Gets the tool definitions discovered from the MCP server.
   * These are fetched from chrome-devtools-mcp when the connection is made.
   *
   * @returns The tools reported by the server's listTools call.
   */
  async getDiscoveredTools(): Promise<McpTool[]> {
    await this.ensureConnection();
    return this.discoveredTools;
  }

  /**
   * Calls a tool on the MCP server.
   *
   * @param toolName The name of the tool to call
   * @param args Arguments to pass to the tool
   * @param signal Optional AbortSignal to cancel the call
   * @returns The result from the MCP server
   * @throws The signal's reason (or a generic Error) when cancelled, and any
   *   error raised while connecting or by the MCP client call itself.
   */
  async callTool(
    toolName: string,
    args: Record<string, unknown>,
    signal?: AbortSignal,
  ): Promise<McpToolCallResult> {
    this.throwIfAborted(signal);

    const client = await this.getRawMcpClient();

    // Obtaining the client may have involved a full (slow) connection attempt.
    // An 'abort' listener attached to an already-aborted signal never fires,
    // so re-check here instead of issuing a call nobody is waiting for.
    this.throwIfAborted(signal);

    const callPromise = client.callTool(
      { name: toolName, arguments: args },
      undefined,
      { timeout: MCP_TIMEOUT_MS },
    );

    // If no signal, just await directly
    if (!signal) {
      return this.toResult(await callPromise);
    }

    // Race the call against the abort signal
    let onAbort: (() => void) | undefined;
    try {
      const result = await Promise.race([
        callPromise,
        new Promise<never>((_resolve, reject) => {
          onAbort = () =>
            reject(signal.reason ?? new Error('Operation cancelled'));
          signal.addEventListener('abort', onAbort, { once: true });
        }),
      ]);
      return this.toResult(result);
    } finally {
      // Always detach the listener so a long-lived signal does not retain it.
      if (onAbort) {
        signal.removeEventListener('abort', onAbort);
      }
    }
  }

  /** Throws the signal's abort reason if the signal is already aborted. */
  private throwIfAborted(signal: AbortSignal | undefined): void {
    if (signal?.aborted) {
      throw signal.reason ?? new Error('Operation cancelled');
    }
  }

  /**
   * Safely maps a raw MCP SDK callTool response to our typed McpToolCallResult
   * without using unsafe type assertions.
   *
   * Any content item whose type is not 'image' is reported as 'text';
   * a non-array `content` becomes undefined; `isError` is true only when the
   * raw value is exactly `true`.
   */
  private toResult(
    raw: Awaited<ReturnType<Client['callTool']>>,
  ): McpToolCallResult {
    return {
      content: Array.isArray(raw.content)
        ? raw.content.map(
            (item: {
              type?: string;
              text?: string;
              data?: string;
              mimeType?: string;
            }): McpContentItem => ({
              type: item.type === 'image' ? 'image' : 'text',
              text: item.text,
              data: item.data,
              mimeType: item.mimeType,
            }),
          )
        : undefined,
      isError: raw.isError === true,
    };
  }

  /**
   * Ensures browser and MCP client are connected.
   *
   * NOTE(review): connectMcp assigns rawMcpClient before the connection has
   * completed, so a second caller arriving while a connection is in flight
   * returns immediately here — confirm callers never overlap.
   */
  async ensureConnection(): Promise<void> {
    if (this.rawMcpClient) {
      return;
    }
    await this.connectMcp();
  }

  /**
   * Closes browser and cleans up connections.
   * The browser process is managed by chrome-devtools-mcp, so closing
   * the transport will terminate the browser.
   *
   * Errors raised while closing are logged and swallowed so that cleanup
   * always runs to completion.
   */
  async close(): Promise<void> {
    // Close MCP client first
    if (this.rawMcpClient) {
      try {
        await this.rawMcpClient.close();
      } catch (error) {
        debugLogger.error(
          `Error closing MCP client: ${error instanceof Error ? error.message : String(error)}`,
        );
      }
      this.rawMcpClient = undefined;
    }

    // Close transport (this terminates the npx process and browser)
    if (this.mcpTransport) {
      try {
        await this.mcpTransport.close();
      } catch (error) {
        debugLogger.error(
          `Error closing MCP transport: ${error instanceof Error ? error.message : String(error)}`,
        );
      }
      this.mcpTransport = undefined;
    }

    this.discoveredTools = [];
  }

  /**
   * Connects to chrome-devtools-mcp which manages the browser process.
   *
   * Spawns `npx chrome-devtools-mcp` with --experimental-vision (visual tools
   * such as click_at) plus flags derived from the browser agent config:
   * --isolated / --autoConnect for the session mode, --headless, and
   * --userDataDir for the profile.
   *
   * IMPORTANT: This does NOT use McpClientManager and does NOT register
   * tools in the main ToolRegistry. The connection is isolated to this
   * BrowserManager instance.
   *
   * @throws Error built by createConnectionError when connecting or tool
   *   discovery fails or times out; close() is called before throwing.
   */
  private async connectMcp(): Promise<void> {
    debugLogger.log('Connecting isolated MCP client to chrome-devtools-mcp...');

    // Create raw MCP SDK Client (not the wrapper McpClient)
    this.rawMcpClient = new Client(
      {
        name: 'gemini-cli-browser-agent',
        version: '1.0.0',
      },
      {
        capabilities: {},
      },
    );

    // Build args for chrome-devtools-mcp
    const browserConfig = this.config.getBrowserAgentConfig();
    const sessionMode = browserConfig.customConfig.sessionMode ?? 'persistent';

    const mcpArgs = [
      '-y',
      `chrome-devtools-mcp@${CHROME_DEVTOOLS_MCP_VERSION}`,
      '--experimental-vision',
    ];

    // Session mode determines how the browser is managed:
    // - "isolated": Temp profile, cleaned up after session (--isolated)
    // - "persistent": Persistent profile at ~/.gemini/cli-browser-profile/ (default)
    // - "existing": Connect to already-running Chrome (--autoConnect, requires
    //   remote debugging enabled at chrome://inspect/#remote-debugging)
    if (sessionMode === 'isolated') {
      mcpArgs.push('--isolated');
    } else if (sessionMode === 'existing') {
      mcpArgs.push('--autoConnect');
    }

    // Add optional settings from config
    if (browserConfig.customConfig.headless) {
      mcpArgs.push('--headless');
    }
    if (browserConfig.customConfig.profilePath) {
      mcpArgs.push('--userDataDir', browserConfig.customConfig.profilePath);
    } else if (sessionMode === 'persistent') {
      // Default persistent profile lives under ~/.gemini/cli-browser-profile
      const defaultProfilePath = path.join(
        Storage.getGlobalGeminiDir(),
        BROWSER_PROFILE_DIR,
      );
      mcpArgs.push('--userDataDir', defaultProfilePath);
    }

    debugLogger.log(
      `Launching chrome-devtools-mcp (${sessionMode} mode) with args: ${mcpArgs.join(' ')}`,
    );

    // Create stdio transport to npx chrome-devtools-mcp.
    // stderr is piped (not inherited) to prevent MCP server banners and
    // warnings from corrupting the UI in alternate buffer mode.
    this.mcpTransport = new StdioClientTransport({
      command: 'npx',
      args: mcpArgs,
      stderr: 'pipe',
    });

    // Forward piped stderr to debugLogger so it's visible with --debug.
    const stderrStream = this.mcpTransport.stderr;
    if (stderrStream) {
      stderrStream.on('data', (chunk: Buffer) => {
        debugLogger.log(
          `[chrome-devtools-mcp stderr] ${chunk.toString().trimEnd()}`,
        );
      });
    }

    this.mcpTransport.onclose = () => {
      debugLogger.error(
        'chrome-devtools-mcp transport closed unexpectedly. ' +
          'The MCP server process may have crashed.',
      );
      // Dropping the client makes the next ensureConnection() reconnect.
      this.rawMcpClient = undefined;
    };
    this.mcpTransport.onerror = (error: Error) => {
      debugLogger.error(
        `chrome-devtools-mcp transport error: ${error.message}`,
      );
    };

    // Connect to MCP server — use a shorter timeout for 'existing' mode
    // since it should connect quickly if remote debugging is enabled.
    const connectTimeoutMs =
      sessionMode === 'existing' ? 15_000 : MCP_TIMEOUT_MS;

    let timeoutId: ReturnType<typeof setTimeout> | undefined;
    try {
      await Promise.race([
        (async () => {
          await this.rawMcpClient!.connect(this.mcpTransport!);
          debugLogger.log('MCP client connected to chrome-devtools-mcp');
          await this.discoverTools();
        })(),
        new Promise<never>((_, reject) => {
          timeoutId = setTimeout(
            () =>
              reject(
                new Error(
                  `Timed out connecting to chrome-devtools-mcp (${connectTimeoutMs}ms)`,
                ),
              ),
            connectTimeoutMs,
          );
        }),
      ]);
    } catch (error) {
      await this.close();

      // Provide error-specific, session-mode-aware remediation
      throw this.createConnectionError(
        error instanceof Error ? error.message : String(error),
        sessionMode,
      );
    } finally {
      if (timeoutId !== undefined) {
        clearTimeout(timeoutId);
      }
    }
  }

  /**
   * Creates an Error with context-specific remediation based on the actual
   * error message and the current sessionMode.
   *
   * @param message The underlying failure message.
   * @param sessionMode The session mode that was being used to connect.
   * @returns An Error whose message embeds `message` plus remediation hints.
   */
  private createConnectionError(message: string, sessionMode: string): Error {
    const lowerMessage = message.toLowerCase();

    // "already running for the current profile" — profile lock
    if (lowerMessage.includes('already running')) {
      if (sessionMode === 'persistent' || sessionMode === 'isolated') {
        return new Error(
          `Could not connect to Chrome: ${message}\n\n` +
            `The Chrome profile is locked by another running instance.\n` +
            `To fix this:\n` +
            ` 1. Close all Chrome windows using this profile, OR\n` +
            ` 2. Set sessionMode to "isolated" in settings.json to use a temporary profile, OR\n` +
            ` 3. Set profilePath in settings.json to use a different profile directory`,
        );
      }
      // existing mode — shouldn't normally hit this, but handle gracefully
      return new Error(
        `Could not connect to Chrome: ${message}\n\n` +
          `The Chrome profile is locked.\n` +
          `Close other Chrome instances and try again.`,
      );
    }

    // Timeout errors
    if (lowerMessage.includes('timed out')) {
      if (sessionMode === 'existing') {
        return new Error(
          `Timed out connecting to Chrome: ${message}\n\n` +
            `To use sessionMode "existing", you must:\n` +
            ` 1. Open Chrome (version 144+)\n` +
            ` 2. Navigate to chrome://inspect/#remote-debugging\n` +
            ` 3. Enable remote debugging\n\n` +
            `Alternatively, set sessionMode to "persistent" (default) in settings.json to launch a dedicated browser.`,
        );
      }
      return new Error(
        `Timed out connecting to Chrome: ${message}\n\n` +
          `Possible causes:\n` +
          ` 1. Chrome is not installed or not in PATH\n` +
          ` 2. npx cannot download chrome-devtools-mcp (check network/proxy)\n` +
          ` 3. Chrome failed to start (try setting headless: true in settings.json)`,
      );
    }

    // Generic "existing" mode failures (connection refused, etc.)
    if (sessionMode === 'existing') {
      return new Error(
        `Failed to connect to existing Chrome instance: ${message}\n\n` +
          `To use sessionMode "existing", you must:\n` +
          ` 1. Open Chrome (version 144+)\n` +
          ` 2. Navigate to chrome://inspect/#remote-debugging\n` +
          ` 3. Enable remote debugging\n\n` +
          `Alternatively, set sessionMode to "persistent" (default) in settings.json to launch a dedicated browser.`,
      );
    }

    // Generic fallback — include sessionMode for debugging context
    return new Error(
      `Failed to connect to Chrome (sessionMode: ${sessionMode}): ${message}`,
    );
  }

  /**
   * Discovers tools from the connected MCP server and caches them on this
   * instance.
   *
   * @throws Error if no MCP client is currently held.
   */
  private async discoverTools(): Promise<void> {
    if (!this.rawMcpClient) {
      throw new Error('MCP client not connected');
    }

    const response = await this.rawMcpClient.listTools();
    this.discoveredTools = response.tools;

    debugLogger.log(
      `Discovered ${this.discoveredTools.length} tools from chrome-devtools-mcp: ` +
        this.discoveredTools.map((t) => t.name).join(', '),
    );
  }
}
[{ type: 'text', text: 'Tool result' }], + } as McpToolCallResult), + } as unknown as BrowserManager; + + // Setup mock message bus + mockMessageBus = { + publish: vi.fn().mockResolvedValue(undefined), + subscribe: vi.fn(), + unsubscribe: vi.fn(), + } as unknown as MessageBus; + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + describe('createMcpDeclarativeTools', () => { + it('should create declarative tools from discovered MCP tools', async () => { + const tools = await createMcpDeclarativeTools( + mockBrowserManager, + mockMessageBus, + ); + + expect(tools).toHaveLength(3); + expect(tools[0].name).toBe('take_snapshot'); + expect(tools[1].name).toBe('click'); + expect(tools[2].name).toBe('type_text'); + }); + + it('should return tools with correct description', async () => { + const tools = await createMcpDeclarativeTools( + mockBrowserManager, + mockMessageBus, + ); + + // Descriptions include augmented hints, so we check they contain the original + expect(tools[0].description).toContain( + 'Take a snapshot of the page accessibility tree', + ); + expect(tools[1].description).toContain('Click on an element by uid'); + }); + + it('should return tools with proper FunctionDeclaration schema', async () => { + const tools = await createMcpDeclarativeTools( + mockBrowserManager, + mockMessageBus, + ); + + const schema = tools[0].schema; + expect(schema.name).toBe('take_snapshot'); + expect(schema.parametersJsonSchema).toBeDefined(); + }); + }); + + describe('McpDeclarativeTool.build', () => { + it('should create invocation that can be executed', async () => { + const tools = await createMcpDeclarativeTools( + mockBrowserManager, + mockMessageBus, + ); + + const invocation = tools[0].build({ verbose: true }); + + expect(invocation).toBeDefined(); + expect(invocation.params).toEqual({ verbose: true }); + }); + + it('should return invocation with correct description', async () => { + const tools = await createMcpDeclarativeTools( + mockBrowserManager, + 
mockMessageBus, + ); + + const invocation = tools[0].build({}); + + expect(invocation.getDescription()).toContain('take_snapshot'); + }); + }); + + describe('McpToolInvocation.execute', () => { + it('should call browserManager.callTool with correct params', async () => { + const tools = await createMcpDeclarativeTools( + mockBrowserManager, + mockMessageBus, + ); + + const invocation = tools[1].build({ uid: 'elem-123' }); + await invocation.execute(new AbortController().signal); + + expect(mockBrowserManager.callTool).toHaveBeenCalledWith( + 'click', + { + uid: 'elem-123', + }, + expect.any(AbortSignal), + ); + }); + + it('should return success result from MCP tool', async () => { + const tools = await createMcpDeclarativeTools( + mockBrowserManager, + mockMessageBus, + ); + + const invocation = tools[0].build({ verbose: true }); + const result = await invocation.execute(new AbortController().signal); + + expect(result.llmContent).toBe('Tool result'); + expect(result.error).toBeUndefined(); + }); + + it('should handle MCP tool errors', async () => { + vi.mocked(mockBrowserManager.callTool).mockResolvedValue({ + content: [{ type: 'text', text: 'Element not found' }], + isError: true, + } as McpToolCallResult); + + const tools = await createMcpDeclarativeTools( + mockBrowserManager, + mockMessageBus, + ); + + const invocation = tools[1].build({ uid: 'invalid' }); + const result = await invocation.execute(new AbortController().signal); + + expect(result.error).toBeDefined(); + expect(result.error?.message).toBe('Element not found'); + }); + + it('should handle exceptions during tool call', async () => { + vi.mocked(mockBrowserManager.callTool).mockRejectedValue( + new Error('Connection lost'), + ); + + const tools = await createMcpDeclarativeTools( + mockBrowserManager, + mockMessageBus, + ); + + const invocation = tools[0].build({}); + const result = await invocation.execute(new AbortController().signal); + + expect(result.error).toBeDefined(); + 
expect(result.error?.message).toBe('Connection lost'); + }); + }); +}); diff --git a/packages/core/src/agents/browser/mcpToolWrapper.ts b/packages/core/src/agents/browser/mcpToolWrapper.ts new file mode 100644 index 0000000000..1838a01b42 --- /dev/null +++ b/packages/core/src/agents/browser/mcpToolWrapper.ts @@ -0,0 +1,545 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * @fileoverview Creates DeclarativeTool classes for MCP tools. + * + * These tools are ONLY registered in the browser agent's isolated ToolRegistry, + * NOT in the main agent's registry. They dispatch to the BrowserManager's + * isolated MCP client directly. + * + * Tool definitions are dynamically discovered from chrome-devtools-mcp + * at runtime, not hardcoded. + */ + +import type { FunctionDeclaration } from '@google/genai'; +import type { Tool as McpTool } from '@modelcontextprotocol/sdk/types.js'; +import { + type ToolConfirmationOutcome, + DeclarativeTool, + BaseToolInvocation, + Kind, + type ToolResult, + type ToolInvocation, + type ToolCallConfirmationDetails, + type PolicyUpdateOptions, +} from '../../tools/tools.js'; +import type { MessageBus } from '../../confirmation-bus/message-bus.js'; +import type { BrowserManager, McpToolCallResult } from './browserManager.js'; +import { debugLogger } from '../../utils/debugLogger.js'; + +/** + * Tool invocation that dispatches to BrowserManager's isolated MCP client. 
/**
 * Tool invocation that dispatches to BrowserManager's isolated MCP client.
 */
class McpToolInvocation extends BaseToolInvocation<
  Record<string, unknown>,
  ToolResult
> {
  constructor(
    private readonly browserManager: BrowserManager,
    private readonly toolName: string,
    params: Record<string, unknown>,
    messageBus: MessageBus,
  ) {
    super(params, messageBus, toolName, toolName);
  }

  /** Short human-readable label for this invocation. */
  getDescription(): string {
    return `Calling MCP tool: ${this.toolName}`;
  }

  /**
   * Builds the confirmation prompt for this tool call, or returns false when
   * there is no message bus to confirm through.
   */
  protected override async getConfirmationDetails(
    _abortSignal: AbortSignal,
  ): Promise<ToolCallConfirmationDetails | false> {
    if (!this.messageBus) {
      return false;
    }

    return {
      type: 'mcp',
      title: `Confirm MCP Tool: ${this.toolName}`,
      serverName: 'browser-agent',
      toolName: this.toolName,
      toolDisplayName: this.toolName,
      onConfirm: async (outcome: ToolConfirmationOutcome) => {
        await this.publishPolicyUpdate(outcome);
      },
    };
  }

  /** Policy updates for every browser tool are filed under 'browser-agent'. */
  protected override getPolicyUpdateOptions(
    _outcome: ToolConfirmationOutcome,
  ): PolicyUpdateOptions | undefined {
    return {
      mcpName: 'browser-agent',
    };
  }

  /**
   * Runs the MCP tool and converts its response into a ToolResult.
   *
   * Only text content items are surfaced; they are joined with newlines and
   * passed through postProcessToolResult.
   *
   * @param signal Cancels the underlying MCP call.
   * @returns A success or error ToolResult.
   * @throws Re-throws errors whose message contains
   *   'Could not connect to Chrome'.
   */
  async execute(signal: AbortSignal): Promise<ToolResult> {
    try {
      const result: McpToolCallResult = await this.browserManager.callTool(
        this.toolName,
        this.params,
        signal,
      );

      // Extract text content from MCP response
      let textContent = '';
      if (result.content && Array.isArray(result.content)) {
        textContent = result.content
          .filter((c) => c.type === 'text' && c.text)
          .map((c) => c.text)
          .join('\n');
      }

      // Post-process to add contextual hints for common error patterns
      const processedContent = postProcessToolResult(
        this.toolName,
        textContent,
      );

      if (result.isError) {
        return {
          llmContent: `Error: ${processedContent}`,
          returnDisplay: `Error: ${processedContent}`,
          error: { message: textContent },
        };
      }

      return {
        llmContent: processedContent || 'Tool executed successfully.',
        returnDisplay: processedContent || 'Tool executed successfully.',
      };
    } catch (error) {
      const errorMsg = error instanceof Error ? error.message : String(error);

      // Chrome connection errors are fatal — re-throw to terminate the agent
      // immediately instead of returning a result the LLM would retry.
      if (errorMsg.includes('Could not connect to Chrome')) {
        throw error;
      }

      debugLogger.error(`MCP tool ${this.toolName} failed: ${errorMsg}`);
      return {
        llmContent: `Error: ${errorMsg}`,
        returnDisplay: `Error: ${errorMsg}`,
        error: { message: errorMsg },
      };
    }
  }
}

/**
 * Composite tool invocation that types a full string by calling press_key
 * for each character internally, avoiding N model round-trips.
 */
class TypeTextInvocation extends BaseToolInvocation<
  Record<string, unknown>,
  ToolResult
> {
  constructor(
    private readonly browserManager: BrowserManager,
    private readonly text: string,
    private readonly submitKey: string | undefined,
    messageBus: MessageBus,
  ) {
    super({ text, submitKey }, messageBus, 'type_text', 'type_text');
  }

  /** Label showing at most the first 50 characters of the text to type. */
  getDescription(): string {
    const preview = `"${this.text.substring(0, 50)}${this.text.length > 50 ? '...' : ''}"`;
    return this.submitKey
      ? `type_text: ${preview} + ${this.submitKey}`
      : `type_text: ${preview}`;
  }

  /**
   * Builds the confirmation prompt for type_text, or returns false when
   * there is no message bus to confirm through.
   */
  protected override async getConfirmationDetails(
    _abortSignal: AbortSignal,
  ): Promise<ToolCallConfirmationDetails | false> {
    if (!this.messageBus) {
      return false;
    }

    return {
      type: 'mcp',
      title: `Confirm Tool: type_text`,
      serverName: 'browser-agent',
      toolName: 'type_text',
      toolDisplayName: 'type_text',
      onConfirm: async (outcome: ToolConfirmationOutcome) => {
        await this.publishPolicyUpdate(outcome);
      },
    };
  }

  /** Policy updates for every browser tool are filed under 'browser-agent'. */
  protected override getPolicyUpdateOptions(
    _outcome: ToolConfirmationOutcome,
  ): PolicyUpdateOptions | undefined {
    return {
      mcpName: 'browser-agent',
    };
  }

  /**
   * Types the text one key at a time, then optionally presses submitKey.
   *
   * Success is reported only when every key press succeeded and the operation
   * was not cancelled part-way; otherwise an error result is returned so the
   * model is not told that text was entered when it was not.
   *
   * @param signal Cancels the operation between or during key presses.
   * @returns A success or error ToolResult.
   * @throws Re-throws errors whose message contains
   *   'Could not connect to Chrome'.
   */
  override async execute(signal: AbortSignal): Promise<ToolResult> {
    try {
      if (signal.aborted) {
        return {
          llmContent: 'Error: Operation cancelled before typing started.',
          returnDisplay: 'Operation cancelled before typing started.',
          error: { message: 'Operation cancelled' },
        };
      }

      const failedKeys = await this.typeCharByChar(signal);

      // typeCharByChar stops early when cancelled between key presses; the
      // text is then only partially entered, so this must not read as success.
      if (signal.aborted) {
        return {
          llmContent: 'Error: Operation cancelled while typing.',
          returnDisplay: 'Operation cancelled while typing.',
          error: { message: 'Operation cancelled' },
        };
      }

      // Optionally press a submit key (Enter, Tab, etc.) after typing
      if (this.submitKey) {
        const keyResult = await this.browserManager.callTool(
          'press_key',
          { key: this.submitKey },
          signal,
        );
        if (keyResult.isError) {
          const errText = this.extractErrorText(keyResult);
          debugLogger.warn(
            `type_text: submitKey("${this.submitKey}") failed: ${errText}`,
          );
          failedKeys.push(this.submitKey);
        }
      }

      if (failedKeys.length > 0) {
        // De-duplicate so a repeated character is listed once.
        const distinctKeys = [...new Set(failedKeys)]
          .map((key) => JSON.stringify(key))
          .join(', ');
        const message = `type_text could not press ${failedKeys.length} key(s): ${distinctKeys}`;
        return {
          llmContent: `Error: ${message}`,
          returnDisplay: `Error: ${message}`,
          error: { message },
        };
      }

      const summary = this.submitKey
        ? `Successfully typed "${this.text}" and pressed ${this.submitKey}`
        : `Successfully typed "${this.text}"`;

      return {
        llmContent: summary,
        returnDisplay: summary,
      };
    } catch (error) {
      const errorMsg = error instanceof Error ? error.message : String(error);

      // Chrome connection errors are fatal
      if (errorMsg.includes('Could not connect to Chrome')) {
        throw error;
      }

      debugLogger.error(`type_text failed: ${errorMsg}`);
      return {
        llmContent: `Error: ${errorMsg}`,
        returnDisplay: `Error: ${errorMsg}`,
        error: { message: errorMsg },
      };
    }
  }

  /**
   * Types each character via individual press_key MCP calls.
   *
   * A failed key press is logged and typing continues with the next
   * character. Returns early, without typing the rest, once the signal is
   * aborted.
   *
   * @returns The keys whose press_key call reported an error, in order.
   */
  private async typeCharByChar(signal: AbortSignal): Promise<string[]> {
    const failedKeys: string[] = [];
    const chars = [...this.text]; // Handle Unicode correctly
    for (const char of chars) {
      if (signal.aborted) return failedKeys;

      // Map special characters to key names
      const key = char === ' ' ? 'Space' : char;
      const result = await this.browserManager.callTool(
        'press_key',
        { key },
        signal,
      );

      if (result.isError) {
        debugLogger.warn(
          `type_text: press_key("${key}") failed: ${this.extractErrorText(result)}`,
        );
        failedKeys.push(key);
      }
    }
    return failedKeys;
  }

  /**
   * Extract error text from an MCP tool result: the non-empty text items
   * joined by newlines, or 'Unknown error' when there are none.
   */
  private extractErrorText(result: McpToolCallResult): string {
    return (
      result.content
        ?.filter(
          (c: { type: string; text?: string }) => c.type === 'text' && c.text,
        )
        .map((c: { type: string; text?: string }) => c.text)
        .join('\n') || 'Unknown error'
    );
  }
}

/**
 * DeclarativeTool wrapper for an MCP tool.
 */
class McpDeclarativeTool extends DeclarativeTool<
  Record<string, unknown>,
  ToolResult
> {
  constructor(
    private readonly browserManager: BrowserManager,
    name: string,
    description: string,
    parameterSchema: unknown,
    messageBus: MessageBus,
  ) {
    super(
      name,
      name,
      description,
      Kind.Other,
      parameterSchema,
      messageBus,
      /* isOutputMarkdown */ true,
      /* canUpdateOutput */ false,
    );
  }

  /** Creates an invocation that forwards `params` to the MCP tool. */
  build(
    params: Record<string, unknown>,
  ): ToolInvocation<Record<string, unknown>, ToolResult> {
    return new McpToolInvocation(
      this.browserManager,
      this.name,
      params,
      this.messageBus,
    );
  }
}
/**
 * DeclarativeTool for the custom type_text composite tool.
 */
class TypeTextDeclarativeTool extends DeclarativeTool<
  Record<string, unknown>,
  ToolResult
> {
  constructor(
    private readonly browserManager: BrowserManager,
    messageBus: MessageBus,
  ) {
    super(
      'type_text',
      'type_text',
      'Types a full text string into the currently focused element. ' +
        'Much faster than calling press_key for each character individually. ' +
        'Use this to enter text into form fields, search boxes, spreadsheet cells, or any focused input. ' +
        'The element must already be focused (e.g., after a click). ' +
        'Use submitKey to press a key after typing (e.g., submitKey="Enter" to submit a form or confirm a value, submitKey="Tab" to move to the next field).',
      Kind.Other,
      {
        type: 'object',
        properties: {
          text: {
            type: 'string',
            description: 'The text to type into the focused element.',
          },
          submitKey: {
            type: 'string',
            description:
              'Optional key to press after typing (e.g., "Enter", "Tab", "Escape"). ' +
              'Useful for submitting form fields or moving to the next cell in a spreadsheet.',
          },
        },
        required: ['text'],
      },
      messageBus,
      /* isOutputMarkdown */ true,
      /* canUpdateOutput */ false,
    );
  }

  /**
   * Creates a type_text invocation from raw tool parameters.
   *
   * `text` is coerced with String() (a missing value becomes ''), and
   * `submitKey` is used only when it is a non-empty string.
   */
  build(
    params: Record<string, unknown>,
  ): ToolInvocation<Record<string, unknown>, ToolResult> {
    const rawSubmitKey = params['submitKey'];
    const submitKey =
      typeof rawSubmitKey === 'string' && rawSubmitKey
        ? rawSubmitKey
        : undefined;
    return new TypeTextInvocation(
      this.browserManager,
      String(params['text'] ?? ''),
      submitKey,
      this.messageBus,
    );
  }
}
/**
 * Creates DeclarativeTool instances from dynamically discovered MCP tools,
 * plus custom composite tools (like type_text).
 *
 * These tools are registered in the browser agent's isolated ToolRegistry,
 * NOT in the main agent's registry.
 *
 * Tool definitions are fetched dynamically from the MCP server at runtime.
 *
 * @param browserManager The browser manager with isolated MCP client
 * @param messageBus Message bus for tool invocations
 * @returns Array of DeclarativeTools that dispatch to the isolated MCP client;
 *   the discovered MCP tools come first, followed by type_text
 */
export async function createMcpDeclarativeTools(
  browserManager: BrowserManager,
  messageBus: MessageBus,
): Promise<Array<McpDeclarativeTool | TypeTextDeclarativeTool>> {
  // Get dynamically discovered tools from the MCP server
  const mcpTools = await browserManager.getDiscoveredTools();

  debugLogger.log(
    `Creating ${mcpTools.length} declarative tools for browser agent`,
  );

  const tools: Array<McpDeclarativeTool | TypeTextDeclarativeTool> =
    mcpTools.map((mcpTool) => {
      const schema = convertMcpToolToFunctionDeclaration(mcpTool);
      // Augment description with uid-context hints
      const augmentedDescription = augmentToolDescription(
        mcpTool.name,
        mcpTool.description ?? '',
      );
      return new McpDeclarativeTool(
        browserManager,
        mcpTool.name,
        augmentedDescription,
        schema.parametersJsonSchema,
        messageBus,
      );
    });

  // Add custom composite tools
  tools.push(new TypeTextDeclarativeTool(browserManager, messageBus));

  // Derive the custom-tool count so the log stays accurate if more composite
  // tools are added above.
  const customToolCount = tools.length - mcpTools.length;
  debugLogger.log(
    `Total tools registered: ${tools.length} (${mcpTools.length} MCP + ${customToolCount} custom)`,
  );

  return tools;
}

/**
 * Converts MCP tool definition to Gemini FunctionDeclaration.
 *
 * A missing description becomes '' and a missing inputSchema becomes an empty
 * object schema.
 */
function convertMcpToolToFunctionDeclaration(
  mcpTool: McpTool,
): FunctionDeclaration {
  // MCP tool inputSchema is a JSON Schema object
  // We pass it directly as parametersJsonSchema
  return {
    name: mcpTool.name,
    description: mcpTool.description ?? '',
    parametersJsonSchema: mcpTool.inputSchema ?? {
      type: 'object',
      properties: {},
    },
  };
}
+ * + * Actual chrome-devtools-mcp tools: + * Input: click, drag, fill, fill_form, handle_dialog, hover, press_key, upload_file + * Navigation: close_page, list_pages, navigate_page, new_page, select_page, wait_for + * Emulation: emulate, resize_page + * Performance: performance_analyze_insight, performance_start_trace, performance_stop_trace + * Network: get_network_request, list_network_requests + * Debugging: evaluate_script, get_console_message, list_console_messages, take_screenshot, take_snapshot + * Vision (--experimental-vision): click_at, analyze_screenshot + */ +function augmentToolDescription(toolName: string, description: string): string { + // More-specific keys MUST come before shorter keys to prevent + // partial matching from short-circuiting (e.g., fill_form before fill). + const hints: Record = { + fill_form: + ' Fills multiple standard HTML form fields at once. Same limitations as fill — does not work on canvas/custom widgets.', + fill: ' Fills standard HTML form fields (,