mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-30 13:43:00 -07:00
2bc2945d14
- Add custom type_text tool that types a full string by internally calling press_key for each character, turning N model round-trips into 1. Dramatically speeds up text input in complex web apps. - Move tool-specific usage rules from system prompt to individual tool descriptions via augmentToolDescription() for better organization and token efficiency. - Add terminal failure handling instructions to system prompt (Chrome connection errors, browser crashes, repeated errors) with specific remediation steps. - Add complex web app guidance (spreadsheets, rich editors) to system prompt, recommending type_text + keyboard navigation. - Fix augmentToolDescription key ordering so more-specific keys (fill_form, click_at) match before shorter keys (fill, click). - Remove non-existent tool references (scroll, type_text as MCP tool) and add click_at hint for vision tool.
171 lines
7.2 KiB
TypeScript
171 lines
7.2 KiB
TypeScript
/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
|
||
|
||
/**
 * @fileoverview Browser Agent definition following the LocalAgentDefinition pattern.
 *
 * This agent uses LocalAgentExecutor for its reAct loop, like CodebaseInvestigatorAgent.
 * It is available ONLY via delegate_to_agent, NOT as a direct tool.
 *
 * Tools are configured dynamically at invocation time via browserAgentFactory.
 */
|
||
|
||
import type { LocalAgentDefinition } from '../types.js';
|
||
import type { Config } from '../../config/config.js';
|
||
import { z } from 'zod';
|
||
import {
|
||
isPreviewModel,
|
||
PREVIEW_GEMINI_FLASH_MODEL,
|
||
DEFAULT_GEMINI_FLASH_MODEL,
|
||
} from '../../config/models.js';
|
||
|
||
/** Canonical agent name — used for routing and configuration lookup. */
export const BROWSER_AGENT_NAME = 'browser_agent';
|
||
|
||
/**
 * Output schema for browser agent results.
 *
 * Fields:
 * - `success`: whether the task was completed successfully.
 * - `summary`: what was accomplished or what went wrong.
 * - `data`: optional extracted data; left as `unknown` so callers must
 *   narrow it before use.
 */
export const BrowserTaskResultSchema = z.object({
  success: z.boolean().describe('Whether the task was completed successfully'),
  summary: z
    .string()
    .describe('A summary of what was accomplished or what went wrong'),
  data: z
    .unknown()
    .optional()
    .describe('Optional extracted data from the task'),
});
|
||
|
||
/**
 * System prompt for the semantic browser agent.
 * Extracted from prototype (computer_use_subagent_cdt branch).
 *
 * Sections, in order: using uid values from the accessibility tree snapshot,
 * keeping state-changing tool calls sequential, dismissing overlays/popups,
 * visual identification via analyze_screenshot, complex web apps
 * (type_text + keyboard navigation), terminal failures that must stop the
 * loop, and the requirement to finish by calling complete_task.
 *
 * The text is a single template literal with no interpolation; every line
 * break below is part of the prompt sent to the model.
 */
export const BROWSER_SYSTEM_PROMPT = `You are an expert browser automation agent (Orchestrator). Your goal is to completely fulfill the user's request.

IMPORTANT: You will receive an accessibility tree snapshot showing elements with uid values (e.g., uid=87_4 button "Login").
Use these uid values directly with your tools:
- click(uid="87_4") to click the Login button
- fill(uid="87_2", value="john") to fill a text field
- fill_form(elements=[{uid: "87_2", value: "john"}, {uid: "87_3", value: "pass"}]) to fill multiple fields at once

PARALLEL TOOL CALLS - CRITICAL:
- Do NOT make parallel calls for actions that change page state (click, fill, press_key, etc.)
- Each action changes the DOM and invalidates UIDs from the current snapshot
- Make state-changing actions ONE AT A TIME, then observe the results

OVERLAY/POPUP HANDLING:
Before interacting with page content, scan the accessibility tree for blocking overlays:
- Tooltips, popups, modals, cookie banners, newsletter prompts, promo dialogs
- These often have: close buttons (×, X, Close, Dismiss), "Got it", "Accept", "No thanks" buttons
- Common patterns: elements with role="dialog", role="tooltip", role="alertdialog", or aria-modal="true"
- If you see such elements, DISMISS THEM FIRST by clicking close/dismiss buttons before proceeding
- If a click seems to have no effect, check if an overlay appeared or is blocking the target

VISUAL IDENTIFICATION (analyze_screenshot):
When you need to identify elements by visual attributes not in the AX tree (e.g., "click the yellow button", "find the red error message"), or need precise pixel coordinates:
1. Call analyze_screenshot with a clear instruction describing what to find
2. It returns visual analysis with coordinates/descriptions — it does NOT perform actions
3. Use the returned coordinates with click_at(x, y) or other tools yourself
4. If the analysis is insufficient, call it again with a more specific instruction

COMPLEX WEB APPS (spreadsheets, rich editors, canvas apps):
Many web apps (Google Sheets/Docs, Notion, Figma, etc.) use custom rendering rather than standard HTML inputs.
- fill does NOT work on these apps. Instead, click the target element, then use type_text to enter the value.
- Navigate cells/fields using keyboard shortcuts (Tab, Enter, ArrowDown) — more reliable than clicking cell UIDs.
- For spreadsheets: click a cell → type_text("value") → press_key("Enter") to confirm and move to the next cell.
- Use the Name Box (cell reference input, usually showing "A1") to jump to specific cells.

TERMINAL FAILURES — STOP IMMEDIATELY:
Some errors are unrecoverable and retrying will never help. When you see ANY of these, call complete_task immediately with success=false and include the specific remediation steps in your summary:
- "Could not connect to Chrome" — Include ALL of these instructions in your summary:
1. Open Chrome (version 144+)
2. Go to chrome://inspect/#remote-debugging and enable remote debugging
3. Or change sessionMode to "persistent" in settings.json to let the agent launch its own browser
- "Browser closed" or "Target closed" or "Session closed" — The browser process has terminated. Tell the user to restart and try again.
- "net::ERR_" network errors on the SAME URL after 2 retries — the site is unreachable. Report the URL and error.
- Any error that appears IDENTICALLY 3+ times in a row — it will not resolve by retrying.
Do NOT keep retrying terminal errors. Report them with actionable remediation steps and exit immediately.

CRITICAL: When you have fully completed the user's task, you MUST call the complete_task tool with a summary of what you accomplished. Do NOT just return text - you must explicitly call complete_task to exit the loop.`;
|
||
|
||
/**
 * Browser Agent Definition Factory.
 *
 * Following the CodebaseInvestigatorAgent pattern:
 * - Is a factory function that takes Config for dynamic model selection
 * - kind: 'local' for LocalAgentExecutor
 * - toolConfig is set dynamically by browserAgentFactory
 *
 * @param config Configuration whose current model (`config.getModel()`)
 *   decides which Flash variant the agent runs on.
 * @returns The browser agent definition, with `toolConfig` left undefined.
 */
export const BrowserAgentDefinition = (
  config: Config,
): LocalAgentDefinition<typeof BrowserTaskResultSchema> => {
  // Use Preview Flash model if the main model is any of the preview models.
  // If the main model is not a preview model, use the default flash model.
  const model = isPreviewModel(config.getModel())
    ? PREVIEW_GEMINI_FLASH_MODEL
    : DEFAULT_GEMINI_FLASH_MODEL;

  return {
    name: BROWSER_AGENT_NAME,
    kind: 'local',
    experimental: true,
    displayName: 'Browser Agent',
    description: `Specialized agent for web browser automation using the Accessibility Tree.
Use this agent for: navigating websites, filling forms, clicking buttons,
extracting information from web pages. It can see and interact with the page
structure semantically through the accessibility tree.`,

    inputConfig: {
      inputSchema: {
        type: 'object',
        properties: {
          task: {
            type: 'string',
            description: 'The task to perform in the browser.',
          },
        },
        required: ['task'],
      },
    },

    outputConfig: {
      outputName: 'result',
      description: 'The result of the browser task.',
      schema: BrowserTaskResultSchema,
    },

    // Serialize the structured result as pretty-printed JSON (2-space indent).
    processOutput: (output) => JSON.stringify(output, null, 2),

    modelConfig: {
      // Dynamic model based on whether user is using preview models
      model,
      generateContentConfig: {
        temperature: 0.1,
        topP: 0.95,
      },
    },

    runConfig: {
      maxTimeMinutes: 10,
      maxTurns: 50,
    },

    // Tools are set dynamically by browserAgentFactory after MCP connection
    // This is undefined here and will be set at invocation time
    toolConfig: undefined,

    promptConfig: {
      // `\${task}` is escaped on purpose: the literal text "${task}" stays in
      // the query string instead of being interpolated here.
      query: `Your task is:
<task>
\${task}
</task>

First, use new_page to open the relevant URL. Then call take_snapshot to see the page and proceed with your task.`,
      systemPrompt: BROWSER_SYSTEM_PROMPT,
    },
  };
};
|