mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-06-23 09:47:45 -07:00
251 lines
8.2 KiB
TypeScript
251 lines
8.2 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2026 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
/**
|
|
* @fileoverview Tool for visual identification via a single model call.
|
|
*
|
|
* The semantic browser agent uses this tool when it needs to identify
|
|
* elements by visual attributes not present in the accessibility tree
|
|
* (e.g., color, layout, precise coordinates).
|
|
*
|
|
* Unlike the semantic agent which works with the accessibility tree,
|
|
* this tool sends a screenshot to a computer-use model for visual analysis.
|
|
* It returns the model's analysis (coordinates, element descriptions) back
|
|
* to the browser agent, which retains full control of subsequent actions.
|
|
*/
|
|
|
|
import {
|
|
DeclarativeTool,
|
|
BaseToolInvocation,
|
|
Kind,
|
|
type ToolResult,
|
|
type ToolInvocation,
|
|
} from '../../tools/tools.js';
|
|
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
|
|
import type { BrowserManager } from './browserManager.js';
|
|
import type { Config } from '../../config/config.js';
|
|
import { getVisualAgentModel } from './modelAvailability.js';
|
|
import { debugLogger } from '../../utils/debugLogger.js';
|
|
import { LlmRole } from '../../telemetry/llmRole.js';
|
|
|
|
/**
|
|
* System prompt for the visual analysis model call.
|
|
*/
|
|
const VISUAL_SYSTEM_PROMPT = `You are a Visual Analysis Agent. You receive a screenshot of a browser page and an instruction.
|
|
|
|
Your job is to ANALYZE the screenshot and provide precise information that a browser automation agent can act on.
|
|
|
|
COORDINATE SYSTEM:
|
|
- Coordinates are pixel-based relative to the viewport
|
|
- (0,0) is top-left of the visible area
|
|
- Estimate element positions from the screenshot
|
|
|
|
RESPONSE FORMAT:
|
|
- For coordinate identification: provide exact (x, y) pixel coordinates
|
|
- For element identification: describe the element's visual location and appearance
|
|
- For layout analysis: describe the spatial relationships between elements
|
|
- Be concise and actionable — the browser agent will use your response to decide what action to take
|
|
|
|
IMPORTANT:
|
|
- You are NOT performing actions — you are only providing visual analysis
|
|
- Include coordinates when possible so the caller can use click_at(x, y)
|
|
- If the element is not visible in the screenshot, say so explicitly`;
|
|
|
|
/**
|
|
* Invocation for the analyze_screenshot tool.
|
|
* Makes a single generateContent call with a screenshot.
|
|
*/
|
|
class AnalyzeScreenshotInvocation extends BaseToolInvocation<
|
|
Record<string, unknown>,
|
|
ToolResult
|
|
> {
|
|
constructor(
|
|
private readonly browserManager: BrowserManager,
|
|
private readonly config: Config,
|
|
params: Record<string, unknown>,
|
|
messageBus: MessageBus,
|
|
) {
|
|
super(params, messageBus, 'analyze_screenshot', 'Analyze Screenshot');
|
|
}
|
|
|
|
getDescription(): string {
|
|
const instruction = String(this.params['instruction'] ?? '');
|
|
return `Visual analysis: "${instruction}"`;
|
|
}
|
|
|
|
async execute(signal: AbortSignal): Promise<ToolResult> {
|
|
try {
|
|
const instruction = String(this.params['instruction'] ?? '');
|
|
|
|
debugLogger.log(`Visual analysis requested: ${instruction}`);
|
|
|
|
// Capture screenshot via MCP tool
|
|
const screenshotResult = await this.browserManager.callTool(
|
|
'take_screenshot',
|
|
{},
|
|
);
|
|
|
|
// Extract base64 image data from MCP response.
|
|
// Search ALL content items for image type — MCP returns [text, image]
|
|
// where content[0] is a text description and content[1] is the actual PNG.
|
|
let screenshotBase64 = '';
|
|
let mimeType = 'image/png';
|
|
if (screenshotResult.content && Array.isArray(screenshotResult.content)) {
|
|
for (const item of screenshotResult.content) {
|
|
if (item.type === 'image' && item.data) {
|
|
screenshotBase64 = item.data;
|
|
mimeType = item.mimeType ?? 'image/png';
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!screenshotBase64) {
|
|
return {
|
|
llmContent:
|
|
'Failed to capture screenshot for visual analysis. Use accessibility tree elements instead.',
|
|
returnDisplay: 'Screenshot capture failed',
|
|
error: { message: 'Screenshot capture failed' },
|
|
};
|
|
}
|
|
|
|
// Make a single generateContent call with the visual model
|
|
const visualModel = getVisualAgentModel(this.config);
|
|
const contentGenerator = this.config.getContentGenerator();
|
|
|
|
const response = await contentGenerator.generateContent(
|
|
{
|
|
model: visualModel,
|
|
config: {
|
|
temperature: 0,
|
|
topP: 0.95,
|
|
systemInstruction: VISUAL_SYSTEM_PROMPT,
|
|
abortSignal: signal,
|
|
},
|
|
contents: [
|
|
{
|
|
role: 'user',
|
|
parts: [
|
|
{
|
|
text: `Analyze this screenshot and respond to the following instruction:\n\n${instruction}`,
|
|
},
|
|
{
|
|
inlineData: {
|
|
mimeType,
|
|
data: screenshotBase64,
|
|
},
|
|
},
|
|
],
|
|
},
|
|
],
|
|
},
|
|
'visual-analysis',
|
|
LlmRole.UTILITY_TOOL,
|
|
);
|
|
|
|
// Extract text from response
|
|
const responseText =
|
|
response.candidates?.[0]?.content?.parts
|
|
?.filter((p) => p.text)
|
|
.map((p) => p.text)
|
|
.join('\n') ?? '';
|
|
|
|
if (!responseText) {
|
|
return {
|
|
llmContent:
|
|
'Visual model returned no analysis. Use accessibility tree elements instead.',
|
|
returnDisplay: 'Visual analysis returned empty response',
|
|
error: { message: 'Empty visual analysis response' },
|
|
};
|
|
}
|
|
|
|
debugLogger.log(`Visual analysis complete: ${responseText}`);
|
|
|
|
return {
|
|
llmContent: `Visual Analysis Result:\n${responseText}`,
|
|
returnDisplay: `Visual Analysis Result:\n${responseText}`,
|
|
};
|
|
} catch (error) {
|
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
debugLogger.error(`Visual analysis failed: ${errorMsg}`);
|
|
|
|
// Provide a graceful fallback message for model unavailability
|
|
const isModelError =
|
|
errorMsg.includes('404') ||
|
|
errorMsg.includes('403') ||
|
|
errorMsg.includes('not found') ||
|
|
errorMsg.includes('permission');
|
|
|
|
const fallbackMsg = isModelError
|
|
? 'Visual analysis model is not available. Use accessibility tree elements (uids from take_snapshot) for all interactions instead.'
|
|
: `Visual analysis failed: ${errorMsg}. Use accessibility tree elements instead.`;
|
|
|
|
return {
|
|
llmContent: fallbackMsg,
|
|
returnDisplay: fallbackMsg,
|
|
error: { message: errorMsg },
|
|
};
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* DeclarativeTool for screenshot-based visual analysis.
|
|
*/
|
|
class AnalyzeScreenshotTool extends DeclarativeTool<
|
|
Record<string, unknown>,
|
|
ToolResult
|
|
> {
|
|
constructor(
|
|
private readonly browserManager: BrowserManager,
|
|
private readonly config: Config,
|
|
messageBus: MessageBus,
|
|
) {
|
|
super(
|
|
'analyze_screenshot',
|
|
'analyze_screenshot',
|
|
'Analyze the current page visually using a screenshot. Use when you need to identify elements by visual attributes (color, layout, position) not available in the accessibility tree, or when you need precise pixel coordinates for click_at. Returns visual analysis — you perform the actions yourself.',
|
|
Kind.Other,
|
|
{
|
|
type: 'object',
|
|
properties: {
|
|
instruction: {
|
|
type: 'string',
|
|
description:
|
|
'What to identify or analyze visually (e.g., "Find the coordinates of the blue submit button", "What is the layout of the navigation menu?").',
|
|
},
|
|
},
|
|
required: ['instruction'],
|
|
},
|
|
messageBus,
|
|
true, // isOutputMarkdown
|
|
false, // canUpdateOutput
|
|
);
|
|
}
|
|
|
|
build(
|
|
params: Record<string, unknown>,
|
|
): ToolInvocation<Record<string, unknown>, ToolResult> {
|
|
return new AnalyzeScreenshotInvocation(
|
|
this.browserManager,
|
|
this.config,
|
|
params,
|
|
this.messageBus,
|
|
);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Creates the analyze_screenshot tool for the browser agent.
|
|
*/
|
|
export function createAnalyzeScreenshotTool(
|
|
browserManager: BrowserManager,
|
|
config: Config,
|
|
messageBus: MessageBus,
|
|
): AnalyzeScreenshotTool {
|
|
return new AnalyzeScreenshotTool(browserManager, config, messageBus);
|
|
}
|