mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-25 12:34:38 -07:00
fix(browser): handle computer-use model detection for analyze_screenshot (#24502)
This commit is contained in:
@@ -1244,7 +1244,8 @@ their corresponding top-level category object in your `settings.json` file.
|
|||||||
- **Requires restart:** Yes
|
- **Requires restart:** Yes
|
||||||
|
|
||||||
- **`agents.browser.visualModel`** (string):
|
- **`agents.browser.visualModel`** (string):
|
||||||
- **Description:** Model override for the visual agent.
|
- **Description:** Model for the visual agent's analyze_screenshot tool. When
|
||||||
|
set, enables the tool.
|
||||||
- **Default:** `undefined`
|
- **Default:** `undefined`
|
||||||
- **Requires restart:** Yes
|
- **Requires restart:** Yes
|
||||||
|
|
||||||
|
|||||||
@@ -1202,7 +1202,8 @@ const SETTINGS_SCHEMA = {
|
|||||||
category: 'Advanced',
|
category: 'Advanced',
|
||||||
requiresRestart: true,
|
requiresRestart: true,
|
||||||
default: undefined as string | undefined,
|
default: undefined as string | undefined,
|
||||||
description: 'Model override for the visual agent.',
|
description:
|
||||||
|
"Model for the visual agent's analyze_screenshot tool. When set, enables the tool.",
|
||||||
showInDialog: false,
|
showInDialog: false,
|
||||||
},
|
},
|
||||||
allowedDomains: {
|
allowedDomains: {
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import { createAnalyzeScreenshotTool } from './analyzeScreenshot.js';
|
|||||||
import type { BrowserManager, McpToolCallResult } from './browserManager.js';
|
import type { BrowserManager, McpToolCallResult } from './browserManager.js';
|
||||||
import type { Config } from '../../config/config.js';
|
import type { Config } from '../../config/config.js';
|
||||||
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
|
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
|
||||||
|
import { Environment } from '@google/genai';
|
||||||
|
|
||||||
const mockMessageBus = {
|
const mockMessageBus = {
|
||||||
waitForConfirmation: vi.fn().mockResolvedValue({ approved: true }),
|
waitForConfirmation: vi.fn().mockResolvedValue({ approved: true }),
|
||||||
@@ -36,6 +37,7 @@ function createMockBrowserManager(
|
|||||||
function createMockConfig(
|
function createMockConfig(
|
||||||
generateContentResult?: unknown,
|
generateContentResult?: unknown,
|
||||||
generateContentError?: Error,
|
generateContentError?: Error,
|
||||||
|
modelName: string = 'gemini-2.5-computer-use-preview-10-2025',
|
||||||
): Config {
|
): Config {
|
||||||
const generateContent = generateContentError
|
const generateContent = generateContentError
|
||||||
? vi.fn().mockRejectedValue(generateContentError)
|
? vi.fn().mockRejectedValue(generateContentError)
|
||||||
@@ -57,7 +59,7 @@ function createMockConfig(
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
getBrowserAgentConfig: vi.fn().mockReturnValue({
|
getBrowserAgentConfig: vi.fn().mockReturnValue({
|
||||||
customConfig: { visualModel: 'test-visual-model' },
|
customConfig: { visualModel: modelName },
|
||||||
}),
|
}),
|
||||||
getContentGenerator: vi.fn().mockReturnValue({
|
getContentGenerator: vi.fn().mockReturnValue({
|
||||||
generateContent,
|
generateContent,
|
||||||
@@ -109,7 +111,22 @@ describe('analyzeScreenshot', () => {
|
|||||||
const contentGenerator = config.getContentGenerator();
|
const contentGenerator = config.getContentGenerator();
|
||||||
expect(contentGenerator.generateContent).toHaveBeenCalledWith(
|
expect(contentGenerator.generateContent).toHaveBeenCalledWith(
|
||||||
expect.objectContaining({
|
expect.objectContaining({
|
||||||
model: 'test-visual-model',
|
model: 'gemini-2.5-computer-use-preview-10-2025',
|
||||||
|
config: expect.objectContaining({
|
||||||
|
tools: [
|
||||||
|
{
|
||||||
|
computerUse: {
|
||||||
|
environment: Environment.ENVIRONMENT_BROWSER,
|
||||||
|
excludedPredefinedFunctions: [
|
||||||
|
'open_web_browser',
|
||||||
|
'click_at',
|
||||||
|
'key_combination',
|
||||||
|
'drag_and_drop',
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}),
|
||||||
contents: expect.arrayContaining([
|
contents: expect.arrayContaining([
|
||||||
expect.objectContaining({
|
expect.objectContaining({
|
||||||
role: 'user',
|
role: 'user',
|
||||||
@@ -136,6 +153,33 @@ describe('analyzeScreenshot', () => {
|
|||||||
expect(result.error).toBeUndefined();
|
expect(result.error).toBeUndefined();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('omits computerUse tools for non-computer-use models', async () => {
|
||||||
|
const browserManager = createMockBrowserManager();
|
||||||
|
const config = createMockConfig(undefined, undefined, 'gemini-2.0-flash');
|
||||||
|
const tool = createAnalyzeScreenshotTool(
|
||||||
|
browserManager,
|
||||||
|
config,
|
||||||
|
mockMessageBus,
|
||||||
|
);
|
||||||
|
|
||||||
|
const invocation = tool.build({
|
||||||
|
instruction: 'Find the search bar',
|
||||||
|
});
|
||||||
|
await invocation.execute(new AbortController().signal);
|
||||||
|
|
||||||
|
const contentGenerator = config.getContentGenerator();
|
||||||
|
expect(contentGenerator.generateContent).toHaveBeenCalledWith(
|
||||||
|
expect.objectContaining({
|
||||||
|
model: 'gemini-2.0-flash',
|
||||||
|
config: expect.not.objectContaining({
|
||||||
|
tools: expect.anything(),
|
||||||
|
}),
|
||||||
|
}),
|
||||||
|
'visual-analysis',
|
||||||
|
'utility_tool',
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
it('returns an error when screenshot capture fails (no image)', async () => {
|
it('returns an error when screenshot capture fails (no image)', async () => {
|
||||||
const browserManager = createMockBrowserManager({
|
const browserManager = createMockBrowserManager({
|
||||||
content: [{ type: 'text', text: 'No screenshot available' }],
|
content: [{ type: 'text', text: 'No screenshot available' }],
|
||||||
|
|||||||
@@ -24,10 +24,14 @@ import {
|
|||||||
type ToolResult,
|
type ToolResult,
|
||||||
type ToolInvocation,
|
type ToolInvocation,
|
||||||
} from '../../tools/tools.js';
|
} from '../../tools/tools.js';
|
||||||
|
import { Environment } from '@google/genai';
|
||||||
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
|
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
|
||||||
import type { BrowserManager } from './browserManager.js';
|
import type { BrowserManager } from './browserManager.js';
|
||||||
import type { Config } from '../../config/config.js';
|
import type { Config } from '../../config/config.js';
|
||||||
import { getVisualAgentModel } from './modelAvailability.js';
|
import {
|
||||||
|
getVisualAgentModel,
|
||||||
|
isComputerUseModel,
|
||||||
|
} from './modelAvailability.js';
|
||||||
import { debugLogger } from '../../utils/debugLogger.js';
|
import { debugLogger } from '../../utils/debugLogger.js';
|
||||||
import { LlmRole } from '../../telemetry/llmRole.js';
|
import { LlmRole } from '../../telemetry/llmRole.js';
|
||||||
|
|
||||||
@@ -116,6 +120,27 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
|
|||||||
const visualModel = getVisualAgentModel(this.config);
|
const visualModel = getVisualAgentModel(this.config);
|
||||||
const contentGenerator = this.config.getContentGenerator();
|
const contentGenerator = this.config.getContentGenerator();
|
||||||
|
|
||||||
|
// Computer-use models require the computerUse tool declaration in every
|
||||||
|
// request. We exclude all predefined action functions so the model
|
||||||
|
// provides text analysis rather than issuing actions.
|
||||||
|
// Non-computer-use models (e.g., gemini-2.0-flash) do plain text
|
||||||
|
// analysis natively and don't need this declaration.
|
||||||
|
const tools = isComputerUseModel(visualModel)
|
||||||
|
? [
|
||||||
|
{
|
||||||
|
computerUse: {
|
||||||
|
environment: Environment.ENVIRONMENT_BROWSER,
|
||||||
|
excludedPredefinedFunctions: [
|
||||||
|
'open_web_browser',
|
||||||
|
'click_at',
|
||||||
|
'key_combination',
|
||||||
|
'drag_and_drop',
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
: undefined;
|
||||||
|
|
||||||
const response = await contentGenerator.generateContent(
|
const response = await contentGenerator.generateContent(
|
||||||
{
|
{
|
||||||
model: visualModel,
|
model: visualModel,
|
||||||
@@ -124,6 +149,7 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
|
|||||||
topP: 0.95,
|
topP: 0.95,
|
||||||
systemInstruction: VISUAL_SYSTEM_PROMPT,
|
systemInstruction: VISUAL_SYSTEM_PROMPT,
|
||||||
abortSignal: signal,
|
abortSignal: signal,
|
||||||
|
...(tools ? { tools } : {}),
|
||||||
},
|
},
|
||||||
contents: [
|
contents: [
|
||||||
{
|
{
|
||||||
@@ -146,12 +172,22 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
|
|||||||
LlmRole.UTILITY_TOOL,
|
LlmRole.UTILITY_TOOL,
|
||||||
);
|
);
|
||||||
|
|
||||||
// Extract text from response
|
// Extract response content. Computer-use models may still return
|
||||||
const responseText =
|
// functionCall parts even with exclusions, so we handle both text
|
||||||
response.candidates?.[0]?.content?.parts
|
// and functionCall parts defensively.
|
||||||
?.filter((p) => p.text)
|
const parts = response.candidates?.[0]?.content?.parts ?? [];
|
||||||
.map((p) => p.text)
|
|
||||||
.join('\n') ?? '';
|
const textParts = parts.filter((p) => p.text).map((p) => p.text!);
|
||||||
|
|
||||||
|
const functionCallParts = parts
|
||||||
|
.filter((p) => p.functionCall)
|
||||||
|
.map((p) => {
|
||||||
|
const fc = p.functionCall!;
|
||||||
|
const argsStr = fc.args ? JSON.stringify(fc.args) : '';
|
||||||
|
return `Action: ${fc.name}${argsStr ? ` with args ${argsStr}` : ''}`;
|
||||||
|
});
|
||||||
|
|
||||||
|
const responseText = [...textParts, ...functionCallParts].join('\n');
|
||||||
|
|
||||||
if (!responseText) {
|
if (!responseText) {
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -19,6 +19,20 @@ import { debugLogger } from '../../utils/debugLogger.js';
|
|||||||
*/
|
*/
|
||||||
export const VISUAL_AGENT_MODEL = 'gemini-2.5-computer-use-preview-10-2025';
|
export const VISUAL_AGENT_MODEL = 'gemini-2.5-computer-use-preview-10-2025';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Pattern matching the gemini computer-use model family.
|
||||||
|
* These models require a computerUse tool declaration in every request.
|
||||||
|
*/
|
||||||
|
const COMPUTER_USE_MODEL_PATTERN = /^gemini-.*-computer-use-/;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if the model name belongs to the computer-use family
|
||||||
|
* (matches gemini-*-computer-use-*).
|
||||||
|
*/
|
||||||
|
export function isComputerUseModel(model: string): boolean {
|
||||||
|
return COMPUTER_USE_MODEL_PATTERN.test(model);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the visual agent model from config, falling back to default.
|
* Gets the visual agent model from config, falling back to default.
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -377,7 +377,7 @@ export interface BrowserAgentCustomConfig {
|
|||||||
headless?: boolean;
|
headless?: boolean;
|
||||||
/** Path to Chrome profile directory for session persistence. */
|
/** Path to Chrome profile directory for session persistence. */
|
||||||
profilePath?: string;
|
profilePath?: string;
|
||||||
/** Model override for the visual agent. */
|
/** Model for the visual agent's analyze_screenshot tool. When set, enables the tool. */
|
||||||
visualModel?: string;
|
visualModel?: string;
|
||||||
/** List of allowed domains for the browser agent (e.g., ["github.com", "*.google.com"]). */
|
/** List of allowed domains for the browser agent (e.g., ["github.com", "*.google.com"]). */
|
||||||
allowedDomains?: string[];
|
allowedDomains?: string[];
|
||||||
|
|||||||
@@ -2205,8 +2205,8 @@
|
|||||||
},
|
},
|
||||||
"visualModel": {
|
"visualModel": {
|
||||||
"title": "Browser Visual Model",
|
"title": "Browser Visual Model",
|
||||||
"description": "Model override for the visual agent.",
|
"description": "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.",
|
||||||
"markdownDescription": "Model override for the visual agent.\n\n- Category: `Advanced`\n- Requires restart: `yes`",
|
"markdownDescription": "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.\n\n- Category: `Advanced`\n- Requires restart: `yes`",
|
||||||
"type": "string"
|
"type": "string"
|
||||||
},
|
},
|
||||||
"allowedDomains": {
|
"allowedDomains": {
|
||||||
|
|||||||
Reference in New Issue
Block a user