From beff8c91aa48d6f0d080debe7a682d46a0016cf7 Mon Sep 17 00:00:00 2001 From: Gaurav <39389231+gsquared94@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:05:21 +0800 Subject: [PATCH] fix(browser): handle computer-use model detection for analyze_screenshot (#24502) --- docs/reference/configuration.md | 3 +- packages/cli/src/config/settingsSchema.ts | 3 +- .../agents/browser/analyzeScreenshot.test.ts | 48 +++++++++++++++++- .../src/agents/browser/analyzeScreenshot.ts | 50 ++++++++++++++++--- .../src/agents/browser/modelAvailability.ts | 14 ++++++ packages/core/src/config/config.ts | 2 +- schemas/settings.schema.json | 4 +- 7 files changed, 110 insertions(+), 14 deletions(-) diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 279e71205a..15ea47c82e 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -1244,7 +1244,8 @@ their corresponding top-level category object in your `settings.json` file. - **Requires restart:** Yes - **`agents.browser.visualModel`** (string): - - **Description:** Model override for the visual agent. + - **Description:** Model for the visual agent's analyze_screenshot tool. When + set, enables the tool. - **Default:** `undefined` - **Requires restart:** Yes diff --git a/packages/cli/src/config/settingsSchema.ts b/packages/cli/src/config/settingsSchema.ts index 03f0a774ba..04f9ff5724 100644 --- a/packages/cli/src/config/settingsSchema.ts +++ b/packages/cli/src/config/settingsSchema.ts @@ -1202,7 +1202,8 @@ const SETTINGS_SCHEMA = { category: 'Advanced', requiresRestart: true, default: undefined as string | undefined, - description: 'Model override for the visual agent.', + description: + "Model for the visual agent's analyze_screenshot tool. 
When set, enables the tool.", showInDialog: false, }, allowedDomains: { diff --git a/packages/core/src/agents/browser/analyzeScreenshot.test.ts b/packages/core/src/agents/browser/analyzeScreenshot.test.ts index 71e082b75d..b37bd3666e 100644 --- a/packages/core/src/agents/browser/analyzeScreenshot.test.ts +++ b/packages/core/src/agents/browser/analyzeScreenshot.test.ts @@ -9,6 +9,7 @@ import { createAnalyzeScreenshotTool } from './analyzeScreenshot.js'; import type { BrowserManager, McpToolCallResult } from './browserManager.js'; import type { Config } from '../../config/config.js'; import type { MessageBus } from '../../confirmation-bus/message-bus.js'; +import { Environment } from '@google/genai'; const mockMessageBus = { waitForConfirmation: vi.fn().mockResolvedValue({ approved: true }), @@ -36,6 +37,7 @@ function createMockBrowserManager( function createMockConfig( generateContentResult?: unknown, generateContentError?: Error, + modelName: string = 'gemini-2.5-computer-use-preview-10-2025', ): Config { const generateContent = generateContentError ? 
vi.fn().mockRejectedValue(generateContentError) @@ -57,7 +59,7 @@ function createMockConfig( return { getBrowserAgentConfig: vi.fn().mockReturnValue({ - customConfig: { visualModel: 'test-visual-model' }, + customConfig: { visualModel: modelName }, }), getContentGenerator: vi.fn().mockReturnValue({ generateContent, @@ -109,7 +111,22 @@ describe('analyzeScreenshot', () => { const contentGenerator = config.getContentGenerator(); expect(contentGenerator.generateContent).toHaveBeenCalledWith( expect.objectContaining({ - model: 'test-visual-model', + model: 'gemini-2.5-computer-use-preview-10-2025', + config: expect.objectContaining({ + tools: [ + { + computerUse: { + environment: Environment.ENVIRONMENT_BROWSER, + excludedPredefinedFunctions: [ + 'open_web_browser', + 'click_at', + 'key_combination', + 'drag_and_drop', + ], + }, + }, + ], + }), contents: expect.arrayContaining([ expect.objectContaining({ role: 'user', @@ -136,6 +153,33 @@ describe('analyzeScreenshot', () => { expect(result.error).toBeUndefined(); }); + it('omits computerUse tools for non-computer-use models', async () => { + const browserManager = createMockBrowserManager(); + const config = createMockConfig(undefined, undefined, 'gemini-2.0-flash'); + const tool = createAnalyzeScreenshotTool( + browserManager, + config, + mockMessageBus, + ); + + const invocation = tool.build({ + instruction: 'Find the search bar', + }); + await invocation.execute(new AbortController().signal); + + const contentGenerator = config.getContentGenerator(); + expect(contentGenerator.generateContent).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'gemini-2.0-flash', + config: expect.not.objectContaining({ + tools: expect.anything(), + }), + }), + 'visual-analysis', + 'utility_tool', + ); + }); + it('returns an error when screenshot capture fails (no image)', async () => { const browserManager = createMockBrowserManager({ content: [{ type: 'text', text: 'No screenshot available' }], diff --git 
a/packages/core/src/agents/browser/analyzeScreenshot.ts b/packages/core/src/agents/browser/analyzeScreenshot.ts index c269b71bfb..91fd5d66d6 100644 --- a/packages/core/src/agents/browser/analyzeScreenshot.ts +++ b/packages/core/src/agents/browser/analyzeScreenshot.ts @@ -24,10 +24,14 @@ import { type ToolResult, type ToolInvocation, } from '../../tools/tools.js'; +import { Environment } from '@google/genai'; import type { MessageBus } from '../../confirmation-bus/message-bus.js'; import type { BrowserManager } from './browserManager.js'; import type { Config } from '../../config/config.js'; -import { getVisualAgentModel } from './modelAvailability.js'; +import { + getVisualAgentModel, + isComputerUseModel, +} from './modelAvailability.js'; import { debugLogger } from '../../utils/debugLogger.js'; import { LlmRole } from '../../telemetry/llmRole.js'; @@ -116,6 +120,27 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation< const visualModel = getVisualAgentModel(this.config); const contentGenerator = this.config.getContentGenerator(); + // Computer-use models require the computerUse tool declaration in every + // request. We exclude the action functions listed below to steer the + // model toward text analysis rather than issuing actions. + // Non-computer-use models (e.g., gemini-2.0-flash) do plain text + // analysis natively and don't need this declaration. + const tools = isComputerUseModel(visualModel) + ? [ + { + computerUse: { + environment: Environment.ENVIRONMENT_BROWSER, + excludedPredefinedFunctions: [ + 'open_web_browser', + 'click_at', + 'key_combination', + 'drag_and_drop', + ], + }, + }, + ] + : undefined; + const response = await contentGenerator.generateContent( { model: visualModel, @@ -124,6 +149,7 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation< topP: 0.95, systemInstruction: VISUAL_SYSTEM_PROMPT, abortSignal: signal, + ...(tools ? 
{ tools } : {}), }, contents: [ { @@ -146,12 +172,22 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation< LlmRole.UTILITY_TOOL, ); - // Extract text from response - const responseText = - response.candidates?.[0]?.content?.parts - ?.filter((p) => p.text) - .map((p) => p.text) - .join('\n') ?? ''; + // Extract response content. Computer-use models may still return + // functionCall parts even with exclusions, so we handle both text + // and functionCall parts defensively. + const parts = response.candidates?.[0]?.content?.parts ?? []; + + const textParts = parts.filter((p) => p.text).map((p) => p.text!); + + const functionCallParts = parts + .filter((p) => p.functionCall) + .map((p) => { + const fc = p.functionCall!; + const argsStr = fc.args ? JSON.stringify(fc.args) : ''; + return `Action: ${fc.name}${argsStr ? ` with args ${argsStr}` : ''}`; + }); + + const responseText = [...textParts, ...functionCallParts].join('\n'); if (!responseText) { return { diff --git a/packages/core/src/agents/browser/modelAvailability.ts b/packages/core/src/agents/browser/modelAvailability.ts index 358d498aa4..3f3bc1e280 100644 --- a/packages/core/src/agents/browser/modelAvailability.ts +++ b/packages/core/src/agents/browser/modelAvailability.ts @@ -19,6 +19,20 @@ import { debugLogger } from '../../utils/debugLogger.js'; */ export const VISUAL_AGENT_MODEL = 'gemini-2.5-computer-use-preview-10-2025'; +/** + * Pattern matching the gemini computer-use model family. + * These models require a computerUse tool declaration in every request. + */ +const COMPUTER_USE_MODEL_PATTERN = /^gemini-.*-computer-use-/; + +/** + * Returns true if the model name belongs to the computer-use family + * (matches gemini-*-computer-use-*). + */ +export function isComputerUseModel(model: string): boolean { + return COMPUTER_USE_MODEL_PATTERN.test(model); +} + /** * Gets the visual agent model from config, falling back to default. 
* diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts index d203e047b4..c58c0de7f5 100644 --- a/packages/core/src/config/config.ts +++ b/packages/core/src/config/config.ts @@ -377,7 +377,7 @@ export interface BrowserAgentCustomConfig { headless?: boolean; /** Path to Chrome profile directory for session persistence. */ profilePath?: string; - /** Model override for the visual agent. */ + /** Model for the visual agent's analyze_screenshot tool. When set, enables the tool. */ visualModel?: string; /** List of allowed domains for the browser agent (e.g., ["github.com", "*.google.com"]). */ allowedDomains?: string[]; diff --git a/schemas/settings.schema.json b/schemas/settings.schema.json index 43e1609b0f..a675defc06 100644 --- a/schemas/settings.schema.json +++ b/schemas/settings.schema.json @@ -2205,8 +2205,8 @@ }, "visualModel": { "title": "Browser Visual Model", - "description": "Model override for the visual agent.", - "markdownDescription": "Model override for the visual agent.\n\n- Category: `Advanced`\n- Requires restart: `yes`", + "description": "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.", + "markdownDescription": "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.\n\n- Category: `Advanced`\n- Requires restart: `yes`", "type": "string" }, "allowedDomains": {