fix(browser): handle computer-use model detection for analyze_screenshot (#24502)

This commit is contained in:
Gaurav
2026-04-02 23:05:21 +08:00
committed by GitHub
parent f510394721
commit beff8c91aa
7 changed files with 110 additions and 14 deletions

View File

@@ -9,6 +9,7 @@ import { createAnalyzeScreenshotTool } from './analyzeScreenshot.js';
import type { BrowserManager, McpToolCallResult } from './browserManager.js';
import type { Config } from '../../config/config.js';
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
import { Environment } from '@google/genai';
const mockMessageBus = {
waitForConfirmation: vi.fn().mockResolvedValue({ approved: true }),
@@ -36,6 +37,7 @@ function createMockBrowserManager(
function createMockConfig(
generateContentResult?: unknown,
generateContentError?: Error,
modelName: string = 'gemini-2.5-computer-use-preview-10-2025',
): Config {
const generateContent = generateContentError
? vi.fn().mockRejectedValue(generateContentError)
@@ -57,7 +59,7 @@ function createMockConfig(
return {
getBrowserAgentConfig: vi.fn().mockReturnValue({
customConfig: { visualModel: 'test-visual-model' },
customConfig: { visualModel: modelName },
}),
getContentGenerator: vi.fn().mockReturnValue({
generateContent,
@@ -109,7 +111,22 @@ describe('analyzeScreenshot', () => {
const contentGenerator = config.getContentGenerator();
expect(contentGenerator.generateContent).toHaveBeenCalledWith(
expect.objectContaining({
model: 'test-visual-model',
model: 'gemini-2.5-computer-use-preview-10-2025',
config: expect.objectContaining({
tools: [
{
computerUse: {
environment: Environment.ENVIRONMENT_BROWSER,
excludedPredefinedFunctions: [
'open_web_browser',
'click_at',
'key_combination',
'drag_and_drop',
],
},
},
],
}),
contents: expect.arrayContaining([
expect.objectContaining({
role: 'user',
@@ -136,6 +153,33 @@ describe('analyzeScreenshot', () => {
expect(result.error).toBeUndefined();
});
it('omits computerUse tools for non-computer-use models', async () => {
  // Configure a visual model outside the computer-use family and run the
  // tool once; the outgoing request is then inspected below.
  const plainModel = 'gemini-2.0-flash';
  const manager = createMockBrowserManager();
  const mockConfig = createMockConfig(undefined, undefined, plainModel);
  const screenshotTool = createAnalyzeScreenshotTool(
    manager,
    mockConfig,
    mockMessageBus,
  );

  const { signal } = new AbortController();
  await screenshotTool
    .build({
      instruction: 'Find the search bar',
    })
    .execute(signal);

  // The request must target the configured model and its config must not
  // contain a `tools` key of any value.
  const requestWithoutTools = expect.objectContaining({
    model: plainModel,
    config: expect.not.objectContaining({
      tools: expect.anything(),
    }),
  });
  const generator = mockConfig.getContentGenerator();
  expect(generator.generateContent).toHaveBeenCalledWith(
    requestWithoutTools,
    'visual-analysis',
    'utility_tool',
  );
});
it('returns an error when screenshot capture fails (no image)', async () => {
const browserManager = createMockBrowserManager({
content: [{ type: 'text', text: 'No screenshot available' }],

View File

@@ -24,10 +24,14 @@ import {
type ToolResult,
type ToolInvocation,
} from '../../tools/tools.js';
import { Environment } from '@google/genai';
import type { MessageBus } from '../../confirmation-bus/message-bus.js';
import type { BrowserManager } from './browserManager.js';
import type { Config } from '../../config/config.js';
import { getVisualAgentModel } from './modelAvailability.js';
import {
getVisualAgentModel,
isComputerUseModel,
} from './modelAvailability.js';
import { debugLogger } from '../../utils/debugLogger.js';
import { LlmRole } from '../../telemetry/llmRole.js';
@@ -116,6 +120,27 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
const visualModel = getVisualAgentModel(this.config);
const contentGenerator = this.config.getContentGenerator();
// Computer-use models require the computerUse tool declaration in every
// request. We exclude all predefined action functions so the model
// provides text analysis rather than issuing actions.
// Non-computer-use models (e.g., gemini-2.0-flash) do plain text
// analysis natively and don't need this declaration.
const tools = isComputerUseModel(visualModel)
? [
{
computerUse: {
environment: Environment.ENVIRONMENT_BROWSER,
excludedPredefinedFunctions: [
'open_web_browser',
'click_at',
'key_combination',
'drag_and_drop',
],
},
},
]
: undefined;
const response = await contentGenerator.generateContent(
{
model: visualModel,
@@ -124,6 +149,7 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
topP: 0.95,
systemInstruction: VISUAL_SYSTEM_PROMPT,
abortSignal: signal,
...(tools ? { tools } : {}),
},
contents: [
{
@@ -146,12 +172,22 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
LlmRole.UTILITY_TOOL,
);
// Extract text from response
const responseText =
response.candidates?.[0]?.content?.parts
?.filter((p) => p.text)
.map((p) => p.text)
.join('\n') ?? '';
// Extract response content. Computer-use models may still return
// functionCall parts even with exclusions, so we handle both text
// and functionCall parts defensively.
const parts = response.candidates?.[0]?.content?.parts ?? [];
const textParts = parts.filter((p) => p.text).map((p) => p.text!);
const functionCallParts = parts
.filter((p) => p.functionCall)
.map((p) => {
const fc = p.functionCall!;
const argsStr = fc.args ? JSON.stringify(fc.args) : '';
return `Action: ${fc.name}${argsStr ? ` with args ${argsStr}` : ''}`;
});
const responseText = [...textParts, ...functionCallParts].join('\n');
if (!responseText) {
return {

View File

@@ -19,6 +19,20 @@ import { debugLogger } from '../../utils/debugLogger.js';
*/
export const VISUAL_AGENT_MODEL = 'gemini-2.5-computer-use-preview-10-2025';
/**
 * Regular expression recognising names in the gemini computer-use model
 * family. Requests sent to these models must always carry a computerUse
 * tool declaration.
 */
const COMPUTER_USE_MODEL_PATTERN = /^gemini-.*-computer-use-/;

/**
 * Reports whether `model` is a member of the computer-use family, i.e. its
 * name has the form gemini-*-computer-use-*.
 *
 * @param model - Model name to classify.
 * @returns `true` when the name matches the computer-use pattern.
 */
export function isComputerUseModel(model: string): boolean {
  const familyMatch = COMPUTER_USE_MODEL_PATTERN.exec(model);
  return familyMatch !== null;
}
/**
* Gets the visual agent model from config, falling back to default.
*

View File

@@ -377,7 +377,7 @@ export interface BrowserAgentCustomConfig {
headless?: boolean;
/** Path to Chrome profile directory for session persistence. */
profilePath?: string;
/** Model override for the visual agent. */
/** Model for the visual agent's analyze_screenshot tool. When set, enables the tool. */
visualModel?: string;
/** List of allowed domains for the browser agent (e.g., ["github.com", "*.google.com"]). */
allowedDomains?: string[];