From beff8c91aa48d6f0d080debe7a682d46a0016cf7 Mon Sep 17 00:00:00 2001
From: Gaurav <39389231+gsquared94@users.noreply.github.com>
Date: Thu, 2 Apr 2026 23:05:21 +0800
Subject: [PATCH] fix(browser): handle computer-use model detection for
 analyze_screenshot (#24502)

---
 docs/reference/configuration.md               |  3 +-
 packages/cli/src/config/settingsSchema.ts     |  3 +-
 .../agents/browser/analyzeScreenshot.test.ts  | 48 +++++++++++++++++-
 .../src/agents/browser/analyzeScreenshot.ts   | 50 ++++++++++++++++---
 .../src/agents/browser/modelAvailability.ts   | 14 ++++++
 packages/core/src/config/config.ts            |  2 +-
 schemas/settings.schema.json                  |  4 +-
 7 files changed, 110 insertions(+), 14 deletions(-)

diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md
index 279e71205a..15ea47c82e 100644
--- a/docs/reference/configuration.md
+++ b/docs/reference/configuration.md
@@ -1244,7 +1244,8 @@ their corresponding top-level category object in your `settings.json` file.
   - **Requires restart:** Yes
 
 - **`agents.browser.visualModel`** (string):
-  - **Description:** Model override for the visual agent.
+  - **Description:** Model for the visual agent's analyze_screenshot tool. When
+    set, enables the tool.
   - **Default:** `undefined`
   - **Requires restart:** Yes
 
diff --git a/packages/cli/src/config/settingsSchema.ts b/packages/cli/src/config/settingsSchema.ts
index 03f0a774ba..04f9ff5724 100644
--- a/packages/cli/src/config/settingsSchema.ts
+++ b/packages/cli/src/config/settingsSchema.ts
@@ -1202,7 +1202,8 @@ const SETTINGS_SCHEMA = {
             category: 'Advanced',
             requiresRestart: true,
             default: undefined as string | undefined,
-            description: 'Model override for the visual agent.',
+            description:
+              "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.",
             showInDialog: false,
           },
           allowedDomains: {
diff --git a/packages/core/src/agents/browser/analyzeScreenshot.test.ts b/packages/core/src/agents/browser/analyzeScreenshot.test.ts
index 71e082b75d..b37bd3666e 100644
--- a/packages/core/src/agents/browser/analyzeScreenshot.test.ts
+++ b/packages/core/src/agents/browser/analyzeScreenshot.test.ts
@@ -9,6 +9,7 @@ import { createAnalyzeScreenshotTool } from './analyzeScreenshot.js';
 import type { BrowserManager, McpToolCallResult } from './browserManager.js';
 import type { Config } from '../../config/config.js';
 import type { MessageBus } from '../../confirmation-bus/message-bus.js';
+import { Environment } from '@google/genai';
 
 const mockMessageBus = {
   waitForConfirmation: vi.fn().mockResolvedValue({ approved: true }),
@@ -36,6 +37,7 @@ function createMockBrowserManager(
 function createMockConfig(
   generateContentResult?: unknown,
   generateContentError?: Error,
+  modelName: string = 'gemini-2.5-computer-use-preview-10-2025',
 ): Config {
   const generateContent = generateContentError
     ? vi.fn().mockRejectedValue(generateContentError)
@@ -57,7 +59,7 @@ function createMockConfig(
 
   return {
     getBrowserAgentConfig: vi.fn().mockReturnValue({
-      customConfig: { visualModel: 'test-visual-model' },
+      customConfig: { visualModel: modelName },
     }),
     getContentGenerator: vi.fn().mockReturnValue({
       generateContent,
@@ -109,7 +111,22 @@ describe('analyzeScreenshot', () => {
       const contentGenerator = config.getContentGenerator();
       expect(contentGenerator.generateContent).toHaveBeenCalledWith(
         expect.objectContaining({
-          model: 'test-visual-model',
+          model: 'gemini-2.5-computer-use-preview-10-2025',
+          config: expect.objectContaining({
+            tools: [
+              {
+                computerUse: {
+                  environment: Environment.ENVIRONMENT_BROWSER,
+                  excludedPredefinedFunctions: [
+                    'open_web_browser',
+                    'click_at',
+                    'key_combination',
+                    'drag_and_drop',
+                  ],
+                },
+              },
+            ],
+          }),
           contents: expect.arrayContaining([
             expect.objectContaining({
               role: 'user',
@@ -136,6 +153,33 @@ describe('analyzeScreenshot', () => {
       expect(result.error).toBeUndefined();
     });
 
+    it('omits computerUse tools for non-computer-use models', async () => {
+      const browserManager = createMockBrowserManager();
+      const config = createMockConfig(undefined, undefined, 'gemini-2.0-flash');
+      const tool = createAnalyzeScreenshotTool(
+        browserManager,
+        config,
+        mockMessageBus,
+      );
+
+      const invocation = tool.build({
+        instruction: 'Find the search bar',
+      });
+      await invocation.execute(new AbortController().signal);
+
+      const contentGenerator = config.getContentGenerator();
+      expect(contentGenerator.generateContent).toHaveBeenCalledWith(
+        expect.objectContaining({
+          model: 'gemini-2.0-flash',
+          config: expect.not.objectContaining({
+            tools: expect.anything(),
+          }),
+        }),
+        'visual-analysis',
+        'utility_tool',
+      );
+    });
+
     it('returns an error when screenshot capture fails (no image)', async () => {
       const browserManager = createMockBrowserManager({
         content: [{ type: 'text', text: 'No screenshot available' }],
diff --git a/packages/core/src/agents/browser/analyzeScreenshot.ts b/packages/core/src/agents/browser/analyzeScreenshot.ts
index c269b71bfb..91fd5d66d6 100644
--- a/packages/core/src/agents/browser/analyzeScreenshot.ts
+++ b/packages/core/src/agents/browser/analyzeScreenshot.ts
@@ -24,10 +24,14 @@ import {
   type ToolResult,
   type ToolInvocation,
 } from '../../tools/tools.js';
+import { Environment } from '@google/genai';
 import type { MessageBus } from '../../confirmation-bus/message-bus.js';
 import type { BrowserManager } from './browserManager.js';
 import type { Config } from '../../config/config.js';
-import { getVisualAgentModel } from './modelAvailability.js';
+import {
+  getVisualAgentModel,
+  isComputerUseModel,
+} from './modelAvailability.js';
 import { debugLogger } from '../../utils/debugLogger.js';
 import { LlmRole } from '../../telemetry/llmRole.js';
 
@@ -116,6 +120,27 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
       const visualModel = getVisualAgentModel(this.config);
       const contentGenerator = this.config.getContentGenerator();
 
+      // Computer-use models require the computerUse tool declaration in every
+      // request. We exclude the predefined action functions listed below to
+      // steer the model toward text analysis rather than issuing actions.
+      // Non-computer-use models (e.g., gemini-2.0-flash) do plain text
+      // analysis natively and don't need this declaration.
+      const tools = isComputerUseModel(visualModel)
+        ? [
+            {
+              computerUse: {
+                environment: Environment.ENVIRONMENT_BROWSER,
+                excludedPredefinedFunctions: [
+                  'open_web_browser',
+                  'click_at',
+                  'key_combination',
+                  'drag_and_drop',
+                ],
+              },
+            },
+          ]
+        : undefined;
+
       const response = await contentGenerator.generateContent(
         {
           model: visualModel,
@@ -124,6 +149,7 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
             topP: 0.95,
             systemInstruction: VISUAL_SYSTEM_PROMPT,
             abortSignal: signal,
+            ...(tools ? { tools } : {}),
           },
           contents: [
             {
@@ -146,12 +172,22 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
         LlmRole.UTILITY_TOOL,
       );
 
-      // Extract text from response
-      const responseText =
-        response.candidates?.[0]?.content?.parts
-          ?.filter((p) => p.text)
-          .map((p) => p.text)
-          .join('\n') ?? '';
+      // Extract response content. Computer-use models may still return
+      // functionCall parts even with exclusions, so we handle both text
+      // and functionCall parts defensively.
+      const parts = response.candidates?.[0]?.content?.parts ?? [];
+
+      const textParts = parts.filter((p) => p.text).map((p) => p.text!);
+
+      const functionCallParts = parts
+        .filter((p) => p.functionCall)
+        .map((p) => {
+          const fc = p.functionCall!;
+          const argsStr = fc.args ? JSON.stringify(fc.args) : '';
+          return `Action: ${fc.name}${argsStr ? ` with args ${argsStr}` : ''}`;
+        });
+
+      const responseText = [...textParts, ...functionCallParts].join('\n');
 
       if (!responseText) {
         return {
diff --git a/packages/core/src/agents/browser/modelAvailability.ts b/packages/core/src/agents/browser/modelAvailability.ts
index 358d498aa4..3f3bc1e280 100644
--- a/packages/core/src/agents/browser/modelAvailability.ts
+++ b/packages/core/src/agents/browser/modelAvailability.ts
@@ -19,6 +19,20 @@ import { debugLogger } from '../../utils/debugLogger.js';
  */
 export const VISUAL_AGENT_MODEL = 'gemini-2.5-computer-use-preview-10-2025';
 
+/**
+ * Pattern matching the gemini computer-use model family.
+ * These models require a computerUse tool declaration in every request.
+ */
+const COMPUTER_USE_MODEL_PATTERN = /^gemini-.*-computer-use-/;
+
+/**
+ * Returns true if the model name starts with `gemini-` and then contains
+ * `-computer-use-` (so a name ending in `computer-use` does not match).
+ */
+export function isComputerUseModel(model: string): boolean {
+  return COMPUTER_USE_MODEL_PATTERN.test(model);
+}
+
 /**
  * Gets the visual agent model from config, falling back to default.
  *
diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts
index d203e047b4..c58c0de7f5 100644
--- a/packages/core/src/config/config.ts
+++ b/packages/core/src/config/config.ts
@@ -377,7 +377,7 @@ export interface BrowserAgentCustomConfig {
   headless?: boolean;
   /** Path to Chrome profile directory for session persistence. */
   profilePath?: string;
-  /** Model override for the visual agent. */
+  /** Model for the visual agent's analyze_screenshot tool. When set, enables the tool. */
   visualModel?: string;
   /** List of allowed domains for the browser agent (e.g., ["github.com", "*.google.com"]). */
   allowedDomains?: string[];
diff --git a/schemas/settings.schema.json b/schemas/settings.schema.json
index 43e1609b0f..a675defc06 100644
--- a/schemas/settings.schema.json
+++ b/schemas/settings.schema.json
@@ -2205,8 +2205,8 @@
             },
             "visualModel": {
               "title": "Browser Visual Model",
-              "description": "Model override for the visual agent.",
-              "markdownDescription": "Model override for the visual agent.\n\n- Category: `Advanced`\n- Requires restart: `yes`",
+              "description": "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.",
+              "markdownDescription": "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.\n\n- Category: `Advanced`\n- Requires restart: `yes`",
               "type": "string"
             },
             "allowedDomains": {