fix(browser): handle computer-use model detection for analyze_screenshot (#24502)

2026-04-25 12:34:38 -07:00 · 2026-04-02 23:05:21 +08:00
parent f510394721
commit beff8c91aa
7 changed files with 110 additions and 14 deletions
@@ -1244,7 +1244,8 @@ their corresponding top-level category object in your `settings.json` file.
  - **Requires restart:** Yes
 - **`agents.browser.visualModel`** (string):
-  - **Description:** Model override for the visual agent.
+  - **Description:** Model for the visual agent's analyze_screenshot tool. When
    set, enables the tool.
  - **Default:** `undefined`
  - **Requires restart:** Yes
@@ -1202,7 +1202,8 @@ const SETTINGS_SCHEMA = {
            category: 'Advanced',
            requiresRestart: true,
            default: undefined as string | undefined,
-            description: 'Model override for the visual agent.',
+            description:
              "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.",
            showInDialog: false,
          },
          allowedDomains: {
@@ -9,6 +9,7 @@ import { createAnalyzeScreenshotTool } from './analyzeScreenshot.js';
 import type { BrowserManager, McpToolCallResult } from './browserManager.js';
 import type { Config } from '../../config/config.js';
 import type { MessageBus } from '../../confirmation-bus/message-bus.js';
 import { Environment } from '@google/genai';
 const mockMessageBus = {
  waitForConfirmation: vi.fn().mockResolvedValue({ approved: true }),
@@ -36,6 +37,7 @@ function createMockBrowserManager(
 function createMockConfig(
  generateContentResult?: unknown,
  generateContentError?: Error,
  modelName: string = 'gemini-2.5-computer-use-preview-10-2025',
 ): Config {
  const generateContent = generateContentError
    ? vi.fn().mockRejectedValue(generateContentError)
@@ -57,7 +59,7 @@ function createMockConfig(
  return {
    getBrowserAgentConfig: vi.fn().mockReturnValue({
-      customConfig: { visualModel: 'test-visual-model' },
+      customConfig: { visualModel: modelName },
    }),
    getContentGenerator: vi.fn().mockReturnValue({
      generateContent,
@@ -109,7 +111,22 @@ describe('analyzeScreenshot', () => {
      const contentGenerator = config.getContentGenerator();
      expect(contentGenerator.generateContent).toHaveBeenCalledWith(
        expect.objectContaining({
-          model: 'test-visual-model',
+          model: 'gemini-2.5-computer-use-preview-10-2025',
          config: expect.objectContaining({
            tools: [
              {
                computerUse: {
                  environment: Environment.ENVIRONMENT_BROWSER,
                  excludedPredefinedFunctions: [
                    'open_web_browser',
                    'click_at',
                    'key_combination',
                    'drag_and_drop',
                  ],
                },
              },
            ],
          }),
          contents: expect.arrayContaining([
            expect.objectContaining({
              role: 'user',
@@ -136,6 +153,33 @@ describe('analyzeScreenshot', () => {
      expect(result.error).toBeUndefined();
    });
    it('omits computerUse tools for non-computer-use models', async () => {
      // Configure the visual model with a name outside the computer-use
      // family (the default mock model is a computer-use one).
      const manager = createMockBrowserManager();
      const flashConfig = createMockConfig(
        undefined,
        undefined,
        'gemini-2.0-flash',
      );
      const screenshotTool = createAnalyzeScreenshotTool(
        manager,
        flashConfig,
        mockMessageBus,
      );

      const { signal } = new AbortController();
      await screenshotTool
        .build({ instruction: 'Find the search bar' })
        .execute(signal);

      // The request must name the configured model, and its generation
      // config must not carry a defined `tools` value.
      const generator = flashConfig.getContentGenerator();
      const expectedRequest = expect.objectContaining({
        model: 'gemini-2.0-flash',
        config: expect.not.objectContaining({ tools: expect.anything() }),
      });
      expect(generator.generateContent).toHaveBeenCalledWith(
        expectedRequest,
        'visual-analysis',
        'utility_tool',
      );
    });
    it('returns an error when screenshot capture fails (no image)', async () => {
      const browserManager = createMockBrowserManager({
        content: [{ type: 'text', text: 'No screenshot available' }],
@@ -24,10 +24,14 @@ import {
  type ToolResult,
  type ToolInvocation,
 } from '../../tools/tools.js';
 import { Environment } from '@google/genai';
 import type { MessageBus } from '../../confirmation-bus/message-bus.js';
 import type { BrowserManager } from './browserManager.js';
 import type { Config } from '../../config/config.js';
-import { getVisualAgentModel } from './modelAvailability.js';
+import {
  getVisualAgentModel,
  isComputerUseModel,
 } from './modelAvailability.js';
 import { debugLogger } from '../../utils/debugLogger.js';
 import { LlmRole } from '../../telemetry/llmRole.js';
@@ -116,6 +120,27 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
      const visualModel = getVisualAgentModel(this.config);
      const contentGenerator = this.config.getContentGenerator();
      // Computer-use models require the computerUse tool declaration in every
      // request. We exclude all predefined action functions so the model
      // provides text analysis rather than issuing actions.
      // Non-computer-use models (e.g., gemini-2.0-flash) do plain text
      // analysis natively and don't need this declaration.
      const tools = isComputerUseModel(visualModel)
        ? [
            {
              computerUse: {
                environment: Environment.ENVIRONMENT_BROWSER,
                excludedPredefinedFunctions: [
                  'open_web_browser',
                  'click_at',
                  'key_combination',
                  'drag_and_drop',
                ],
              },
            },
          ]
        : undefined;
      const response = await contentGenerator.generateContent(
        {
          model: visualModel,
@@ -124,6 +149,7 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
            topP: 0.95,
            systemInstruction: VISUAL_SYSTEM_PROMPT,
            abortSignal: signal,
            ...(tools ? { tools } : {}),
          },
          contents: [
            {
@@ -146,12 +172,22 @@ class AnalyzeScreenshotInvocation extends BaseToolInvocation<
        LlmRole.UTILITY_TOOL,
      );
-      // Extract text from response
+      // Extract response content. Computer-use models may still return
-      const responseText =
+      // functionCall parts even with exclusions, so we handle both text
-        response.candidates?.[0]?.content?.parts
+      // and functionCall parts defensively.
-          ?.filter((p) => p.text)
+      const parts = response.candidates?.[0]?.content?.parts ?? [];
-          .map((p) => p.text)
+
-          .join('\n') ?? '';
+      const textParts = parts.filter((p) => p.text).map((p) => p.text!);
      const functionCallParts = parts
        .filter((p) => p.functionCall)
        .map((p) => {
          const fc = p.functionCall!;
          const argsStr = fc.args ? JSON.stringify(fc.args) : '';
          return `Action: ${fc.name}${argsStr ? ` with args ${argsStr}` : ''}`;
        });
      const responseText = [...textParts, ...functionCallParts].join('\n');
      if (!responseText) {
        return {
@@ -19,6 +19,20 @@ import { debugLogger } from '../../utils/debugLogger.js';
 */
 export const VISUAL_AGENT_MODEL = 'gemini-2.5-computer-use-preview-10-2025';
 /**
  * Regex for the Gemini computer-use model family: the name must start with
  * `gemini-` and later contain `-computer-use-`.
  */
 const COMPUTER_USE_MODEL_PATTERN = /^gemini-.*-computer-use-/;
 /**
  * Reports whether `model` names a computer-use model.
  * Such models require a computerUse tool declaration in every request.
  */
 export function isComputerUseModel(model: string): boolean {
   return COMPUTER_USE_MODEL_PATTERN.exec(model) !== null;
 }
 /**
 * Gets the visual agent model from config, falling back to default.
 *
@@ -377,7 +377,7 @@ export interface BrowserAgentCustomConfig {
  headless?: boolean;
  /** Path to Chrome profile directory for session persistence. */
  profilePath?: string;
-  /** Model override for the visual agent. */
+  /** Model for the visual agent's analyze_screenshot tool. When set, enables the tool. */
  visualModel?: string;
  /** List of allowed domains for the browser agent (e.g., ["github.com", "*.google.com"]). */
  allowedDomains?: string[];
@@ -2205,8 +2205,8 @@
            },
            "visualModel": {
              "title": "Browser Visual Model",
-              "description": "Model override for the visual agent.",
+              "description": "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.",
-              "markdownDescription": "Model override for the visual agent.\n\n- Category: `Advanced`\n- Requires restart: `yes`",
+              "markdownDescription": "Model for the visual agent's analyze_screenshot tool. When set, enables the tool.\n\n- Category: `Advanced`\n- Requires restart: `yes`",
              "type": "string"
            },
            "allowedDomains": {