mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-20 16:53:12 -07:00
feat(browser): add submitKey param to type_text and improve connection errors
- Add submitKey parameter to type_text tool for pressing Enter/Tab/etc after typing, eliminating a separate model round-trip per value entry - Update system prompt and tool hints to guide model toward type_text with submitKey instead of per-character press_key calls - Refactor connection error handling into createConnectionError() with session-mode-aware remediation messages for profile locks, timeouts, and generic failures - Update terminal failure prompts to pass through error remediation verbatim instead of hardcoding instructions - Add tests for profile-lock, timeout, and generic connection errors
This commit is contained in:
@@ -74,17 +74,14 @@ When you need to identify elements by visual attributes not in the AX tree (e.g.
|
||||
COMPLEX WEB APPS (spreadsheets, rich editors, canvas apps):
|
||||
Many web apps (Google Sheets/Docs, Notion, Figma, etc.) use custom rendering rather than standard HTML inputs.
|
||||
- fill does NOT work on these apps. Instead, click the target element, then use type_text to enter the value.
|
||||
- Navigate cells/fields using keyboard shortcuts (Tab, Enter, ArrowDown) — more reliable than clicking cell UIDs.
|
||||
- For spreadsheets: click a cell → type_text("value") → press_key("Enter") to confirm and move to the next cell.
|
||||
- type_text supports a submitKey parameter to press a key after typing (e.g., submitKey="Enter" to submit, submitKey="Tab" to move to the next field). This is much faster than separate press_key calls.
|
||||
- Navigate cells/fields using keyboard shortcuts (Tab, Enter, ArrowDown) — more reliable than clicking UIDs.
|
||||
- Use the Name Box (cell reference input, usually showing "A1") to jump to specific cells.
|
||||
|
||||
TERMINAL FAILURES — STOP IMMEDIATELY:
|
||||
Some errors are unrecoverable and retrying will never help. When you see ANY of these, call complete_task immediately with success=false and include the specific remediation steps in your summary:
|
||||
- "Could not connect to Chrome" — Include ALL of these instructions in your summary:
|
||||
1. Open Chrome (version 144+)
|
||||
2. Go to chrome://inspect/#remote-debugging and enable remote debugging
|
||||
3. Or change sessionMode to "persistent" in settings.json to let the agent launch its own browser
|
||||
- "Browser closed" or "Target closed" or "Session closed" — The browser process has terminated. Tell the user to restart and try again.
|
||||
Some errors are unrecoverable and retrying will never help. When you see ANY of these, call complete_task immediately with success=false and include the EXACT error message (including any remediation steps it contains) in your summary:
|
||||
- "Could not connect to Chrome" or "Failed to connect to Chrome" or "Timed out connecting to Chrome" — Include the full error message with its remediation steps in your summary verbatim. Do NOT paraphrase or omit instructions.
|
||||
- "Browser closed" or "Target closed" or "Session closed" — The browser process has terminated. Include the error and tell the user to try again.
|
||||
- "net::ERR_" network errors on the SAME URL after 2 retries — the site is unreachable. Report the URL and error.
|
||||
- Any error that appears IDENTICALLY 3+ times in a row — it will not resolve by retrying.
|
||||
Do NOT keep retrying terminal errors. Report them with actionable remediation steps and exit immediately.
|
||||
|
||||
@@ -287,6 +287,77 @@ describe('BrowserManager', () => {
|
||||
/chrome:\/\/inspect\/#remote-debugging/,
|
||||
);
|
||||
});
|
||||
|
||||
it('should throw profile-lock remediation when persistent mode hits "already running"', async () => {
|
||||
vi.mocked(Client).mockImplementation(
|
||||
() =>
|
||||
({
|
||||
connect: vi
|
||||
.fn()
|
||||
.mockRejectedValue(
|
||||
new Error(
|
||||
'Could not connect to Chrome. The browser is already running for the current profile.',
|
||||
),
|
||||
),
|
||||
close: vi.fn().mockResolvedValue(undefined),
|
||||
listTools: vi.fn(),
|
||||
callTool: vi.fn(),
|
||||
}) as unknown as InstanceType<typeof Client>,
|
||||
);
|
||||
|
||||
// Default config = persistent mode
|
||||
const manager = new BrowserManager(mockConfig);
|
||||
|
||||
await expect(manager.ensureConnection()).rejects.toThrow(
|
||||
/Close all Chrome windows using this profile/,
|
||||
);
|
||||
const manager2 = new BrowserManager(mockConfig);
|
||||
await expect(manager2.ensureConnection()).rejects.toThrow(
|
||||
/Set sessionMode to "isolated"/,
|
||||
);
|
||||
});
|
||||
|
||||
it('should throw timeout-specific remediation for persistent mode', async () => {
|
||||
vi.mocked(Client).mockImplementation(
|
||||
() =>
|
||||
({
|
||||
connect: vi
|
||||
.fn()
|
||||
.mockRejectedValue(
|
||||
new Error('Timed out connecting to chrome-devtools-mcp'),
|
||||
),
|
||||
close: vi.fn().mockResolvedValue(undefined),
|
||||
listTools: vi.fn(),
|
||||
callTool: vi.fn(),
|
||||
}) as unknown as InstanceType<typeof Client>,
|
||||
);
|
||||
|
||||
const manager = new BrowserManager(mockConfig);
|
||||
|
||||
await expect(manager.ensureConnection()).rejects.toThrow(
|
||||
/Chrome is not installed/,
|
||||
);
|
||||
});
|
||||
|
||||
it('should include sessionMode in generic fallback error', async () => {
|
||||
vi.mocked(Client).mockImplementation(
|
||||
() =>
|
||||
({
|
||||
connect: vi
|
||||
.fn()
|
||||
.mockRejectedValue(new Error('Some unexpected error')),
|
||||
close: vi.fn().mockResolvedValue(undefined),
|
||||
listTools: vi.fn(),
|
||||
callTool: vi.fn(),
|
||||
}) as unknown as InstanceType<typeof Client>,
|
||||
);
|
||||
|
||||
const manager = new BrowserManager(mockConfig);
|
||||
|
||||
await expect(manager.ensureConnection()).rejects.toThrow(
|
||||
/sessionMode: persistent/,
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('MCP isolation', () => {
|
||||
|
||||
@@ -315,19 +315,11 @@ export class BrowserManager {
|
||||
}),
|
||||
]);
|
||||
} catch (error) {
|
||||
// Provide actionable error for 'existing' mode failures
|
||||
if (sessionMode === 'existing') {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
throw new Error(
|
||||
`Failed to connect to existing Chrome instance: ${message}\n\n` +
|
||||
`To use sessionMode "existing", you must:\n` +
|
||||
` 1. Open Chrome (version 144+)\n` +
|
||||
` 2. Navigate to chrome://inspect/#remote-debugging\n` +
|
||||
` 3. Enable remote debugging\n\n` +
|
||||
`Alternatively, use sessionMode "persistent" (default) to launch a dedicated browser.`,
|
||||
);
|
||||
}
|
||||
throw error;
|
||||
// Provide error-specific, session-mode-aware remediation
|
||||
throw this.createConnectionError(
|
||||
error instanceof Error ? error.message : String(error),
|
||||
sessionMode,
|
||||
);
|
||||
} finally {
|
||||
if (timeoutId !== undefined) {
|
||||
clearTimeout(timeoutId);
|
||||
@@ -335,6 +327,72 @@ export class BrowserManager {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an Error with context-specific remediation based on the actual
|
||||
* error message and the current sessionMode.
|
||||
*/
|
||||
private createConnectionError(message: string, sessionMode: string): Error {
|
||||
const lowerMessage = message.toLowerCase();
|
||||
|
||||
// "already running for the current profile" — persistent mode profile lock
|
||||
if (lowerMessage.includes('already running')) {
|
||||
if (sessionMode === 'persistent' || sessionMode === 'isolated') {
|
||||
return new Error(
|
||||
`Could not connect to Chrome: ${message}\n\n` +
|
||||
`The Chrome profile is locked by another running instance.\n` +
|
||||
`To fix this:\n` +
|
||||
` 1. Close all Chrome windows using this profile, OR\n` +
|
||||
` 2. Set sessionMode to "isolated" in settings.json to use a temporary profile, OR\n` +
|
||||
` 3. Set chromeProfilePath in settings.json to use a different profile directory`,
|
||||
);
|
||||
}
|
||||
// existing mode — shouldn't normally hit this, but handle gracefully
|
||||
return new Error(
|
||||
`Could not connect to Chrome: ${message}\n\n` +
|
||||
`The Chrome profile is locked.\n` +
|
||||
`Close other Chrome instances and try again.`,
|
||||
);
|
||||
}
|
||||
|
||||
// Timeout errors
|
||||
if (lowerMessage.includes('timed out')) {
|
||||
if (sessionMode === 'existing') {
|
||||
return new Error(
|
||||
`Timed out connecting to Chrome: ${message}\n\n` +
|
||||
`To use sessionMode "existing", you must:\n` +
|
||||
` 1. Open Chrome (version 144+)\n` +
|
||||
` 2. Navigate to chrome://inspect/#remote-debugging\n` +
|
||||
` 3. Enable remote debugging\n\n` +
|
||||
`Alternatively, set sessionMode to "persistent" (default) in settings.json to launch a dedicated browser.`,
|
||||
);
|
||||
}
|
||||
return new Error(
|
||||
`Timed out connecting to Chrome: ${message}\n\n` +
|
||||
`Possible causes:\n` +
|
||||
` 1. Chrome is not installed or not in PATH\n` +
|
||||
` 2. npx cannot download chrome-devtools-mcp (check network/proxy)\n` +
|
||||
` 3. Chrome failed to start (try setting headless: true in settings.json)`,
|
||||
);
|
||||
}
|
||||
|
||||
// Generic "existing" mode failures (connection refused, etc.)
|
||||
if (sessionMode === 'existing') {
|
||||
return new Error(
|
||||
`Failed to connect to existing Chrome instance: ${message}\n\n` +
|
||||
`To use sessionMode "existing", you must:\n` +
|
||||
` 1. Open Chrome (version 144+)\n` +
|
||||
` 2. Navigate to chrome://inspect/#remote-debugging\n` +
|
||||
` 3. Enable remote debugging\n\n` +
|
||||
`Alternatively, set sessionMode to "persistent" (default) in settings.json to launch a dedicated browser.`,
|
||||
);
|
||||
}
|
||||
|
||||
// Generic fallback — include sessionMode for debugging context
|
||||
return new Error(
|
||||
`Failed to connect to Chrome (sessionMode: ${sessionMode}): ${message}`,
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Discovers tools from the connected MCP server.
|
||||
*/
|
||||
|
||||
@@ -156,13 +156,17 @@ class TypeTextInvocation extends BaseToolInvocation<
|
||||
constructor(
|
||||
private readonly browserManager: BrowserManager,
|
||||
private readonly text: string,
|
||||
private readonly submitKey: string | undefined,
|
||||
messageBus: MessageBus,
|
||||
) {
|
||||
super({ text }, messageBus, 'type_text', 'type_text');
|
||||
super({ text, submitKey }, messageBus, 'type_text', 'type_text');
|
||||
}
|
||||
|
||||
getDescription(): string {
|
||||
return `type_text: "${this.text.substring(0, 50)}${this.text.length > 50 ? '...' : ''}"`;
|
||||
const preview = `"${this.text.substring(0, 50)}${this.text.length > 50 ? '...' : ''}"`;
|
||||
return this.submitKey
|
||||
? `type_text: ${preview} + ${this.submitKey}`
|
||||
: `type_text: ${preview}`;
|
||||
}
|
||||
|
||||
protected override async getConfirmationDetails(
|
||||
@@ -194,48 +198,32 @@ class TypeTextInvocation extends BaseToolInvocation<
|
||||
|
||||
override async execute(signal: AbortSignal): Promise<ToolResult> {
|
||||
try {
|
||||
const chars = [...this.text]; // Handle Unicode correctly
|
||||
let successCount = 0;
|
||||
let lastError: string | undefined;
|
||||
if (signal.aborted) {
|
||||
return {
|
||||
llmContent: 'Error: Operation cancelled before typing started.',
|
||||
returnDisplay: 'Operation cancelled before typing started.',
|
||||
error: { message: 'Operation cancelled' },
|
||||
};
|
||||
}
|
||||
|
||||
for (const char of chars) {
|
||||
if (signal.aborted) {
|
||||
return {
|
||||
llmContent: `Error: Operation cancelled after typing ${successCount}/${chars.length} characters.`,
|
||||
returnDisplay: `Operation cancelled after typing ${successCount}/${chars.length} characters.`,
|
||||
error: { message: 'Operation cancelled' },
|
||||
};
|
||||
}
|
||||
await this.typeCharByChar(signal);
|
||||
|
||||
// Map special characters to key names
|
||||
const key = char === ' ' ? 'Space' : char;
|
||||
const result: McpToolCallResult = await this.browserManager.callTool(
|
||||
'press_key',
|
||||
{ key },
|
||||
);
|
||||
|
||||
if (result.isError) {
|
||||
const errorText = result.content
|
||||
?.filter(
|
||||
(c: { type: string; text?: string }) =>
|
||||
c.type === 'text' && c.text,
|
||||
)
|
||||
.map((c: { type: string; text?: string }) => c.text)
|
||||
.join('\n');
|
||||
lastError = errorText || 'Unknown error';
|
||||
// Continue typing remaining characters on soft errors
|
||||
// Optionally press a submit key (Enter, Tab, etc.) after typing
|
||||
if (this.submitKey && !signal.aborted) {
|
||||
const keyResult = await this.browserManager.callTool('press_key', {
|
||||
key: this.submitKey,
|
||||
});
|
||||
if (keyResult.isError) {
|
||||
const errText = this.extractErrorText(keyResult);
|
||||
debugLogger.warn(
|
||||
`type_text: press_key("${key}") failed: ${lastError}`,
|
||||
`type_text: submitKey("${this.submitKey}") failed: ${errText}`,
|
||||
);
|
||||
} else {
|
||||
successCount++;
|
||||
}
|
||||
}
|
||||
|
||||
const summary =
|
||||
successCount === chars.length
|
||||
? `Successfully typed ${chars.length} characters: "${this.text}"`
|
||||
: `Typed ${successCount}/${chars.length} characters. Last error: ${lastError}`;
|
||||
const summary = this.submitKey
|
||||
? `Successfully typed "${this.text}" and pressed ${this.submitKey}`
|
||||
: `Successfully typed "${this.text}"`;
|
||||
|
||||
return {
|
||||
llmContent: summary,
|
||||
@@ -257,6 +245,36 @@ class TypeTextInvocation extends BaseToolInvocation<
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/** Types each character via individual press_key MCP calls. */
|
||||
private async typeCharByChar(signal: AbortSignal): Promise<void> {
|
||||
const chars = [...this.text]; // Handle Unicode correctly
|
||||
for (const char of chars) {
|
||||
if (signal.aborted) return;
|
||||
|
||||
// Map special characters to key names
|
||||
const key = char === ' ' ? 'Space' : char;
|
||||
const result = await this.browserManager.callTool('press_key', { key });
|
||||
|
||||
if (result.isError) {
|
||||
debugLogger.warn(
|
||||
`type_text: press_key("${key}") failed: ${this.extractErrorText(result)}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Extract error text from an MCP tool result. */
|
||||
private extractErrorText(result: McpToolCallResult): string {
|
||||
return (
|
||||
result.content
|
||||
?.filter(
|
||||
(c: { type: string; text?: string }) => c.type === 'text' && c.text,
|
||||
)
|
||||
.map((c: { type: string; text?: string }) => c.text)
|
||||
.join('\n') || 'Unknown error'
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -311,19 +329,24 @@ class TypeTextDeclarativeTool extends DeclarativeTool<
|
||||
super(
|
||||
'type_text',
|
||||
'type_text',
|
||||
'Types a full text string into the currently focused element by pressing each key in sequence. ' +
|
||||
'Types a full text string into the currently focused element. ' +
|
||||
'Much faster than calling press_key for each character individually. ' +
|
||||
'Use this to enter text into form fields, spreadsheet cells, or any focused input. ' +
|
||||
'Use this to enter text into form fields, search boxes, spreadsheet cells, or any focused input. ' +
|
||||
'The element must already be focused (e.g., after a click). ' +
|
||||
'Does NOT press Enter at the end — call press_key("Enter") separately if needed.',
|
||||
'Use submitKey to press a key after typing (e.g., submitKey="Enter" to submit a form or confirm a value, submitKey="Tab" to move to the next field).',
|
||||
Kind.Other,
|
||||
{
|
||||
type: 'object',
|
||||
properties: {
|
||||
text: {
|
||||
type: 'string',
|
||||
description: 'The text to type into the focused element.',
|
||||
},
|
||||
submitKey: {
|
||||
type: 'string',
|
||||
description:
|
||||
'The text to type. Each character will be pressed in sequence.',
|
||||
'Optional key to press after typing (e.g., "Enter", "Tab", "Escape"). ' +
|
||||
'Useful for submitting form fields or moving to the next cell in a spreadsheet.',
|
||||
},
|
||||
},
|
||||
required: ['text'],
|
||||
@@ -337,9 +360,14 @@ class TypeTextDeclarativeTool extends DeclarativeTool<
|
||||
build(
|
||||
params: Record<string, unknown>,
|
||||
): ToolInvocation<Record<string, unknown>, ToolResult> {
|
||||
const submitKey =
|
||||
typeof params['submitKey'] === 'string' && params['submitKey']
|
||||
? params['submitKey']
|
||||
: undefined;
|
||||
return new TypeTextInvocation(
|
||||
this.browserManager,
|
||||
String(params['text'] ?? ''),
|
||||
submitKey,
|
||||
this.messageBus,
|
||||
);
|
||||
}
|
||||
@@ -448,7 +476,7 @@ function augmentToolDescription(toolName: string, description: string): string {
|
||||
new_page:
|
||||
' Opens a new page/tab with the specified URL. Call take_snapshot after to see the new page.',
|
||||
press_key:
|
||||
' Press a SINGLE keyboard key (e.g., "Enter", "Tab", "Escape", "ArrowDown", "a", "8"). ONLY accepts one key name — do NOT pass multi-character strings like "Hello" or "A1\\nEnter". To type text, call press_key once per character.',
|
||||
' Press a SINGLE keyboard key (e.g., "Enter", "Tab", "Escape", "ArrowDown", "a", "8"). ONLY accepts one key name — do NOT pass multi-character strings like "Hello" or "A1\\nEnter". To type text, use type_text instead of calling press_key for each character.',
|
||||
};
|
||||
|
||||
// Check for partial matches — order matters! More-specific keys first.
|
||||
|
||||
Reference in New Issue
Block a user