diff --git a/packages/core/src/agents/browser/browserAgentDefinition.ts b/packages/core/src/agents/browser/browserAgentDefinition.ts index d0afa2c4b3..f7bf3258ec 100644 --- a/packages/core/src/agents/browser/browserAgentDefinition.ts +++ b/packages/core/src/agents/browser/browserAgentDefinition.ts @@ -14,6 +14,7 @@ */ import type { LocalAgentDefinition } from '../types.js'; +import { supersedeStaleSnapshots } from './snapshotSuperseder.js'; import type { Config } from '../../config/config.js'; import { z } from 'zod'; import { @@ -184,6 +185,11 @@ export const BrowserAgentDefinition = ( // This is undefined here and will be set at invocation time toolConfig: undefined, + // Supersede stale take_snapshot outputs to reclaim context-window tokens. + // Each snapshot contains the full accessibility tree; only the most recent + // one is meaningful, so prior snapshots are replaced with a placeholder. + onBeforeTurn: (chat) => supersedeStaleSnapshots(chat), + promptConfig: { query: `Your task is: diff --git a/packages/core/src/agents/browser/snapshotSuperseder.test.ts b/packages/core/src/agents/browser/snapshotSuperseder.test.ts new file mode 100644 index 0000000000..773d0216e0 --- /dev/null +++ b/packages/core/src/agents/browser/snapshotSuperseder.test.ts @@ -0,0 +1,214 @@ +/** + * @license + * Copyright 2026 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { + supersedeStaleSnapshots, + SNAPSHOT_SUPERSEDED_PLACEHOLDER, +} from './snapshotSuperseder.js'; +import type { GeminiChat } from '../../core/geminiChat.js'; +import type { Content } from '@google/genai'; + +/** Builds a minimal mock GeminiChat around a mutable history array. 
/**
 * Creates a stand-in for GeminiChat backed by the caller's `history` array.
 * `getHistory` hands out a shallow copy of that array; `setHistory` overwrites
 * the array in place, so a test can inspect the outcome through the same
 * reference it passed in.
 */
function createMockChat(history: Content[]): GeminiChat {
  const getHistory = vi.fn(() => history.slice());
  const setHistory = vi.fn((replacement: readonly Content[]) => {
    history.splice(0, history.length, ...replacement);
  });
  return { getHistory, setHistory } as unknown as GeminiChat;
}

/** Helper: builds a functionResponse part for an arbitrary tool name. */
function otherToolResponse(name: string, output: string) {
  return { functionResponse: { name, response: { output } } };
}

/** Helper: builds a functionResponse part attributed to take_snapshot. */
function snapshotResponse(output: string) {
  return otherToolResponse('take_snapshot', output);
}

describe('supersedeStaleSnapshots', () => {
  let history: Content[];

  /** Wraps the given parts in a single user-role history entry. */
  const userTurn = (...parts: NonNullable<Content['parts']>): Content => ({
    role: 'user',
    parts,
  });

  /** Reads the functionResponse payload stored at a history position. */
  const responseAt = (contentIdx: number, partIdx: number) =>
    history[contentIdx].parts![partIdx].functionResponse?.response;

  beforeEach(() => {
    history = [];
  });

  it('should no-op when history has no snapshots', () => {
    history.push(
      userTurn({ text: 'Click the button' }),
      userTurn(otherToolResponse('click', 'Clicked element')),
    );
    const chat = createMockChat(history);

    supersedeStaleSnapshots(chat);

    expect(chat.setHistory).not.toHaveBeenCalled();
  });

  it('should no-op when history has exactly 1 snapshot', () => {
    history.push(
      userTurn({ text: 'Navigate to page' }),
      userTurn(snapshotResponse('big accessibility tree')),
    );
    const chat = createMockChat(history);

    supersedeStaleSnapshots(chat);

    expect(chat.setHistory).not.toHaveBeenCalled();
  });

  it('should replace all but the last snapshot when there are 2+', () => {
    history.push(
      userTurn(snapshotResponse('snapshot 1')),
      userTurn(otherToolResponse('click', 'Clicked OK')),
      userTurn(snapshotResponse('snapshot 2')),
      userTurn(otherToolResponse('type_text', 'Typed hello')),
      userTurn(snapshotResponse('snapshot 3 (latest)')),
    );
    const chat = createMockChat(history);

    supersedeStaleSnapshots(chat);

    expect(chat.setHistory).toHaveBeenCalledTimes(1);

    // Both earlier snapshots now carry the placeholder.
    for (const staleIdx of [0, 2]) {
      expect(responseAt(staleIdx, 0)).toEqual({
        output: SNAPSHOT_SUPERSEDED_PLACEHOLDER,
      });
    }

    // The newest snapshot keeps its original output.
    expect(responseAt(4, 0)).toEqual({ output: 'snapshot 3 (latest)' });
  });

  it('should leave non-snapshot tool responses untouched', () => {
    history.push(
      userTurn(snapshotResponse('snapshot A')),
      userTurn(otherToolResponse('click', 'Clicked button')),
      userTurn(snapshotResponse('snapshot B (latest)')),
    );
    const chat = createMockChat(history);

    supersedeStaleSnapshots(chat);

    // The click response sits between two snapshots and must be unchanged.
    expect(responseAt(1, 0)).toEqual({ output: 'Clicked button' });
  });

  it('should no-op when all stale snapshots are already superseded', () => {
    history.push(
      userTurn(snapshotResponse(SNAPSHOT_SUPERSEDED_PLACEHOLDER)),
      userTurn(snapshotResponse('current snapshot')),
    );
    const chat = createMockChat(history);

    supersedeStaleSnapshots(chat);

    // Nothing changed, so history must not be rewritten.
    expect(chat.setHistory).not.toHaveBeenCalled();
  });

  it('should handle snapshots in Content entries with multiple parts', () => {
    history.push(
      userTurn(
        otherToolResponse('click', 'Clicked'),
        snapshotResponse('snapshot in multi-part'),
      ),
      userTurn(snapshotResponse('latest snapshot')),
    );
    const chat = createMockChat(history);

    supersedeStaleSnapshots(chat);

    expect(chat.setHistory).toHaveBeenCalledTimes(1);

    // Part 0 of the first entry is the click response: unchanged.
    expect(responseAt(0, 0)).toEqual({ output: 'Clicked' });

    // Part 1 of the first entry is the stale snapshot: replaced.
    expect(responseAt(0, 1)).toEqual({
      output: SNAPSHOT_SUPERSEDED_PLACEHOLDER,
    });

    // The newest snapshot is unchanged.
    expect(responseAt(1, 0)).toEqual({ output: 'latest snapshot' });
  });
});
/** Tool name that identifies a snapshot `functionResponse` in chat history. */
const TAKE_SNAPSHOT_TOOL_NAME = 'take_snapshot';

/**
 * Placeholder that replaces superseded snapshot outputs.
 * Kept short to minimise token cost while still telling the model how to
 * obtain fresh page state.
 */
export const SNAPSHOT_SUPERSEDED_PLACEHOLDER =
  '[Snapshot superseded — a newer snapshot exists later in this conversation. ' +
  'Call take_snapshot for current page state.]';

/**
 * Replaces every `take_snapshot` `functionResponse` in the chat history,
 * except the most recent one, with {@link SNAPSHOT_SUPERSEDED_PLACEHOLDER}.
 *
 * History is walked once, newest to oldest: the first snapshot encountered is
 * the one to keep, and every older snapshot that does not already contain the
 * placeholder is rewritten. Only history entries that actually change are
 * copied; the objects returned by `chat.getHistory()` are never mutated.
 *
 * `chat.setHistory()` is called at most once, and only when at least one
 * response was replaced. The call is therefore a no-op when:
 * - there are fewer than 2 snapshots (nothing to supersede), or
 * - every older snapshot has already been superseded.
 *
 * @param chat - Chat whose history is read via `getHistory()` and, when a
 *   replacement is needed, rewritten via `setHistory()`.
 */
export function supersedeStaleSnapshots(chat: GeminiChat): void {
  const history = chat.getHistory();

  // Allocated on the first replacement so the no-op paths copy nothing.
  let rewrittenHistory: Content[] | undefined;
  let latestSnapshotSeen = false;
  let replacedCount = 0;

  for (let contentIdx = history.length - 1; contentIdx >= 0; contentIdx--) {
    const content = history[contentIdx];
    const parts = content.parts;
    if (!parts) continue;

    let rewrittenParts: Part[] | undefined;

    for (let partIdx = parts.length - 1; partIdx >= 0; partIdx--) {
      const functionResponse = parts[partIdx].functionResponse;
      if (
        !functionResponse ||
        functionResponse.name !== TAKE_SNAPSHOT_TOOL_NAME
      ) {
        continue;
      }

      // Walking backwards, the first snapshot found is the newest: keep it.
      if (!latestSnapshotSeen) {
        latestSnapshotSeen = true;
        continue;
      }

      // Skip snapshots that a previous call already replaced.
      const output = getResponseOutput(functionResponse.response);
      if (output.includes(SNAPSHOT_SUPERSEDED_PLACEHOLDER)) {
        continue;
      }

      if (!rewrittenParts) {
        rewrittenParts = [...parts];
      }
      rewrittenParts[partIdx] = {
        functionResponse: {
          // Spread keeps the other functionResponse fields (e.g. name);
          // only the payload is swapped.
          // eslint-disable-next-line @typescript-eslint/no-misused-spread
          ...functionResponse,
          response: { output: SNAPSHOT_SUPERSEDED_PLACEHOLDER },
        },
      };
      replacedCount++;
    }

    if (rewrittenParts) {
      if (!rewrittenHistory) {
        rewrittenHistory = [...history];
      }
      rewrittenHistory[contentIdx] = { ...content, parts: rewrittenParts };
    }
  }

  if (!rewrittenHistory) {
    return;
  }

  chat.setHistory(rewrittenHistory);
  debugLogger.log(
    `[SnapshotSuperseder] Replaced ${replacedCount} stale take_snapshot output(s).`,
  );
}

/**
 * Shape of a functionResponse.response that contains an `output` string.
 */
interface ResponseWithOutput {
  output: string;
}

/**
 * Type guard for {@link ResponseWithOutput}.
 *
 * Accepts `unknown` and checks `typeof === 'object'` first, because the `in`
 * operator throws a TypeError when its right-hand side is a primitive.
 *
 * @param response - Candidate `functionResponse.response` value.
 * @returns `true` when `response` is a non-null object whose `output`
 *   property is a string.
 */
function isResponseWithOutput(
  response: unknown,
): response is ResponseWithOutput {
  return (
    typeof response === 'object' &&
    response !== null &&
    'output' in response &&
    typeof response.output === 'string'
  );
}

/**
 * Safely extracts the `output` string from a functionResponse.response value.
 *
 * @param response - The `functionResponse.response` payload, if any.
 * @returns The `output` string, or `''` when the payload is missing, is not
 *   an object, or has no string `output` field.
 */
function getResponseOutput(response: unknown): string {
  return isResponseWithOutput(response) ? response.output : '';
}