feat(browser): supersede stale snapshots to reclaim context-window tokens (#24440)

2026-04-12 14:22:00 -07:00 · 2026-04-02 02:41:39 +08:00
parent 43cf63e189
commit aed85725b6
5 changed files with 386 additions and 0 deletions
--- a/packages/core/src/agents/browser/browserAgentDefinition.ts
+++ b/packages/core/src/agents/browser/browserAgentDefinition.ts
@@ -14,6 +14,7 @@
 */

 import type { LocalAgentDefinition } from '../types.js';
+import { supersedeStaleSnapshots } from './snapshotSuperseder.js';
 import type { Config } from '../../config/config.js';
 import { z } from 'zod';
 import {
@@ -184,6 +185,11 @@ export const BrowserAgentDefinition = (
    // This is undefined here and will be set at invocation time
    toolConfig: undefined,

+    // Supersede stale take_snapshot outputs to reclaim context-window tokens.
+    // Each snapshot contains the full accessibility tree; only the most recent
+    // one is meaningful, so prior snapshots are replaced with a placeholder.
+    onBeforeTurn: (chat) => supersedeStaleSnapshots(chat),
+
    promptConfig: {
      query: `Your task is:
 <task>
--- a/packages/core/src/agents/browser/snapshotSuperseder.test.ts
+++ b/packages/core/src/agents/browser/snapshotSuperseder.test.ts
@@ -0,0 +1,214 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import {
+  supersedeStaleSnapshots,
+  SNAPSHOT_SUPERSEDED_PLACEHOLDER,
+} from './snapshotSuperseder.js';
+import type { GeminiChat } from '../../core/geminiChat.js';
+import type { Content } from '@google/genai';
+
+/** Builds a minimal GeminiChat stand-in backed by the given mutable history. */
+function createMockChat(history: Content[]): GeminiChat {
+  const getHistory = vi.fn(() => history.slice());
+  const setHistory = vi.fn((newHistory: readonly Content[]) => {
+    // Rewrite in place so the caller's `history` reference sees the update.
+    history.length = 0;
+    history.push(...newHistory);
+  });
+  return { getHistory, setHistory } as unknown as GeminiChat;
+}
+
+/** Helper: builds a `take_snapshot` functionResponse part carrying `output`. */
+function snapshotResponse(output: string) {
+  const functionResponse = {
+    name: 'take_snapshot',
+    response: { output },
+  };
+
+  return { functionResponse };
+}
+
+/** Helper: builds a functionResponse part for a non-snapshot tool. */
+function otherToolResponse(name: string, output: string) {
+  const response = { output };
+  const functionResponse = { name, response };
+
+  return {
+    functionResponse,
+  };
+}
+
+describe('supersedeStaleSnapshots', () => {
+  let history: Content[];
+  let chat: GeminiChat;
+
+  beforeEach(() => {
+    history = [];
+  });
+
+  it('should no-op when history has no snapshots', () => {
+    history.push(
+      { role: 'user', parts: [{ text: 'Click the button' }] },
+      {
+        role: 'user',
+        parts: [otherToolResponse('click', 'Clicked element')],
+      },
+    );
+    chat = createMockChat(history);
+
+    supersedeStaleSnapshots(chat);
+
+    expect(chat.setHistory).not.toHaveBeenCalled();
+  });
+
+  it('should no-op when history has exactly 1 snapshot', () => {
+    history.push(
+      { role: 'user', parts: [{ text: 'Navigate to page' }] },
+      {
+        role: 'user',
+        parts: [snapshotResponse('<tree>big accessibility tree</tree>')],
+      },
+    );
+    chat = createMockChat(history);
+
+    supersedeStaleSnapshots(chat);
+
+    expect(chat.setHistory).not.toHaveBeenCalled();
+  });
+
+  it('should replace all but the last snapshot when there are 2+', () => {
+    history.push(
+      {
+        role: 'user',
+        parts: [snapshotResponse('<tree>snapshot 1</tree>')],
+      },
+      {
+        role: 'user',
+        parts: [otherToolResponse('click', 'Clicked OK')],
+      },
+      {
+        role: 'user',
+        parts: [snapshotResponse('<tree>snapshot 2</tree>')],
+      },
+      {
+        role: 'user',
+        parts: [otherToolResponse('type_text', 'Typed hello')],
+      },
+      {
+        role: 'user',
+        parts: [snapshotResponse('<tree>snapshot 3 (latest)</tree>')],
+      },
+    );
+    chat = createMockChat(history);
+
+    supersedeStaleSnapshots(chat);
+
+    expect(chat.setHistory).toHaveBeenCalledTimes(1);
+
+    // The two stale snapshots (entries 0 and 2) should hold the placeholder
+    const part0 = history[0].parts![0];
+    expect(part0.functionResponse?.response).toEqual({
+      output: SNAPSHOT_SUPERSEDED_PLACEHOLDER,
+    });
+
+    const part2 = history[2].parts![0];
+    expect(part2.functionResponse?.response).toEqual({
+      output: SNAPSHOT_SUPERSEDED_PLACEHOLDER,
+    });
+
+    // The most recent snapshot (entry 4) should be untouched
+    const part4 = history[4].parts![0];
+    expect(part4.functionResponse?.response).toEqual({
+      output: '<tree>snapshot 3 (latest)</tree>',
+    });
+  });
+
+  it('should leave non-snapshot tool responses untouched', () => {
+    history.push(
+      {
+        role: 'user',
+        parts: [snapshotResponse('<tree>snapshot A</tree>')],
+      },
+      {
+        role: 'user',
+        parts: [otherToolResponse('click', 'Clicked button')],
+      },
+      {
+        role: 'user',
+        parts: [snapshotResponse('<tree>snapshot B (latest)</tree>')],
+      },
+    );
+    chat = createMockChat(history);
+
+    supersedeStaleSnapshots(chat);
+
+    // The click response between the two snapshots should be untouched
+    const clickPart = history[1].parts![0];
+    expect(clickPart.functionResponse?.response).toEqual({
+      output: 'Clicked button',
+    });
+  });
+
+  it('should no-op when all stale snapshots are already superseded', () => {
+    history.push(
+      {
+        role: 'user',
+        parts: [snapshotResponse(SNAPSHOT_SUPERSEDED_PLACEHOLDER)],
+      },
+      {
+        role: 'user',
+        parts: [snapshotResponse('<tree>current snapshot</tree>')],
+      },
+    );
+    chat = createMockChat(history);
+
+    supersedeStaleSnapshots(chat);
+
+    // The stale entry already holds the placeholder, so setHistory is skipped
+    expect(chat.setHistory).not.toHaveBeenCalled();
+  });
+
+  it('should handle snapshots in Content entries with multiple parts', () => {
+    history.push(
+      {
+        role: 'user',
+        parts: [
+          otherToolResponse('click', 'Clicked'),
+          snapshotResponse('<tree>snapshot in multi-part</tree>'),
+        ],
+      },
+      {
+        role: 'user',
+        parts: [snapshotResponse('<tree>latest snapshot</tree>')],
+      },
+    );
+    chat = createMockChat(history);
+
+    supersedeStaleSnapshots(chat);
+
+    expect(chat.setHistory).toHaveBeenCalledTimes(1);
+
+    // The click response sharing the entry (parts[0]) should be untouched
+    const clickPart = history[0].parts![0];
+    expect(clickPart.functionResponse?.response).toEqual({
+      output: 'Clicked',
+    });
+
+    // The stale snapshot next to it (parts[1]) should hold the placeholder
+    const snapshotPart = history[0].parts![1];
+    expect(snapshotPart.functionResponse?.response).toEqual({
+      output: SNAPSHOT_SUPERSEDED_PLACEHOLDER,
+    });
+
+    // The latest snapshot in the following entry should be untouched
+    const latestPart = history[1].parts![0];
+    expect(latestPart.functionResponse?.response).toEqual({
+      output: '<tree>latest snapshot</tree>',
+    });
+  });
+});
--- a/packages/core/src/agents/browser/snapshotSuperseder.ts
+++ b/packages/core/src/agents/browser/snapshotSuperseder.ts
@@ -0,0 +1,149 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * @fileoverview Supersedes stale `take_snapshot` outputs in the browser
+ * subagent's conversation history. Each snapshot contains the full
+ * accessibility tree and is only meaningful as the "current" page state;
+ * prior snapshots are stale and waste context-window tokens.
+ *
+ * Called via the {@link LocalAgentDefinition.onBeforeTurn} hook before each
+ * model call so the model only ever sees the most recent snapshot in full.
+ */
+
+import type { GeminiChat } from '../../core/geminiChat.js';
+import type { Content, Part } from '@google/genai';
+import { debugLogger } from '../../utils/debugLogger.js';
+
+/** Tool name that identifies snapshot `functionResponse` parts in history. */
+const TAKE_SNAPSHOT_TOOL_NAME = 'take_snapshot';
+
+/**
+ * Short, token-cheap yet informative stand-in for superseded snapshot output.
+ */
+export const SNAPSHOT_SUPERSEDED_PLACEHOLDER =
+  '[Snapshot superseded — a newer snapshot exists later in this conversation. ' +
+  'Call take_snapshot for current page state.]';
+
+/**
+ * Scans the chat history and replaces all but the most recent
+ * `take_snapshot` `functionResponse` with a compact placeholder.
+ *
+ * No-ops (without calling {@link GeminiChat.setHistory}) when:
+ * - There are fewer than 2 snapshots (nothing to supersede).
+ * - All prior snapshots have already been superseded.
+ *
+ * The array returned by {@link GeminiChat.getHistory} and the objects in it
+ * are never mutated: only the parts arrays that actually change are copied,
+ * fields other than `functionResponse.response` on a replaced part are
+ * carried over, and the result is applied via {@link GeminiChat.setHistory}.
+ *
+ * @param chat The chat whose history is scanned and, if needed, rewritten.
+ */
+export function supersedeStaleSnapshots(chat: GeminiChat): void {
+  const history = chat.getHistory();
+
+  // Locate all (contentIndex, partIndex) tuples for take_snapshot responses.
+  const snapshotLocations: Array<{
+    contentIdx: number;
+    partIdx: number;
+  }> = [];
+  history.forEach((content, contentIdx) => {
+    content.parts?.forEach((part, partIdx) => {
+      if (part.functionResponse?.name === TAKE_SNAPSHOT_TOOL_NAME) {
+        snapshotLocations.push({ contentIdx, partIdx });
+      }
+    });
+  });
+
+  // Nothing to do if there are 0 or 1 snapshots.
+  if (snapshotLocations.length < 2) {
+    return;
+  }
+
+  // Copy-on-write: a Content's parts array is cloned the first time one of
+  // its snapshots is replaced, keyed here by the Content's history index.
+  const rewrittenParts = new Map<number, Part[]>();
+  let replacedCount = 0;
+
+  // Skip the last entry — that's the snapshot we keep.
+  for (const { contentIdx, partIdx } of snapshotLocations.slice(0, -1)) {
+    const originalPart = history[contentIdx].parts?.[partIdx];
+    if (!originalPart?.functionResponse) continue;
+    const originalResponse = originalPart.functionResponse;
+
+    // Already superseded on an earlier turn — nothing left to reclaim.
+    const output = getResponseOutput(originalResponse.response);
+    if (output.includes(SNAPSHOT_SUPERSEDED_PLACEHOLDER)) {
+      continue;
+    }
+
+    let parts = rewrittenParts.get(contentIdx);
+    if (!parts) {
+      parts = [...(history[contentIdx].parts ?? [])];
+      rewrittenParts.set(contentIdx, parts);
+    }
+
+    // Spread the original part first so sibling fields on the Part survive;
+    // only the bulky `response` payload is swapped out.
+    parts[partIdx] = {
+      ...originalPart,
+      functionResponse: {
+        // eslint-disable-next-line @typescript-eslint/no-misused-spread
+        ...originalResponse,
+        response: { output: SNAPSHOT_SUPERSEDED_PLACEHOLDER },
+      },
+    };
+    replacedCount++;
+  }
+
+  // Every stale snapshot was already a placeholder: leave history untouched.
+  if (replacedCount === 0) {
+    return;
+  }
+
+  // Untouched Content entries are reused as-is; rewritten ones get a shallow
+  // copy that points at their new parts array.
+  const newHistory: Content[] = history.map((content, contentIdx) => {
+    const parts = rewrittenParts.get(contentIdx);
+    return parts ? { ...content, parts } : content;
+  });
+
+  chat.setHistory(newHistory);
+  debugLogger.log(
+    `[SnapshotSuperseder] Replaced ${replacedCount} stale take_snapshot output(s).`,
+  );
+}
+
+/**
+ * A functionResponse.response payload known to carry a string `output`.
+ */
+interface ResponseWithOutput {
+  output: string;
+}
+
+/** Type guard: true only for non-nullish objects with a string `output`. */
+function isResponseWithOutput(
+  response: object | undefined,
+): response is ResponseWithOutput {
+  if (response == null || !('output' in response)) {
+    return false;
+  }
+
+  return typeof response.output === 'string';
+}
+
+/**
+ * Reads the `output` string from a functionResponse.response payload.
+ *
+ * The GenAI SDK types `response` as `object | undefined`, so the field can
+ * only be accessed after a runtime check.
+ *
+ * @returns The `output` string, or '' when the payload has no string `output`.
+ */
+function getResponseOutput(response: object | undefined): string {
+  return isResponseWithOutput(response) ? response.output : '';
+}
--- a/packages/core/src/agents/local-executor.ts
+++ b/packages/core/src/agents/local-executor.ts
@@ -317,6 +317,10 @@ export class LocalAgentExecutor<TOutput extends z.ZodTypeAny> {

    await this.tryCompressChat(chat, promptId, combinedSignal);

+    // Allow the agent definition to modify history before the model call
+    // (e.g., superseding stale tool outputs to reclaim context tokens).
+    await this.definition.onBeforeTurn?.(chat, combinedSignal);
+
    const { functionCalls, modelToUse } = await promptIdContext.run(
      promptId,
      async () =>
--- a/packages/core/src/agents/types.ts
+++ b/packages/core/src/agents/types.ts
@@ -16,6 +16,7 @@ import type { AnySchema } from 'ajv';
 import type { AgentCard } from '@a2a-js/sdk';
 import type { A2AAuthConfig } from './auth-provider/types.js';
 import type { MCPServerConfig } from '../config/config.js';
+import type { GeminiChat } from '../core/geminiChat.js';

 /**
 * Describes the possible termination modes for an agent.
@@ -227,6 +228,18 @@ export interface LocalAgentDefinition<
   * @returns A string representation of the final output.
   */
  processOutput?: (output: z.infer<TOutput>) => string;
+
+  /**
+   * Optional hook invoked before each model call. Receives the active
+   * {@link GeminiChat} instance and may modify chat history (e.g., to
+   * supersede stale tool outputs and reclaim context-window tokens).
+   *
+   * Runs immediately after chat compression in the agent loop.
+   */
+  onBeforeTurn?: (
+    chat: GeminiChat,
+    signal?: AbortSignal,
+  ) => Promise<void> | void;
 }

 export interface BaseRemoteAgentDefinition<