feat(context): implement observation masking for tool outputs (#18389)

2026-03-17 17:41:24 -07:00 · 2026-02-05 20:53:11 -05:00
parent 289769f544
commit 8ec176e005
15 changed files with 1151 additions and 7 deletions
--- a/.gemini/settings.json
+++ b/.gemini/settings.json
@@ -0,0 +1,7 @@
+{
+  "experimental": {
+    "toolOutputMasking": {
+      "enabled": true
+    }
+  }
+}
--- a/packages/cli/src/config/config.ts
+++ b/packages/cli/src/config/config.ts
@@ -806,6 +806,7 @@ export async function loadCliConfig(
    skillsSupport: settings.skills?.enabled ?? true,
    disabledSkills: settings.skills?.disabled,
    experimentalJitContext: settings.experimental?.jitContext,
+    toolOutputMasking: settings.experimental?.toolOutputMasking,
    noBrowser: !!process.env['NO_BROWSER'],
    summarizeToolOutput: settings.model?.summarizeToolOutput,
    ideMode,
--- a/packages/cli/src/config/settingsSchema.ts
+++ b/packages/cli/src/config/settingsSchema.ts
@@ -1462,6 +1462,58 @@ const SETTINGS_SCHEMA = {
    description: 'Setting to enable experimental features',
    showInDialog: false,
    properties: {
+      toolOutputMasking: {
+        type: 'object',
+        label: 'Tool Output Masking',
+        category: 'Experimental',
+        requiresRestart: true,
+        ignoreInDocs: true,
+        default: {},
+        description:
+          'Advanced settings for tool output masking to manage context window efficiency.',
+        showInDialog: false,
+        properties: {
+          enabled: {
+            type: 'boolean',
+            label: 'Enable Tool Output Masking',
+            category: 'Experimental',
+            requiresRestart: true,
+            default: false,
+            description: 'Enables tool output masking to save tokens.',
+            showInDialog: false,
+          },
+          toolProtectionThreshold: {
+            type: 'number',
+            label: 'Tool Protection Threshold',
+            category: 'Experimental',
+            requiresRestart: true,
+            default: 50000,
+            description:
+              'Minimum number of tokens to protect from masking (most recent tool outputs).',
+            showInDialog: false,
+          },
+          minPrunableTokensThreshold: {
+            type: 'number',
+            label: 'Min Prunable Tokens Threshold',
+            category: 'Experimental',
+            requiresRestart: true,
+            default: 30000,
+            description:
+              'Minimum prunable tokens required to trigger a masking pass.',
+            showInDialog: false,
+          },
+          protectLatestTurn: {
+            type: 'boolean',
+            label: 'Protect Latest Turn',
+            category: 'Experimental',
+            requiresRestart: true,
+            default: true,
+            description:
+              'Ensures the absolute latest turn is never masked, regardless of token count.',
+            showInDialog: false,
+          },
+        },
+      },
      enableAgents: {
        type: 'boolean',
        label: 'Enable Agents',
--- a/packages/core/src/config/config.ts
+++ b/packages/core/src/config/config.ts
@@ -149,6 +149,13 @@ export interface OutputSettings {
  format?: OutputFormat;
 }

+export interface ToolOutputMaskingConfig {
+  enabled: boolean;
+  toolProtectionThreshold: number;
+  minPrunableTokensThreshold: number;
+  protectLatestTurn: boolean;
+}
+
 export interface ExtensionSetting {
  name: string;
  description: string;
@@ -273,6 +280,11 @@ import {
  DEFAULT_FILE_FILTERING_OPTIONS,
  DEFAULT_MEMORY_FILE_FILTERING_OPTIONS,
 } from './constants.js';
+import {
+  DEFAULT_TOOL_PROTECTION_THRESHOLD,
+  DEFAULT_MIN_PRUNABLE_TOKENS_THRESHOLD,
+  DEFAULT_PROTECT_LATEST_TURN,
+} from '../services/toolOutputMaskingService.js';

 import {
  type ExtensionLoader,
@@ -462,6 +474,7 @@ export interface ConfigParameters {
  disabledSkills?: string[];
  adminSkillsEnabled?: boolean;
  experimentalJitContext?: boolean;
+  toolOutputMasking?: Partial<ToolOutputMaskingConfig>;
  disableLLMCorrection?: boolean;
  plan?: boolean;
  onModelChange?: (model: string) => void;
@@ -599,6 +612,7 @@ export class Config {
  private pendingIncludeDirectories: string[];
  private readonly enableHooks: boolean;
  private readonly enableHooksUI: boolean;
+  private readonly toolOutputMasking: ToolOutputMaskingConfig;
  private hooks: { [K in HookEventName]?: HookDefinition[] } | undefined;
  private projectHooks:
    | ({ [K in HookEventName]?: HookDefinition[] } & { disabled?: string[] })
@@ -721,6 +735,18 @@ export class Config {
    this.modelAvailabilityService = new ModelAvailabilityService();
    this.previewFeatures = params.previewFeatures ?? undefined;
    this.experimentalJitContext = params.experimentalJitContext ?? false;
+    this.toolOutputMasking = {
+      enabled: params.toolOutputMasking?.enabled ?? false,
+      toolProtectionThreshold:
+        params.toolOutputMasking?.toolProtectionThreshold ??
+        DEFAULT_TOOL_PROTECTION_THRESHOLD,
+      minPrunableTokensThreshold:
+        params.toolOutputMasking?.minPrunableTokensThreshold ??
+        DEFAULT_MIN_PRUNABLE_TOKENS_THRESHOLD,
+      protectLatestTurn:
+        params.toolOutputMasking?.protectLatestTurn ??
+        DEFAULT_PROTECT_LATEST_TURN,
+    };
    this.maxSessionTurns = params.maxSessionTurns ?? -1;
    this.experimentalZedIntegration =
      params.experimentalZedIntegration ?? false;
@@ -1445,6 +1471,14 @@ export class Config {
    return this.experimentalJitContext;
  }

+  getToolOutputMaskingEnabled(): boolean {
+    return this.toolOutputMasking.enabled;
+  }
+
+  getToolOutputMaskingConfig(): ToolOutputMaskingConfig {
+    return this.toolOutputMasking;
+  }
+
  getGeminiMdFileCount(): number {
    if (this.experimentalJitContext && this.contextManager) {
      return this.contextManager.getLoadedPaths().size;
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -213,6 +213,7 @@ describe('Gemini Client (client.ts)', () => {
      getGlobalMemory: vi.fn().mockReturnValue(''),
      getEnvironmentMemory: vi.fn().mockReturnValue(''),
      isJitContextEnabled: vi.fn().mockReturnValue(false),
+      getToolOutputMaskingEnabled: vi.fn().mockReturnValue(false),
      getDisableLoopDetection: vi.fn().mockReturnValue(false),

      getSessionId: vi.fn().mockReturnValue('test-session-id'),
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -54,6 +54,7 @@ import { handleFallback } from '../fallback/handler.js';
 import type { RoutingContext } from '../routing/routingStrategy.js';
 import { debugLogger } from '../utils/debugLogger.js';
 import type { ModelConfigKey } from '../services/modelConfigService.js';
+import { ToolOutputMaskingService } from '../services/toolOutputMaskingService.js';
 import { calculateRequestTokenCount } from '../utils/tokenCalculation.js';
 import {
  applyModelSelection,
@@ -84,6 +85,7 @@ export class GeminiClient {

  private readonly loopDetector: LoopDetectionService;
  private readonly compressionService: ChatCompressionService;
+  private readonly toolOutputMaskingService: ToolOutputMaskingService;
  private lastPromptId: string;
  private currentSequenceModel: string | null = null;
  private lastSentIdeContext: IdeContext | undefined;
@@ -98,6 +100,7 @@ export class GeminiClient {
  constructor(private readonly config: Config) {
    this.loopDetector = new LoopDetectionService(config);
    this.compressionService = new ChatCompressionService();
+    this.toolOutputMaskingService = new ToolOutputMaskingService();
    this.lastPromptId = this.config.getSessionId();

    coreEvents.on(CoreEvent.ModelChanged, this.handleModelChanged);
@@ -562,6 +565,8 @@ export class GeminiClient {
    const remainingTokenCount =
      tokenLimit(modelForLimitCheck) - this.getChat().getLastPromptTokenCount();

+    await this.tryMaskToolOutputs(this.getHistory());
+
    // Estimate tokens. For text-only requests, we estimate based on character length.
    // For requests with non-text parts (like images, tools), we use the countTokens API.
    const estimatedRequestTokenCount = await calculateRequestTokenCount(
@@ -1056,4 +1061,20 @@ export class GeminiClient {

    return info;
  }
+
+  /**
+   * Masks bulky tool outputs to save context window space.
+   */
+  private async tryMaskToolOutputs(history: Content[]): Promise<void> {
+    if (!this.config.getToolOutputMaskingEnabled()) {
+      return;
+    }
+    const result = await this.toolOutputMaskingService.mask(
+      history,
+      this.config,
+    );
+    if (result.maskedCount > 0) {
+      this.getChat().setHistory(result.newHistory);
+    }
+  }
 }
--- a/packages/core/src/services/__snapshots__/toolOutputMaskingService.test.ts.snap
+++ b/packages/core/src/services/__snapshots__/toolOutputMaskingService.test.ts.snap
@@ -0,0 +1,31 @@
+// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
+
+exports[`ToolOutputMaskingService > should match the expected snapshot for a masked tool output 1`] = `
+"<tool_output_masked>
+Line
+Line
+Line
+Line
+Line
+Line
+Line
+Line
+Line
+Line
+
+... [6 lines omitted] ...
+
+Line
+Line
+Line
+Line
+Line
+Line
+Line
+Line
+Line
+
+
+Output too large. Full output available at: /mock/history/tool-outputs/run_shell_command_deterministic.txt
+</tool_output_masked>"
+`;
--- a/packages/core/src/services/toolOutputMaskingService.test.ts
+++ b/packages/core/src/services/toolOutputMaskingService.test.ts
@@ -0,0 +1,500 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+import {
+  ToolOutputMaskingService,
+  MASKING_INDICATOR_TAG,
+} from './toolOutputMaskingService.js';
+import { SHELL_TOOL_NAME } from '../tools/tool-names.js';
+import { estimateTokenCountSync } from '../utils/tokenCalculation.js';
+import type { Config } from '../config/config.js';
+import type { Content, Part } from '@google/genai';
+
+vi.mock('../utils/tokenCalculation.js', () => ({
+  estimateTokenCountSync: vi.fn(),
+}));
+
+vi.mock('node:fs/promises', () => ({
+  mkdir: vi.fn().mockResolvedValue(undefined),
+  writeFile: vi.fn().mockResolvedValue(undefined),
+}));
+
+describe('ToolOutputMaskingService', () => {
+  let service: ToolOutputMaskingService;
+  let mockConfig: Config;
+
+  const mockedEstimateTokenCountSync = vi.mocked(estimateTokenCountSync);
+
+  beforeEach(() => {
+    service = new ToolOutputMaskingService();
+    mockConfig = {
+      storage: {
+        getHistoryDir: () => '/mock/history',
+      },
+      getUsageStatisticsEnabled: () => false,
+      getToolOutputMaskingConfig: () => ({
+        enabled: true,
+        toolProtectionThreshold: 50000,
+        minPrunableTokensThreshold: 30000,
+        protectLatestTurn: true,
+      }),
+    } as unknown as Config;
+    vi.clearAllMocks();
+  });
+
+  it('should not mask if total tool tokens are below protection threshold', async () => {
+    const history: Content[] = [
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 'test_tool',
+              response: { output: 'small output' },
+            },
+          },
+        ],
+      },
+    ];
+
+    mockedEstimateTokenCountSync.mockReturnValue(100);
+
+    const result = await service.mask(history, mockConfig);
+
+    expect(result.maskedCount).toBe(0);
+    expect(result.newHistory).toEqual(history);
+  });
+
+  const getToolResponse = (part: Part | undefined): string => {
+    const resp = part?.functionResponse?.response as
+      | { output: string }
+      | undefined;
+    return resp?.output ?? (resp as unknown as string) ?? '';
+  };
+
+  it('should protect the latest turn and mask older outputs beyond 50k window if total > 30k', async () => {
+    // History:
+    // Turn 1: 60k (Oldest)
+    // Turn 2: 20k
+    // Turn 3: 10k (Latest) - Protected because protectLatestTurn is true
+    const history: Content[] = [
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 't1',
+              response: { output: 'A'.repeat(60000) },
+            },
+          },
+        ],
+      },
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 't2',
+              response: { output: 'B'.repeat(20000) },
+            },
+          },
+        ],
+      },
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 't3',
+              response: { output: 'C'.repeat(10000) },
+            },
+          },
+        ],
+      },
+    ];
+
+    mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
+      const toolName = parts[0].functionResponse?.name;
+      const resp = parts[0].functionResponse?.response as Record<
+        string,
+        unknown
+      >;
+      const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
+      if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
+
+      if (toolName === 't1') return 60000;
+      if (toolName === 't2') return 20000;
+      if (toolName === 't3') return 10000;
+      return 0;
+    });
+
+    // Scanned: Turn 2 (20k), Turn 1 (60k). Total = 80k.
+    // Turn 2: Cumulative = 20k. Protected (<= 50k).
+    // Turn 1: Cumulative = 80k. Crossed 50k boundary. Prunable.
+    // Total Prunable = 60k (> 30k trigger).
+    const result = await service.mask(history, mockConfig);
+
+    expect(result.maskedCount).toBe(1);
+    expect(getToolResponse(result.newHistory[0].parts?.[0])).toContain(
+      `<${MASKING_INDICATOR_TAG}`,
+    );
+    expect(getToolResponse(result.newHistory[1].parts?.[0])).toEqual(
+      'B'.repeat(20000),
+    );
+    expect(getToolResponse(result.newHistory[2].parts?.[0])).toEqual(
+      'C'.repeat(10000),
+    );
+  });
+
+  it('should perform global aggregation for many small parts once boundary is hit', async () => {
+    // history.length = 12. Skip index 11 (latest).
+    // Indices 0-10: 10k each.
+    // Index 10: 10k (Sum 10k)
+    // Index 9: 10k (Sum 20k)
+    // Index 8: 10k (Sum 30k)
+    // Index 7: 10k (Sum 40k)
+    // Index 6: 10k (Sum 50k) - still protected: the boundary check is strictly > 50k.
+    // Index 5: 10k (Sum 60k) - crosses the 50k boundary, so it is prunable.
+    // Indices 5, 4, 3, 2, 1, 0 are all prunable. (6 * 10k = 60k).
+    const history: Content[] = Array.from({ length: 12 }, (_, i) => ({
+      role: 'user',
+      parts: [
+        {
+          functionResponse: {
+            name: `tool${i}`,
+            response: { output: 'A'.repeat(10000) },
+          },
+        },
+      ],
+    }));
+
+    mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
+      const resp = parts[0].functionResponse?.response as
+        | { output?: string; result?: string }
+        | string
+        | undefined;
+      const content =
+        typeof resp === 'string'
+          ? resp
+          : resp?.output || resp?.result || JSON.stringify(resp);
+      if (content?.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
+      return content?.length || 0;
+    });
+
+    const result = await service.mask(history, mockConfig);
+
+    expect(result.maskedCount).toBe(6); // masks 0-5; 50k window protects 6-10
+    expect(result.tokensSaved).toBeGreaterThan(0);
+  });
+
+  it('should verify tool-aware previews (shell vs generic)', async () => {
+    const shellHistory: Content[] = [
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: SHELL_TOOL_NAME,
+              response: {
+                output:
+                  'Output: line1\nline2\nline3\nline4\nline5\nError: failed\nExit Code: 1',
+              },
+            },
+          },
+        ],
+      },
+      // Protection buffer
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 'p',
+              response: { output: 'p'.repeat(60000) },
+            },
+          },
+        ],
+      },
+      // Latest turn
+      {
+        role: 'user',
+        parts: [{ functionResponse: { name: 'l', response: { output: 'l' } } }],
+      },
+    ];
+
+    mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
+      const name = parts[0].functionResponse?.name;
+      const resp = parts[0].functionResponse?.response as Record<
+        string,
+        unknown
+      >;
+      const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
+      if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
+
+      if (name === SHELL_TOOL_NAME) return 100000;
+      if (name === 'p') return 60000;
+      return 100;
+    });
+
+    const result = await service.mask(shellHistory, mockConfig);
+    const maskedBash = getToolResponse(result.newHistory[0].parts?.[0]);
+
+    expect(maskedBash).toContain('Output: line1\nline2\nline3\nline4\nline5');
+    expect(maskedBash).toContain('Exit Code: 1');
+    expect(maskedBash).toContain('Error: failed');
+  });
+
+  it('should skip already masked content and not count it towards totals', async () => {
+    const history: Content[] = [
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 'tool1',
+              response: {
+                output: `<${MASKING_INDICATOR_TAG}>...</${MASKING_INDICATOR_TAG}>`,
+              },
+            },
+          },
+        ],
+      },
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 'tool2',
+              response: { output: 'A'.repeat(60000) },
+            },
+          },
+        ],
+      },
+    ];
+    mockedEstimateTokenCountSync.mockReturnValue(60000);
+
+    const result = await service.mask(history, mockConfig);
+    expect(result.maskedCount).toBe(0); // tool1 skipped, tool2 is the "latest" which is protected
+  });
+
+  it('should handle different response keys in masked update', async () => {
+    const history: Content[] = [
+      {
+        role: 'model',
+        parts: [
+          {
+            functionResponse: {
+              name: 't1',
+              response: { result: 'A'.repeat(60000) },
+            },
+          },
+        ],
+      },
+      {
+        role: 'model',
+        parts: [
+          {
+            functionResponse: {
+              name: 'p',
+              response: { output: 'P'.repeat(60000) },
+            },
+          },
+        ],
+      },
+      { role: 'user', parts: [{ text: 'latest' }] },
+    ];
+
+    mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
+      const resp = parts[0].functionResponse?.response as Record<
+        string,
+        unknown
+      >;
+      const content =
+        (resp?.['output'] as string) ??
+        (resp?.['result'] as string) ??
+        JSON.stringify(resp);
+      if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
+      return 60000;
+    });
+
+    const result = await service.mask(history, mockConfig);
+    expect(result.maskedCount).toBe(2); // both t1 and p are prunable (cumulative 60k and 120k)
+    const responseObj = result.newHistory[0].parts?.[0].functionResponse
+      ?.response as Record<string, unknown>;
+    expect(Object.keys(responseObj)).toEqual(['output']);
+  });
+
+  it('should preserve multimodal parts while masking tool responses', async () => {
+    const history: Content[] = [
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 't1',
+              response: { output: 'A'.repeat(60000) },
+            },
+          },
+          {
+            inlineData: {
+              data: 'base64data',
+              mimeType: 'image/png',
+            },
+          },
+        ],
+      },
+      // Protection buffer
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 'p',
+              response: { output: 'p'.repeat(60000) },
+            },
+          },
+        ],
+      },
+      // Latest turn
+      { role: 'user', parts: [{ text: 'latest' }] },
+    ];
+
+    mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
+      const resp = parts[0].functionResponse?.response as Record<
+        string,
+        unknown
+      >;
+      const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
+      if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
+
+      if (parts[0].functionResponse?.name === 't1') return 60000;
+      if (parts[0].functionResponse?.name === 'p') return 60000;
+      return 100;
+    });
+
+    const result = await service.mask(history, mockConfig);
+
+    expect(result.maskedCount).toBe(2); // both are prunable: p alone (60k) exceeds the 50k window, and t1 is older
+    expect(result.newHistory[0].parts).toHaveLength(2);
+    expect(result.newHistory[0].parts?.[0].functionResponse).toBeDefined();
+    expect(
+      (
+        result.newHistory[0].parts?.[0].functionResponse?.response as Record<
+          string,
+          unknown
+        >
+      )['output'],
+    ).toContain(`<${MASKING_INDICATOR_TAG}`);
+    expect(result.newHistory[0].parts?.[1].inlineData).toEqual({
+      data: 'base64data',
+      mimeType: 'image/png',
+    });
+  });
+
+  it('should match the expected snapshot for a masked tool output', async () => {
+    const history: Content[] = [
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: SHELL_TOOL_NAME,
+              response: {
+                output: 'Line\n'.repeat(25),
+                exitCode: 0,
+              },
+            },
+          },
+        ],
+      },
+      // Buffer to push shell_tool into prunable territory
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 'padding',
+              response: { output: 'B'.repeat(60000) },
+            },
+          },
+        ],
+      },
+      { role: 'user', parts: [{ text: 'latest' }] },
+    ];
+
+    mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
+      const resp = parts[0].functionResponse?.response as Record<
+        string,
+        unknown
+      >;
+      const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
+      if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
+
+      if (parts[0].functionResponse?.name === SHELL_TOOL_NAME) return 1000;
+      if (parts[0].functionResponse?.name === 'padding') return 60000;
+      return 10;
+    });
+
+    const result = await service.mask(history, mockConfig);
+
+    // Verify complete masking: only 'output' key should exist
+    const responseObj = result.newHistory[0].parts?.[0].functionResponse
+      ?.response as Record<string, unknown>;
+    expect(Object.keys(responseObj)).toEqual(['output']);
+
+    const response = responseObj['output'] as string;
+
+    // We replace the random part of the filename for deterministic snapshots
+    // and normalize path separators for cross-platform compatibility
+    const deterministicResponse = response
+      .replace(
+        new RegExp(`${SHELL_TOOL_NAME}_[^\\s"]+\\.txt`, 'g'),
+        `${SHELL_TOOL_NAME}_deterministic.txt`,
+      )
+      .replace(/\\/g, '/');
+
+    expect(deterministicResponse).toMatchSnapshot();
+  });
+
+  it('should not mask if masking increases token count (due to overhead)', async () => {
+    const history: Content[] = [
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 'tiny_tool',
+              response: { output: 'tiny' },
+            },
+          },
+        ],
+      },
+      // Protection buffer to push tiny_tool into prunable territory
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              name: 'padding',
+              response: { output: 'B'.repeat(60000) },
+            },
+          },
+        ],
+      },
+      { role: 'user', parts: [{ text: 'latest' }] },
+    ];
+
+    mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
+      if (parts[0].functionResponse?.name === 'tiny_tool') return 5;
+      if (parts[0].functionResponse?.name === 'padding') return 60000;
+      return 1000; // The masked version would be huge due to boilerplate
+    });
+
+    const result = await service.mask(history, mockConfig);
+    expect(result.maskedCount).toBe(0); // both parts are prunable, but the mock reports no token savings, so none is masked
+  });
+});
--- a/packages/core/src/services/toolOutputMaskingService.ts
+++ b/packages/core/src/services/toolOutputMaskingService.ts
@@ -0,0 +1,344 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import type { Content, Part } from '@google/genai';
+import path from 'node:path';
+import * as fsPromises from 'node:fs/promises';
+import { estimateTokenCountSync } from '../utils/tokenCalculation.js';
+import { debugLogger } from '../utils/debugLogger.js';
+import { sanitizeFilenamePart } from '../utils/fileUtils.js';
+import type { Config } from '../config/config.js';
+import { logToolOutputMasking } from '../telemetry/loggers.js';
+import { SHELL_TOOL_NAME } from '../tools/tool-names.js';
+import { ToolOutputMaskingEvent } from '../telemetry/types.js';
+
+// Tool output masking defaults
+export const DEFAULT_TOOL_PROTECTION_THRESHOLD = 50000;
+export const DEFAULT_MIN_PRUNABLE_TOKENS_THRESHOLD = 30000;
+export const DEFAULT_PROTECT_LATEST_TURN = true;
+export const MASKING_INDICATOR_TAG = 'tool_output_masked';
+
+export const TOOL_OUTPUTS_DIR = 'tool-outputs';
+
+export interface MaskingResult {
+  newHistory: Content[];
+  maskedCount: number;
+  tokensSaved: number;
+}
+
+/**
+ * Service to manage context window efficiency by masking bulky tool outputs (Tool Output Masking).
+ *
+ * It implements a "Hybrid Backward Scanned FIFO" algorithm to balance context relevance with
+ * token savings:
+ * 1. **Protection Window**: Protects the newest `toolProtectionThreshold` (default 50k) tool tokens
+ *    from pruning. Optionally skips the entire latest conversation turn to ensure full context for
+ *    the model's next response.
+ * 2. **Global Aggregation**: Scans backwards past the protection window to identify all remaining
+ *    tool outputs that haven't been masked yet.
+ * 3. **Batch Trigger**: Trigger masking only if the total prunable tokens exceed
+ *    `minPrunableTokensThreshold` (default 30k).
+ *
+ * @remarks
+ * Effectively, with many small outputs, masking only starts once the scanned history holds about
+ * 80k tokens of unmasked tool outputs (50k protected + 30k prunable buffer). Small tool outputs
+ * are preserved until they collectively reach the threshold.
+ */
+export class ToolOutputMaskingService {
+  async mask(history: Content[], config: Config): Promise<MaskingResult> {
+    if (history.length === 0) {
+      return { newHistory: history, maskedCount: 0, tokensSaved: 0 };
+    }
+
+    let cumulativeToolTokens = 0;
+    let protectionBoundaryReached = false;
+    let totalPrunableTokens = 0;
+    let maskedCount = 0;
+
+    const prunableParts: Array<{
+      contentIndex: number;
+      partIndex: number;
+      tokens: number;
+      content: string;
+      originalPart: Part;
+    }> = [];
+
+    const maskingConfig = config.getToolOutputMaskingConfig();
+
+    // Decide where to start scanning.
+    // If protectLatestTurn is enabled, we skip the most recent message (index history.length - 1).
+    const scanStartIdx = maskingConfig.protectLatestTurn
+      ? history.length - 2
+      : history.length - 1;
+
+    // Backward scan to identify prunable tool outputs
+    for (let i = scanStartIdx; i >= 0; i--) {
+      const content = history[i];
+      const parts = content.parts || [];
+
+      for (let j = parts.length - 1; j >= 0; j--) {
+        const part = parts[j];
+
+        // Tool outputs (functionResponse) are the primary targets for pruning because
+        // they often contain voluminous data (e.g., shell logs, file content) that
+        // can exceed context limits. We preserve other parts—such as user text,
+        // model reasoning, and multimodal data—because they define the conversation's
+        // core intent and logic, which are harder for the model to recover if lost.
+        if (!part.functionResponse) continue;
+
+        const toolOutputContent = this.getToolOutputContent(part);
+        if (!toolOutputContent || this.isAlreadyMasked(toolOutputContent)) {
+          continue;
+        }
+
+        const partTokens = estimateTokenCountSync([part]);
+
+        if (!protectionBoundaryReached) {
+          cumulativeToolTokens += partTokens;
+          if (cumulativeToolTokens > maskingConfig.toolProtectionThreshold) {
+            protectionBoundaryReached = true;
+            // The part that crossed the boundary is prunable.
+            totalPrunableTokens += partTokens;
+            prunableParts.push({
+              contentIndex: i,
+              partIndex: j,
+              tokens: partTokens,
+              content: toolOutputContent,
+              originalPart: part,
+            });
+          }
+        } else {
+          totalPrunableTokens += partTokens;
+          prunableParts.push({
+            contentIndex: i,
+            partIndex: j,
+            tokens: partTokens,
+            content: toolOutputContent,
+            originalPart: part,
+          });
+        }
+      }
+    }
+
+    // Trigger pruning only if we have accumulated enough savings to justify the
+    // overhead of masking and file I/O (batch pruning threshold).
+    if (totalPrunableTokens < maskingConfig.minPrunableTokensThreshold) {
+      return { newHistory: history, maskedCount: 0, tokensSaved: 0 };
+    }
+
+    debugLogger.debug(
+      `[ToolOutputMasking] Triggering masking. Prunable tool tokens: ${totalPrunableTokens.toLocaleString()} (> ${maskingConfig.minPrunableTokensThreshold.toLocaleString()})`,
+    );
+
+    // Perform masking and offloading
+    const newHistory = [...history]; // Shallow copy of history
+    let actualTokensSaved = 0;
+    const toolOutputsDir = path.join(
+      config.storage.getHistoryDir(),
+      TOOL_OUTPUTS_DIR,
+    );
+    await fsPromises.mkdir(toolOutputsDir, { recursive: true });
+
+    for (const item of prunableParts) {
+      const { contentIndex, partIndex, content, tokens } = item;
+      const contentRecord = newHistory[contentIndex];
+      const part = contentRecord.parts![partIndex];
+
+      if (!part.functionResponse) continue;
+
+      const toolName = part.functionResponse.name || 'unknown_tool';
+      const callId = part.functionResponse.id || Date.now().toString();
+      const safeToolName = sanitizeFilenamePart(toolName).toLowerCase();
+      const safeCallId = sanitizeFilenamePart(callId).toLowerCase();
+      const fileName = `${safeToolName}_${safeCallId}_${Math.random()
+        .toString(36)
+        .substring(7)}.txt`;
+      const filePath = path.join(toolOutputsDir, fileName);
+
+      await fsPromises.writeFile(filePath, content, 'utf-8');
+
+      const originalResponse =
+        (part.functionResponse.response as Record<string, unknown>) || {};
+
+      const totalLines = content.split('\n').length;
+      const fileSizeMB = (
+        Buffer.byteLength(content, 'utf8') /
+        1024 /
+        1024
+      ).toFixed(2);
+
+      let preview = '';
+      if (toolName === SHELL_TOOL_NAME) {
+        preview = this.formatShellPreview(originalResponse);
+      } else {
+        // General tools: Head + Tail preview (250 chars each)
+        if (content.length > 500) {
+          preview = `${content.slice(0, 250)}\n... [TRUNCATED] ...\n${content.slice(-250)}`;
+        } else {
+          preview = content;
+        }
+      }
+
+      const maskedSnippet = this.formatMaskedSnippet({
+        toolName,
+        filePath,
+        fileSizeMB,
+        totalLines,
+        tokens,
+        preview,
+      });
+
+      const maskedPart = {
+        ...part,
+        functionResponse: {
+          ...part.functionResponse,
+          response: { output: maskedSnippet },
+        },
+      };
+
+      const newTaskTokens = estimateTokenCountSync([maskedPart]);
+      const savings = tokens - newTaskTokens;
+
+      if (savings > 0) {
+        const newParts = [...contentRecord.parts!];
+        newParts[partIndex] = maskedPart;
+        newHistory[contentIndex] = { ...contentRecord, parts: newParts };
+        actualTokensSaved += savings;
+        maskedCount++;
+      }
+    }
+
+    debugLogger.debug(
+      `[ToolOutputMasking] Masked ${maskedCount} tool outputs. Saved ~${actualTokensSaved.toLocaleString()} tokens.`,
+    );
+
+    const result = {
+      newHistory,
+      maskedCount,
+      tokensSaved: actualTokensSaved,
+    };
+
+    if (actualTokensSaved <= 0) {
+      return result;
+    }
+
+    logToolOutputMasking(
+      config,
+      new ToolOutputMaskingEvent({
+        tokens_before: totalPrunableTokens,
+        tokens_after: totalPrunableTokens - actualTokensSaved,
+        masked_count: maskedCount,
+        total_prunable_tokens: totalPrunableTokens,
+      }),
+    );
+
+    return result;
+  }
+
+  /**
+   * Serializes the response of a tool call so it can be saved and inspected.
+   *
+   * The whole `functionResponse.response` object is pretty-printed as JSON
+   * with a 2-space indent, so no knowledge of a particular tool's response
+   * shape is required.
+   *
+   * @param part A history part that may carry a `functionResponse`.
+   * @returns The JSON text of the response, or `null` when the part has no
+   *   `functionResponse` or its `response` is falsy.
+   */
+  private getToolOutputContent(part: Part): string | null {
+    // Only the response payload is read here. mask() spreads the original part
+    // and swaps just `functionResponse.response`, so any other fields on the
+    // part are carried over untouched.
+    const payload = part.functionResponse?.response as
+      | Record<string, unknown>
+      | undefined;
+    return payload ? JSON.stringify(payload, null, 2) : null;
+  }
+
+  /**
+   * Tells whether `content` contains the opening of a masking indicator tag.
+   *
+   * @param content Text to inspect.
+   * @returns `true` when `<` followed by `MASKING_INDICATOR_TAG` occurs
+   *   anywhere in `content`.
+   */
+  private isAlreadyMasked(content: string): boolean {
+    const openingMarker = `<${MASKING_INDICATOR_TAG}`;
+    return content.includes(openingMarker);
+  }
+
+  /**
+   * Builds a condensed preview of a shell tool response.
+   *
+   * The text is read from `response.output`, falling back to
+   * `response.stdout`. It is split on the section prefixes listed in
+   * `sectionRegex`: the `Output` section is shortened with
+   * `formatSimplePreview`, every other section is kept in full. Text that
+   * contains none of those prefixes is shortened as a whole.
+   *
+   * @param response The original `functionResponse.response` object.
+   * @returns The preview text.
+   */
+  private formatShellPreview(response: Record<string, unknown>): string {
+    // Hold the value as `unknown` until it has been narrowed. Asserting
+    // `string` up front would make the non-string guard below look
+    // unreachable to the type checker even though it can run.
+    const raw: unknown = response['output'] || response['stdout'] || '';
+    if (typeof raw !== 'string') {
+      return typeof raw === 'object' ? JSON.stringify(raw) : String(raw);
+    }
+    const content = raw;
+
+    // The shell tool output is structured in shell.ts with specific section prefixes:
+    const sectionRegex =
+      /^(Output|Error|Exit Code|Signal|Background PIDs|Process Group PGID): /m;
+    // Splitting on a regex with a capture group interleaves the captured
+    // names with the text between them: [preamble, name, body, name, body, ...].
+    const parts = content.split(sectionRegex);
+
+    if (parts.length < 3) {
+      // Fallback to simple head/tail if not in expected shell.ts format
+      return this.formatSimplePreview(content);
+    }
+
+    const previewParts: string[] = [];
+    // Text that precedes the first recognized section prefix.
+    if (parts[0].trim()) {
+      previewParts.push(this.formatSimplePreview(parts[0].trim()));
+    }
+
+    // Walk the (name, body) pairs produced by the split.
+    for (let i = 1; i < parts.length; i += 2) {
+      const name = parts[i];
+      const sectionContent = parts[i + 1]?.trim() || '';
+
+      if (name === 'Output') {
+        previewParts.push(
+          `Output: ${this.formatSimplePreview(sectionContent)}`,
+        );
+      } else {
+        // Keep other sections (Error, Exit Code, etc.) in full as they are usually high-signal and small
+        previewParts.push(`${name}: ${sectionContent}`);
+      }
+    }
+
+    let preview = previewParts.join('\n');
+
+    // Also check root levels just in case some tool uses them or for future-proofing
+    const exitCode = response['exitCode'] ?? response['exit_code'];
+    const error = response['error'];
+    // Append a non-zero root-level exit code unless the text already states it.
+    if (
+      exitCode !== undefined &&
+      exitCode !== 0 &&
+      exitCode !== null &&
+      !content.includes(`Exit Code: ${exitCode}`)
+    ) {
+      preview += `\n[Exit Code: ${exitCode}]`;
+    }
+    // Same for a truthy root-level error.
+    if (error && !content.includes(`Error: ${error}`)) {
+      preview += `\n[Error: ${error}]`;
+    }
+
+    return preview;
+  }
+
+  /**
+   * Shortens multi-line text to its first and last 10 lines.
+   *
+   * @param content Text to shorten; lines are separated by `\n`.
+   * @returns `content` unchanged when it has 20 lines or fewer; otherwise the
+   *   first 10 lines, a marker stating how many lines were left out, and the
+   *   last 10 lines, separated by blank lines.
+   */
+  private formatSimplePreview(content: string): string {
+    const rows = content.split('\n');
+    if (rows.length > 20) {
+      const top = rows.slice(0, 10);
+      const bottom = rows.slice(-10);
+      const omitted = rows.length - top.length - bottom.length;
+      return [
+        top.join('\n'),
+        `... [${omitted} lines omitted] ...`,
+        bottom.join('\n'),
+      ].join('\n\n');
+    }
+    return content;
+  }
+
+  /**
+   * Renders the text that stands in for a masked tool output.
+   *
+   * The result is the preview followed by a blank line and a pointer to the
+   * saved file, wrapped in an opening and closing `MASKING_INDICATOR_TAG`
+   * element. Only `filePath` and `preview` are read from `params`.
+   *
+   * @param params Details of the masked output.
+   * @returns The snippet text.
+   */
+  private formatMaskedSnippet(params: MaskedSnippetParams): string {
+    const openTag = `<${MASKING_INDICATOR_TAG}>`;
+    const closeTag = `</${MASKING_INDICATOR_TAG}>`;
+    const notice = `Output too large. Full output available at: ${params.filePath}`;
+    return (
+      openTag + '\n' + params.preview + '\n\n' + notice + '\n' + closeTag
+    );
+  }
+}
+
+/**
+ * Values handed to `formatMaskedSnippet` for one masked tool output.
+ *
+ * NOTE(review): `formatMaskedSnippet` currently reads only `filePath` and
+ * `preview`; the remaining fields are passed by the caller but not rendered.
+ */
+interface MaskedSnippetParams {
+  /** Name of the tool whose output is being masked. */
+  toolName: string;
+  /** Path of the file the full output was written to. */
+  filePath: string;
+  /** Size of the saved content in MiB, formatted with two decimals. */
+  fileSizeMB: string;
+  /** Number of `\n`-separated lines in the saved content. */
+  totalLines: number;
+  /** Token count of the original, unmasked part as supplied by the caller. */
+  tokens: number;
+  /** Shortened preview of the output that is embedded in the snippet. */
+  preview: string;
+}
--- a/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts
+++ b/packages/core/src/telemetry/clearcut-logger/clearcut-logger.ts
@@ -46,6 +46,7 @@ import type {
  ApprovalModeSwitchEvent,
  ApprovalModeDurationEvent,
  PlanExecutionEvent,
+  ToolOutputMaskingEvent,
 } from '../types.js';
 import { EventMetadataKey } from './event-metadata-key.js';
 import type { Config } from '../../config/config.js';
@@ -108,6 +109,7 @@ export enum EventNames {
  APPROVAL_MODE_SWITCH = 'approval_mode_switch',
  APPROVAL_MODE_DURATION = 'approval_mode_duration',
  PLAN_EXECUTION = 'plan_execution',
+  TOOL_OUTPUT_MASKING = 'tool_output_masking',
 }

 export interface LogResponse {
@@ -1217,8 +1219,40 @@ export class ClearcutLogger {
      },
    ];

+    const logEvent = this.createLogEvent(
+      EventNames.TOOL_OUTPUT_TRUNCATED,
+      data,
+    );
+    this.enqueueLogEvent(logEvent);
+    this.flushIfNeeded();
+  }
+
+  /**
+   * Queues a `TOOL_OUTPUT_MASKING` log event built from `event`.
+   *
+   * The four counters on the event (tokens before, tokens after, masked
+   * count, total prunable tokens) are each converted to a string and attached
+   * under their matching `EventMetadataKey`. The event is then enqueued and
+   * `flushIfNeeded()` is called.
+   *
+   * @param event The masking event to record.
+   */
+  logToolOutputMaskingEvent(event: ToolOutputMaskingEvent): void {
+    const data: EventValue[] = [
+      {
+        gemini_cli_key:
+          EventMetadataKey.GEMINI_CLI_TOOL_OUTPUT_MASKING_TOKENS_BEFORE,
+        value: event.tokens_before.toString(),
+      },
+      {
+        gemini_cli_key:
+          EventMetadataKey.GEMINI_CLI_TOOL_OUTPUT_MASKING_TOKENS_AFTER,
+        value: event.tokens_after.toString(),
+      },
+      {
+        gemini_cli_key:
+          EventMetadataKey.GEMINI_CLI_TOOL_OUTPUT_MASKING_MASKED_COUNT,
+        value: event.masked_count.toString(),
+      },
+      {
+        gemini_cli_key:
+          EventMetadataKey.GEMINI_CLI_TOOL_OUTPUT_MASKING_TOTAL_PRUNABLE_TOKENS,
+        value: event.total_prunable_tokens.toString(),
+      },
+    ];
+
    this.enqueueLogEvent(
-      this.createLogEvent(EventNames.TOOL_OUTPUT_TRUNCATED, data),
+      this.createLogEvent(EventNames.TOOL_OUTPUT_MASKING, data),
    );
    this.flushIfNeeded();
  }
--- a/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts
+++ b/packages/core/src/telemetry/clearcut-logger/event-metadata-key.ts
@@ -7,7 +7,7 @@
 // Defines valid event metadata keys for Clearcut logging.
 export enum EventMetadataKey {
  // Deleted enums: 24
-  // Next ID: 148
+  // Next ID: 152

  GEMINI_CLI_KEY_UNKNOWN = 0,

@@ -561,4 +561,20 @@ export enum EventMetadataKey {

  // Logs the classifier threshold used.
  GEMINI_CLI_ROUTING_CLASSIFIER_THRESHOLD = 147,
+
+  // ==========================================================================
+  // Tool Output Masking Event Keys
+  // ==========================================================================
+
+  // Logs the total tokens in the prunable block before masking.
+  GEMINI_CLI_TOOL_OUTPUT_MASKING_TOKENS_BEFORE = 148,
+
+  // Logs the total tokens in the masked remnants after masking.
+  GEMINI_CLI_TOOL_OUTPUT_MASKING_TOKENS_AFTER = 149,
+
+  // Logs the number of tool outputs masked in this operation.
+  GEMINI_CLI_TOOL_OUTPUT_MASKING_MASKED_COUNT = 150,
+
+  // Logs the total prunable tokens identified at the trigger point.
+  GEMINI_CLI_TOOL_OUTPUT_MASKING_TOTAL_PRUNABLE_TOKENS = 151,
 }
--- a/packages/core/src/telemetry/loggers.ts
+++ b/packages/core/src/telemetry/loggers.ts
@@ -56,6 +56,7 @@ import type {
  StartupStatsEvent,
  LlmLoopCheckEvent,
  PlanExecutionEvent,
+  ToolOutputMaskingEvent,
 } from './types.js';
 import {
  recordApiErrorMetrics,
@@ -163,6 +164,21 @@ export function logToolOutputTruncated(
  });
 }

+/**
+ * Records a tool-output masking event.
+ *
+ * The event is forwarded to the Clearcut logger when
+ * `ClearcutLogger.getInstance` returns one, and a callback that emits a log
+ * record (body from `toLogBody()`, attributes from
+ * `toOpenTelemetryAttributes(config)`) is handed to `bufferTelemetryEvent`.
+ *
+ * @param config Configuration used to look up the logger and build attributes.
+ * @param event The masking event to record.
+ */
+export function logToolOutputMasking(
+  config: Config,
+  event: ToolOutputMaskingEvent,
+): void {
+  const clearcut = ClearcutLogger.getInstance(config);
+  clearcut?.logToolOutputMaskingEvent(event);
+
+  const emitRecord = (): void => {
+    const otelLogger = logs.getLogger(SERVICE_NAME);
+    const record: LogRecord = {
+      body: event.toLogBody(),
+      attributes: event.toOpenTelemetryAttributes(config),
+    };
+    otelLogger.emit(record);
+  };
+  bufferTelemetryEvent(emitRecord);
+}
+
 export function logFileOperation(
  config: Config,
  event: FileOperationEvent,
--- a/packages/core/src/telemetry/types.ts
+++ b/packages/core/src/telemetry/types.ts
@@ -1376,6 +1376,49 @@ export class ToolOutputTruncatedEvent implements BaseTelemetryEvent {
  }
 }

+export const EVENT_TOOL_OUTPUT_MASKING = 'gemini_cli.tool_output_masking';
+
+/**
+ * Telemetry event describing one tool-output masking pass.
+ *
+ * Carries the token counts before and after masking, the number of tool
+ * outputs that were masked, and the total prunable token count.
+ */
+export class ToolOutputMaskingEvent implements BaseTelemetryEvent {
+  'event.name': 'tool_output_masking';
+  'event.timestamp': string;
+  tokens_before: number;
+  tokens_after: number;
+  masked_count: number;
+  total_prunable_tokens: number;
+
+  /**
+   * @param details Counters for the masking pass; each is copied onto the
+   *   event unchanged. The timestamp is set to the current time as an ISO
+   *   string.
+   */
+  constructor(details: {
+    tokens_before: number;
+    tokens_after: number;
+    masked_count: number;
+    total_prunable_tokens: number;
+  }) {
+    const { tokens_before, tokens_after, masked_count, total_prunable_tokens } =
+      details;
+    this['event.name'] = 'tool_output_masking';
+    this['event.timestamp'] = new Date().toISOString();
+    this.tokens_before = tokens_before;
+    this.tokens_after = tokens_after;
+    this.masked_count = masked_count;
+    this.total_prunable_tokens = total_prunable_tokens;
+  }
+
+  /**
+   * Builds the attribute set for the log record: the common attributes for
+   * `config`, then the event name, timestamp and the four counters.
+   */
+  toOpenTelemetryAttributes(config: Config): LogAttributes {
+    const attributes: LogAttributes = {
+      ...getCommonAttributes(config),
+      'event.name': EVENT_TOOL_OUTPUT_MASKING,
+      'event.timestamp': this['event.timestamp'],
+      tokens_before: this.tokens_before,
+      tokens_after: this.tokens_after,
+      masked_count: this.masked_count,
+      total_prunable_tokens: this.total_prunable_tokens,
+    };
+    return attributes;
+  }
+
+  /**
+   * Returns a one-line summary with the masked count and the tokens saved
+   * (`tokens_before - tokens_after`).
+   */
+  toLogBody(): string {
+    const saved = this.tokens_before - this.tokens_after;
+    return `Tool output masking (Masked ${this.masked_count} tool outputs. Saved ${saved} tokens)`;
+  }
+}
+
 export const EVENT_EXTENSION_UNINSTALL = 'gemini_cli.extension_uninstall';
 export class ExtensionUninstallEvent implements BaseTelemetryEvent {
  'event.name': 'extension_uninstall';
@@ -1602,6 +1645,7 @@ export type TelemetryEvent =
  | LlmLoopCheckEvent
  | StartupStatsEvent
  | WebFetchFallbackAttemptEvent
+  | ToolOutputMaskingEvent
  | EditStrategyEvent
  | PlanExecutionEvent
  | RewindEvent
--- a/packages/core/src/utils/fileUtils.ts
+++ b/packages/core/src/utils/fileUtils.ts
@@ -572,6 +572,14 @@ export async function fileExists(filePath: string): Promise<boolean> {
 const MAX_TRUNCATED_LINE_WIDTH = 1000;
 const MAX_TRUNCATED_CHARS = 4000;

+/**
+ * Makes a string safe to embed in a filename.
+ *
+ * Every character other than ASCII letters, digits, `_` and `-` is replaced
+ * with `_` (this includes `/`, `\` and `.`, so the result cannot contain a
+ * path separator or `..`). The length of the string is unchanged.
+ *
+ * @param part The raw text, e.g. a tool name or call id.
+ * @returns The sanitized text.
+ */
+export function sanitizeFilenamePart(part: string): string {
+  const unsafeChars = /[^a-zA-Z0-9_-]/g;
+  return part.replace(unsafeChars, '_');
+}
+
 /**
 * Formats a truncated message for tool output, handling multi-line and single-line (elephant) cases.
 */
@@ -623,11 +631,8 @@ export async function saveTruncatedToolOutput(
  id: string | number, // Accept string (callId) or number (truncationId)
  projectTempDir: string,
 ): Promise<{ outputFile: string; totalLines: number }> {
-  const safeToolName = toolName.replace(/[^a-z0-9]/gi, '_').toLowerCase();
-  const safeId = id
-    .toString()
-    .replace(/[^a-z0-9]/gi, '_')
-    .toLowerCase();
+  const safeToolName = sanitizeFilenamePart(toolName).toLowerCase();
+  const safeId = sanitizeFilenamePart(id.toString()).toLowerCase();
  const fileName = `${safeToolName}_${safeId}.txt`;
  const toolOutputDir = path.join(projectTempDir, TOOL_OUTPUT_DIR);
  const outputFile = path.join(toolOutputDir, fileName);
--- a/schemas/settings.schema.json
+++ b/schemas/settings.schema.json
@@ -1428,6 +1428,44 @@
      "default": {},
      "type": "object",
      "properties": {
+        "toolOutputMasking": {
+          "title": "Tool Output Masking",
+          "description": "Advanced settings for tool output masking to manage context window efficiency.",
+          "markdownDescription": "Advanced settings for tool output masking to manage context window efficiency.\n\n- Category: `Experimental`\n- Requires restart: `yes`\n- Default: `{}`",
+          "default": {},
+          "type": "object",
+          "properties": {
+            "enabled": {
+              "title": "Enable Tool Output Masking",
+              "description": "Enables tool output masking to save tokens.",
+              "markdownDescription": "Enables tool output masking to save tokens.\n\n- Category: `Experimental`\n- Requires restart: `yes`\n- Default: `false`",
+              "default": false,
+              "type": "boolean"
+            },
+            "toolProtectionThreshold": {
+              "title": "Tool Protection Threshold",
+              "description": "Minimum number of tokens to protect from masking (most recent tool outputs).",
+              "markdownDescription": "Minimum number of tokens to protect from masking (most recent tool outputs).\n\n- Category: `Experimental`\n- Requires restart: `yes`\n- Default: `50000`",
+              "default": 50000,
+              "type": "number"
+            },
+            "minPrunableTokensThreshold": {
+              "title": "Min Prunable Tokens Threshold",
+              "description": "Minimum prunable tokens required to trigger a masking pass.",
+              "markdownDescription": "Minimum prunable tokens required to trigger a masking pass.\n\n- Category: `Experimental`\n- Requires restart: `yes`\n- Default: `30000`",
+              "default": 30000,
+              "type": "number"
+            },
+            "protectLatestTurn": {
+              "title": "Protect Latest Turn",
+              "description": "Ensures the absolute latest turn is never masked, regardless of token count.",
+              "markdownDescription": "Ensures the absolute latest turn is never masked, regardless of token count.\n\n- Category: `Experimental`\n- Requires restart: `yes`\n- Default: `true`",
+              "default": true,
+              "type": "boolean"
+            }
+          },
+          "additionalProperties": false
+        },
        "enableAgents": {
          "title": "Enable Agents",
          "description": "Enable local and remote subagents. Warning: Experimental feature, uses YOLO mode for subagents",