// Files
// gemini-cli/packages/core/src/services/toolOutputMaskingService.test.ts
//
// 666 lines
// 19 KiB
// TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import {
ToolOutputMaskingService,
MASKING_INDICATOR_TAG,
} from './toolOutputMaskingService.js';
import {
SHELL_TOOL_NAME,
ACTIVATE_SKILL_TOOL_NAME,
MEMORY_TOOL_NAME,
} from '../tools/tool-names.js';
import { estimateTokenCountSync } from '../utils/tokenCalculation.js';
import type { Config } from '../config/config.js';
import type { Content, Part } from '@google/genai';
// Swap the token estimator module for a bare mock function; each test below
// installs the return values it needs on it.
vi.mock('../utils/tokenCalculation.js', () => {
  const estimateTokenCountSync = vi.fn();
  return { estimateTokenCountSync };
});
describe('ToolOutputMaskingService', () => {
let service: ToolOutputMaskingService;
let mockConfig: Config;
let testTempDir: string;
const mockedEstimateTokenCountSync = vi.mocked(estimateTokenCountSync);
// Per-test setup: a fresh scratch directory, a fresh service instance and a
// minimal Config stand-in exposing only the members listed here.
beforeEach(async () => {
  const tempPrefix = path.join(os.tmpdir(), 'tool-masking-test-');
  testTempDir = await fs.promises.mkdtemp(tempPrefix);

  service = new ToolOutputMaskingService();

  // Both storage locations resolve inside the scratch directory.
  const storage = {
    getHistoryDir: () => path.join(testTempDir, 'history'),
    getProjectTempDir: () => testTempDir,
  };

  const configStub = {
    storage,
    getSessionId: () => 'mock-session',
    getUsageStatisticsEnabled: () => false,
    getToolOutputMaskingEnabled: () => true,
    // Default thresholds; individual tests may replace this method.
    getToolOutputMaskingConfig: async () => {
      return {
        enabled: true,
        toolProtectionThreshold: 50000,
        minPrunableTokensThreshold: 30000,
        protectLatestTurn: true,
      };
    },
  };
  mockConfig = configStub as unknown as Config;

  vi.clearAllMocks();
});
// Per-test teardown: restore mocks, then delete the scratch directory if one
// was created.
afterEach(async () => {
  vi.restoreAllMocks();
  if (!testTempDir) {
    return;
  }
  await fs.promises.rm(testTempDir, { recursive: true, force: true });
});
it('should respect remote configuration overrides', async () => {
  // Replace the default masking config with tiny thresholds and no protection
  // for the latest turn.
  mockConfig.getToolOutputMaskingConfig = async () => {
    return {
      enabled: true,
      toolProtectionThreshold: 100, // Very low threshold
      minPrunableTokensThreshold: 50,
      protectLatestTurn: false,
    };
  };

  const onlyTurn: Content = {
    role: 'user',
    parts: [
      {
        functionResponse: {
          name: 'test_tool',
          response: { output: 'A'.repeat(200) },
        },
      },
    ],
  };
  const history: Content[] = [onlyTurn];

  // Content carrying the masking tag is reported as 10 tokens, anything else
  // as 200.
  mockedEstimateTokenCountSync.mockImplementation((parts) => {
    const payload = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const text = (payload?.['output'] as string) ?? JSON.stringify(payload);
    if (text.includes(MASKING_INDICATOR_TAG)) {
      return 10;
    }
    return 200;
  });

  const { maskedCount, tokensSaved } = await service.mask(history, mockConfig);

  // With low thresholds and protectLatestTurn=false, even the latest turn is
  // expected to be masked.
  expect(maskedCount).toBe(1);
  expect(tokensSaved).toBeGreaterThan(0);
});
it('should not mask if total tool tokens are below protection threshold', async () => {
  const smallResponse: Part = {
    functionResponse: {
      name: 'test_tool',
      response: { output: 'small output' },
    },
  };
  const history: Content[] = [{ role: 'user', parts: [smallResponse] }];

  // Every part is reported as 100 tokens.
  mockedEstimateTokenCountSync.mockReturnValue(100);

  const outcome = await service.mask(history, mockConfig);

  // Nothing is masked and the history comes back structurally equal.
  expect(outcome.maskedCount).toBe(0);
  expect(outcome.newHistory).toEqual(history);
});
/**
 * Reads the textual payload out of a function-response part.
 *
 * @param part - The part to inspect; may be undefined when the caller's
 *   optional chaining found no part at that position.
 * @returns The `output` string when the response carries one, the response
 *   itself when it is already a string, a JSON rendering of any other
 *   response shape, or '' when there is no response at all.
 */
const getToolResponse = (part: Part | undefined): string => {
  const response: unknown = part?.functionResponse?.response;
  if (response === undefined || response === null) {
    return '';
  }
  if (typeof response === 'string') {
    return response;
  }
  const output = (response as { output?: unknown }).output;
  if (typeof output === 'string') {
    return output;
  }
  // No usable `output` key: serialise instead of returning the raw object, so
  // the declared `string` return type is always honoured.
  return JSON.stringify(response) ?? '';
};
it('should protect the latest turn and mask older outputs beyond 50k window if total > 30k', async () => {
  // Builds a user turn holding exactly one tool response.
  const toolTurn = (name: string, output: string): Content => ({
    role: 'user',
    parts: [{ functionResponse: { name, response: { output } } }],
  });

  // Oldest to newest: t1 = 60k, t2 = 20k, t3 = 10k.
  // t3 is the latest turn and is shielded because protectLatestTurn is true.
  const history: Content[] = [
    toolTurn('t1', 'A'.repeat(60000)),
    toolTurn('t2', 'B'.repeat(20000)),
    toolTurn('t3', 'C'.repeat(10000)),
  ];

  const tokensByTool = new Map<string, number>([
    ['t1', 60000],
    ['t2', 20000],
    ['t3', 10000],
  ]);
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const fnResponse = parts[0].functionResponse;
    const payload = fnResponse?.response as Record<string, unknown>;
    const text = (payload?.['output'] as string) ?? JSON.stringify(payload);
    // Masked content is cheap no matter which tool produced it.
    if (text.includes(`<${MASKING_INDICATOR_TAG}`)) {
      return 100;
    }
    const toolName = fnResponse?.name;
    return toolName === undefined ? 0 : (tokensByTool.get(toolName) ?? 0);
  });

  // Scanned (latest turn excluded): t2 (20k), then t1 (60k). Total = 80k.
  // t2: cumulative 20k -> protected (<= 50k).
  // t1: cumulative 80k -> crosses the 50k boundary -> prunable.
  // Prunable total = 60k, which is above the 30k trigger.
  const { maskedCount, newHistory } = await service.mask(history, mockConfig);
  expect(maskedCount).toBe(1);

  const [oldest, middle, latest] = newHistory;
  expect(getToolResponse(oldest.parts?.[0])).toContain(
    `<${MASKING_INDICATOR_TAG}`,
  );
  expect(getToolResponse(middle.parts?.[0])).toEqual('B'.repeat(20000));
  expect(getToolResponse(latest.parts?.[0])).toEqual('C'.repeat(10000));
});
it('should perform global aggregation for many small parts once boundary is hit', async () => {
  // history.length = 12; every turn carries one 10k-token tool response.
  // Index 11 is the latest turn and is skipped (protectLatestTurn is true).
  // Scanning newest -> oldest from index 10:
  //   Index 10: cumulative 10k
  //   Index 9:  cumulative 20k
  //   Index 8:  cumulative 30k
  //   Index 7:  cumulative 40k
  //   Index 6:  cumulative 50k - still inside the 50k protection window
  //   Index 5:  cumulative 60k - first part beyond 50k; boundary hit here
  // Indices 5, 4, 3, 2, 1, 0 are therefore prunable (6 * 10k = 60k, above the
  // 30k trigger).
  const history: Content[] = Array.from({ length: 12 }, (_, i) => ({
    role: 'user',
    parts: [
      {
        functionResponse: {
          name: `tool${i}`,
          response: { output: 'A'.repeat(10000) },
        },
      },
    ],
  }));

  // Token count equals the payload length, except that masked content is
  // reported as 100 tokens.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const resp = parts[0].functionResponse?.response as
      | { output?: string; result?: string }
      | string
      | undefined;
    const content =
      typeof resp === 'string'
        ? resp
        : resp?.output || resp?.result || JSON.stringify(resp);
    if (content?.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
    return content?.length || 0;
  });

  const result = await service.mask(history, mockConfig);

  expect(result.maskedCount).toBe(6); // indices 0-5 lie beyond the 50k window
  expect(result.tokensSaved).toBeGreaterThan(0);

  // Pin which turns were masked: the six oldest are replaced, the rest
  // (including the protected latest turn) keep their original payload.
  const lastMaskedIndex = 5;
  result.newHistory.forEach((turn, index) => {
    const text = getToolResponse(turn.parts?.[0]);
    if (index <= lastMaskedIndex) {
      expect(text).toContain(`<${MASKING_INDICATOR_TAG}`);
    } else {
      expect(text).toEqual('A'.repeat(10000));
    }
  });
});
it('should verify tool-aware previews (shell vs generic)', async () => {
  const shellHistory: Content[] = [
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: SHELL_TOOL_NAME,
            response: {
              output:
                'Output: line1\nline2\nline3\nline4\nline5\nError: failed\nExit Code: 1',
            },
          },
        },
      ],
    },
    // Protection buffer
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: 'p',
            response: { output: 'p'.repeat(60000) },
          },
        },
      ],
    },
    // Latest turn
    {
      role: 'user',
      parts: [{ functionResponse: { name: 'l', response: { output: 'l' } } }],
    },
  ];

  // Masked content costs 100 tokens; the shell output is reported as 100000
  // and the buffer as 60000 so both exceed the protection window.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const name = parts[0].functionResponse?.name;
    const resp = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
    if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
    if (name === SHELL_TOOL_NAME) return 100000;
    if (name === 'p') return 60000;
    return 100;
  });

  const result = await service.mask(shellHistory, mockConfig);
  const maskedShell = getToolResponse(result.newHistory[0].parts?.[0]);

  // Every preview string checked below also occurs in the raw shell output,
  // so first prove the response really was replaced by a masked snippet.
  expect(maskedShell).toContain(`<${MASKING_INDICATOR_TAG}`);

  expect(maskedShell).toContain('Output: line1\nline2\nline3\nline4\nline5');
  expect(maskedShell).toContain('Exit Code: 1');
  expect(maskedShell).toContain('Error: failed');
});
it('should skip already masked content and not count it towards totals', async () => {
  // tool1 already carries a masked payload; tool2 is a fresh, bulky response.
  const alreadyMasked = `<${MASKING_INDICATOR_TAG}>...</${MASKING_INDICATOR_TAG}>`;
  const maskedTurn: Content = {
    role: 'user',
    parts: [
      {
        functionResponse: {
          name: 'tool1',
          response: { output: alreadyMasked },
        },
      },
    ],
  };
  const bulkyTurn: Content = {
    role: 'user',
    parts: [
      {
        functionResponse: {
          name: 'tool2',
          response: { output: 'A'.repeat(60000) },
        },
      },
    ],
  };
  const history: Content[] = [maskedTurn, bulkyTurn];

  // Every part is reported as 60000 tokens.
  mockedEstimateTokenCountSync.mockReturnValue(60000);

  const { maskedCount } = await service.mask(history, mockConfig);

  // tool1 is skipped and tool2 is the protected latest turn.
  expect(maskedCount).toBe(0);
});
it('should handle different response keys in masked update', async () => {
  // Builds a model turn holding one tool response with an arbitrary payload.
  const modelTurn = (
    name: string,
    response: Record<string, unknown>,
  ): Content => ({
    role: 'model',
    parts: [{ functionResponse: { name, response } }],
  });

  // t1 stores its payload under `result` rather than `output`.
  const history: Content[] = [
    modelTurn('t1', { result: 'A'.repeat(60000) }),
    modelTurn('p', { output: 'P'.repeat(60000) }),
    { role: 'user', parts: [{ text: 'latest' }] },
  ];

  // Masked content costs 100 tokens; every other response costs 60000.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const payload = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const text =
      (payload?.['output'] as string) ??
      (payload?.['result'] as string) ??
      JSON.stringify(payload);
    return text.includes(`<${MASKING_INDICATOR_TAG}`) ? 100 : 60000;
  });

  const outcome = await service.mask(history, mockConfig);

  // Both t1 and p are prunable (cumulative 60k and 120k).
  expect(outcome.maskedCount).toBe(2);

  // After masking, the response object exposes `output` as its only key.
  const maskedResponse = outcome.newHistory[0].parts?.[0].functionResponse
    ?.response as Record<string, unknown>;
  expect(Object.keys(maskedResponse)).toEqual(['output']);
});
it('should preserve multimodal parts while masking tool responses', async () => {
  const imagePart: Part = {
    inlineData: {
      data: 'base64data',
      mimeType: 'image/png',
    },
  };
  const history: Content[] = [
    // Tool response followed by an inline image in the same turn.
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: 't1',
            response: { output: 'A'.repeat(60000) },
          },
        },
        imagePart,
      ],
    },
    // Protection buffer
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: 'p',
            response: { output: 'p'.repeat(60000) },
          },
        },
      ],
    },
    // Latest turn
    { role: 'user', parts: [{ text: 'latest' }] },
  ];

  // Masked content costs 100 tokens; t1 and p cost 60000 each; anything else
  // costs 100.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const fnResponse = parts[0].functionResponse;
    const payload = fnResponse?.response as Record<string, unknown>;
    const text = (payload?.['output'] as string) ?? JSON.stringify(payload);
    if (text.includes(`<${MASKING_INDICATOR_TAG}`)) {
      return 100;
    }
    const toolName = fnResponse?.name;
    return toolName === 't1' || toolName === 'p' ? 60000 : 100;
  });

  const { maskedCount, newHistory } = await service.mask(history, mockConfig);

  // Both t1 and p are prunable (each 60k exceeds the 50k protection).
  expect(maskedCount).toBe(2);

  const firstTurnParts = newHistory[0].parts;
  expect(firstTurnParts).toHaveLength(2);

  // The tool response is still present, now with masked output...
  expect(firstTurnParts?.[0].functionResponse).toBeDefined();
  const maskedOutput = (
    firstTurnParts?.[0].functionResponse?.response as Record<string, unknown>
  )['output'];
  expect(maskedOutput).toContain(`<${MASKING_INDICATOR_TAG}`);

  // ...and the image part next to it is untouched.
  expect(firstTurnParts?.[1].inlineData).toEqual({
    data: 'base64data',
    mimeType: 'image/png',
  });
});
it('should match the expected snapshot for a masked tool output', async () => {
  const history: Content[] = [
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: SHELL_TOOL_NAME,
            response: {
              output: 'Line\n'.repeat(25),
              exitCode: 0,
            },
          },
        },
      ],
    },
    // Buffer to push shell_tool into prunable territory
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: 'padding',
            response: { output: 'B'.repeat(60000) },
          },
        },
      ],
    },
    { role: 'user', parts: [{ text: 'latest' }] },
  ];

  // Masked content costs 100 tokens, the shell output 1000, the padding 60000.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const resp = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
    if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
    if (parts[0].functionResponse?.name === SHELL_TOOL_NAME) return 1000;
    if (parts[0].functionResponse?.name === 'padding') return 60000;
    return 10;
  });

  const result = await service.mask(history, mockConfig);

  // Verify complete masking: only 'output' key should exist
  const responseObj = result.newHistory[0].parts?.[0].functionResponse
    ?.response as Record<string, unknown>;
  expect(Object.keys(responseObj)).toEqual(['output']);
  const response = responseObj['output'] as string;

  // We replace the random part of the filename for deterministic snapshots
  // and normalize path separators for cross-platform compatibility.
  const normalizedResponse = response.replace(/\\/g, '/');
  const normalizedTempDir = testTempDir.replace(/\\/g, '/');
  const deterministicResponse = normalizedResponse
    // Literal split/join instead of a RegExp built from the path: a temp dir
    // containing regex metacharacters would otherwise throw or mis-match.
    .split(normalizedTempDir)
    .join('/mock/temp')
    .replace(
      new RegExp(`${SHELL_TOOL_NAME}_[^\\s"]+\\.txt`, 'g'),
      `${SHELL_TOOL_NAME}_deterministic.txt`,
    );

  expect(deterministicResponse).toMatchSnapshot();
});
it('should not mask if masking increases token count (due to overhead)', async () => {
  const history: Content[] = [
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: 'tiny_tool',
            response: { output: 'tiny' },
          },
        },
      ],
    },
    // Buffer to push tiny_tool into prunable territory
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: 'padding',
            response: { output: 'B'.repeat(60000) },
          },
        },
      ],
    },
    { role: 'user', parts: [{ text: 'latest' }] },
  ];

  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const fnResponse = parts[0].functionResponse;
    const output = (
      fnResponse?.response as Record<string, unknown> | undefined
    )?.['output'];
    const isMasked =
      typeof output === 'string' &&
      output.includes(`<${MASKING_INDICATOR_TAG}`);

    if (fnResponse?.name === 'tiny_tool') {
      // A masked part keeps its tool name (the exempt-tools test in this file
      // asserts that), so the tag is what tells the replacement from the
      // original. The replacement is reported as far larger than the
      // 5-token original because of the masking boilerplate.
      return isMasked ? 1000 : 5;
    }
    // padding costs the same before and after masking, so masking it would
    // save nothing.
    if (fnResponse?.name === 'padding') return 60000;
    return 1000;
  });

  const result = await service.mask(history, mockConfig);

  // Neither prunable part yields a saving: padding is unchanged in size and
  // tiny_tool would grow.
  expect(result.maskedCount).toBe(0);
});
it('should never mask exempt tools (like activate_skill) even if they are deep in history', async () => {
  // Builds a user turn holding exactly one tool response.
  const toolTurn = (name: string, output: string): Content => ({
    role: 'user',
    parts: [{ functionResponse: { name, response: { output } } }],
  });

  const history: Content[] = [
    toolTurn(ACTIVATE_SKILL_TOOL_NAME, 'High value instructions for skill'),
    toolTurn(MEMORY_TOOL_NAME, 'Important user preference'),
    toolTurn('bulky_tool', 'A'.repeat(60000)),
    // Protection buffer
    toolTurn('padding', 'B'.repeat(60000)),
    { role: 'user', parts: [{ text: 'latest' }] },
  ];

  const tokensByTool = new Map<string, number>([
    [ACTIVATE_SKILL_TOOL_NAME, 1000],
    [MEMORY_TOOL_NAME, 500],
    ['bulky_tool', 60000],
    ['padding', 60000],
  ]);
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const fnResponse = parts[0].functionResponse;
    const payload = fnResponse?.response as Record<string, unknown>;
    const text = (payload?.['output'] as string) ?? JSON.stringify(payload);
    // Masked content is cheap no matter which tool produced it.
    if (text.includes(`<${MASKING_INDICATOR_TAG}`)) {
      return 100;
    }
    const toolName = fnResponse?.name;
    return toolName === undefined ? 10 : (tokensByTool.get(toolName) ?? 10);
  });

  const result = await service.mask(history, mockConfig);

  // Accessors for the first part of the turn at a given history index.
  const toolNameAt = (index: number) =>
    result.newHistory[index].parts?.[0].functionResponse?.name;
  const outputAt = (index: number) =>
    (
      result.newHistory[index].parts?.[0].functionResponse?.response as Record<
        string,
        unknown
      >
    )['output'];

  // Both 'bulky_tool' and 'padding' should be masked.
  // 'padding' (index 3) crosses the 50k protection boundary immediately.
  // ACTIVATE_SKILL and MEMORY are exempt.
  expect(result.maskedCount).toBe(2);

  // Exempt tools keep their name and their original output.
  expect(toolNameAt(0)).toBe(ACTIVATE_SKILL_TOOL_NAME);
  expect(outputAt(0)).toBe('High value instructions for skill');
  expect(toolNameAt(1)).toBe(MEMORY_TOOL_NAME);
  expect(outputAt(1)).toBe('Important user preference');

  // The bulky tool keeps its name, but its output is replaced.
  expect(toolNameAt(2)).toBe('bulky_tool');
  expect(outputAt(2)).toContain(MASKING_INDICATOR_TAG);
});
});