mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-13 23:51:16 -07:00
666 lines
19 KiB
TypeScript
666 lines
19 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2026 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
|
import fs from 'node:fs';
|
|
import path from 'node:path';
|
|
import os from 'node:os';
|
|
import {
|
|
ToolOutputMaskingService,
|
|
MASKING_INDICATOR_TAG,
|
|
} from './toolOutputMaskingService.js';
|
|
import {
|
|
SHELL_TOOL_NAME,
|
|
ACTIVATE_SKILL_TOOL_NAME,
|
|
MEMORY_TOOL_NAME,
|
|
} from '../tools/tool-names.js';
|
|
import { estimateTokenCountSync } from '../utils/tokenCalculation.js';
|
|
import type { Config } from '../config/config.js';
|
|
import type { Content, Part } from '@google/genai';
|
|
|
|
// Swap the real token estimator for a bare mock so every test can script
// the token counts it reports.
vi.mock('../utils/tokenCalculation.js', () => {
  return { estimateTokenCountSync: vi.fn() };
});
|
|
|
|
describe('ToolOutputMaskingService', () => {
|
|
// Service under test; assigned in beforeEach.
let service: ToolOutputMaskingService;

// Scratch directory created in beforeEach and removed in afterEach.
let testTempDir: string;

// Partial Config stub; individual tests may override its members.
let mockConfig: Config;

// Typed handle on the module-mocked token estimator.
const mockedEstimateTokenCountSync = vi.mocked(estimateTokenCountSync);
|
|
|
|
beforeEach(async () => {
  // Every test works inside its own scratch directory under the OS temp dir.
  const tempPrefix = path.join(os.tmpdir(), 'tool-masking-test-');
  testTempDir = await fs.promises.mkdtemp(tempPrefix);

  service = new ToolOutputMaskingService();

  // Only the Config members listed here are stubbed, hence the cast below.
  const configStub = {
    storage: {
      getHistoryDir: () => path.join(testTempDir, 'history'),
      getProjectTempDir: () => testTempDir,
    },
    getSessionId: () => 'mock-session',
    getUsageStatisticsEnabled: () => false,
    getToolOutputMaskingEnabled: () => true,
    // Default thresholds: 50k protection window, 30k minimum prunable total.
    getToolOutputMaskingConfig: async () => ({
      enabled: true,
      toolProtectionThreshold: 50000,
      minPrunableTokensThreshold: 30000,
      protectLatestTurn: true,
    }),
  };
  mockConfig = configStub as unknown as Config;

  vi.clearAllMocks();
});
|
|
|
|
afterEach(async () => {
  vi.restoreAllMocks();

  // Nothing to clean up when the scratch directory was never created.
  if (!testTempDir) {
    return;
  }
  await fs.promises.rm(testTempDir, { recursive: true, force: true });
});
|
|
|
|
it('should respect remote configuration overrides', async () => {
  // Override the default config with tiny thresholds and no latest-turn protection.
  const overrides = {
    enabled: true,
    toolProtectionThreshold: 100, // Very low threshold
    minPrunableTokensThreshold: 50,
    protectLatestTurn: false,
  };
  mockConfig.getToolOutputMaskingConfig = async () => ({ ...overrides });

  const toolPart: Part = {
    functionResponse: {
      name: 'test_tool',
      response: { output: 'A'.repeat(200) },
    },
  };
  const history: Content[] = [{ role: 'user', parts: [toolPart] }];

  // Estimator: payloads carrying the masking tag cost 10 tokens, all others 200.
  mockedEstimateTokenCountSync.mockImplementation((parts) => {
    const payload = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const text = (payload?.['output'] as string) ?? JSON.stringify(payload);
    if (text.includes(MASKING_INDICATOR_TAG)) {
      return 10;
    }
    return 200;
  });

  const outcome = await service.mask(history, mockConfig);

  // With low thresholds and protectLatestTurn=false, even the latest turn is masked.
  expect(outcome.maskedCount).toBe(1);
  expect(outcome.tokensSaved).toBeGreaterThan(0);
});
|
|
|
|
it('should not mask if total tool tokens are below protection threshold', async () => {
  const smallPart: Part = {
    functionResponse: {
      name: 'test_tool',
      response: { output: 'small output' },
    },
  };
  const history: Content[] = [{ role: 'user', parts: [smallPart] }];

  // Every part is reported as 100 tokens, far below the default thresholds.
  mockedEstimateTokenCountSync.mockReturnValue(100);

  const outcome = await service.mask(history, mockConfig);

  // Nothing is masked and the returned history equals the input.
  expect(outcome.maskedCount).toBe(0);
  expect(outcome.newHistory).toEqual(history);
});
|
|
|
|
/**
 * Extracts the textual tool output from a function-response part.
 *
 * Returns the response itself when it is a string, the `output` field when
 * the response is an object with a string `output`, and `''` otherwise
 * (missing part, missing response, or a response without a string `output`).
 * The result is always a real string, so string matchers can be applied
 * to it safely.
 */
const getToolResponse = (part: Part | undefined): string => {
  const resp: unknown = part?.functionResponse?.response;
  if (typeof resp === 'string') {
    return resp;
  }
  if (typeof resp === 'object' && resp !== null) {
    const output = (resp as { output?: unknown }).output;
    if (typeof output === 'string') {
      return output;
    }
  }
  return '';
};
|
|
|
|
it('should protect the latest turn and mask older outputs beyond 50k window if total > 30k', async () => {
  // Builds a user turn holding a single tool response.
  const toolTurn = (name: string, output: string): Content => ({
    role: 'user',
    parts: [{ functionResponse: { name, response: { output } } }],
  });

  // Oldest to newest: 60k, 20k, 10k. The newest turn is protected because
  // the default config sets protectLatestTurn to true.
  const oldestOutput = 'A'.repeat(60000);
  const middleOutput = 'B'.repeat(20000);
  const latestOutput = 'C'.repeat(10000);
  const history: Content[] = [
    toolTurn('t1', oldestOutput),
    toolTurn('t2', middleOutput),
    toolTurn('t3', latestOutput),
  ];

  // Estimator: masked payloads cost 100 tokens; otherwise size is keyed by tool name.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const payload = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const text = (payload?.['output'] as string) ?? JSON.stringify(payload);
    if (text.includes(`<${MASKING_INDICATOR_TAG}`)) {
      return 100;
    }

    switch (parts[0].functionResponse?.name) {
      case 't1':
        return 60000;
      case 't2':
        return 20000;
      case 't3':
        return 10000;
      default:
        return 0;
    }
  });

  // Scanned (newest first): Turn 2 (20k), then Turn 1 (60k). Total = 80k.
  // Turn 2: cumulative = 20k, inside the 50k window, so it stays protected.
  // Turn 1: cumulative = 80k, crosses the 50k boundary, so it is prunable.
  // Total prunable = 60k, above the 30k trigger.
  const outcome = await service.mask(history, mockConfig);
  const [first, second, third] = outcome.newHistory;

  expect(outcome.maskedCount).toBe(1);
  expect(getToolResponse(first.parts?.[0])).toContain(
    `<${MASKING_INDICATOR_TAG}`,
  );
  expect(getToolResponse(second.parts?.[0])).toEqual('B'.repeat(20000));
  expect(getToolResponse(third.parts?.[0])).toEqual('C'.repeat(10000));
});
|
|
|
|
it('should perform global aggregation for many small parts once boundary is hit', async () => {
  // history.length = 12. Index 11 (latest) is skipped because the default
  // config protects the latest turn.
  // Indices 0-10 are scanned newest-first, 10k tokens each:
  //   Index 10: sum 10k
  //   Index 9:  sum 20k
  //   Index 8:  sum 30k
  //   Index 7:  sum 40k
  //   Index 6:  sum 50k - still inside the 50k window, so protected
  //   Index 5:  sum 60k - crosses the boundary
  // Indices 5, 4, 3, 2, 1, 0 are therefore prunable (6 * 10k = 60k, above
  // the 30k trigger), which matches the asserted masked count of 6.
  const history: Content[] = Array.from({ length: 12 }, (_, i) => ({
    role: 'user',
    parts: [
      {
        functionResponse: {
          name: `tool${i}`,
          response: { output: 'A'.repeat(10000) },
        },
      },
    ],
  }));

  // Estimator: masked payloads cost 100 tokens; otherwise the token count
  // is simply the length of the textual payload.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const resp = parts[0].functionResponse?.response as
      | { output?: string; result?: string }
      | string
      | undefined;
    const content =
      typeof resp === 'string'
        ? resp
        : resp?.output || resp?.result || JSON.stringify(resp);
    if (content?.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
    return content?.length || 0;
  });

  const result = await service.mask(history, mockConfig);

  expect(result.maskedCount).toBe(6); // indices 0-5 masked; 6-10 inside the 50k window
  expect(result.tokensSaved).toBeGreaterThan(0);

  // Pin which turns were rewritten: the six oldest are masked, while the
  // five inside the protection window and the latest turn are untouched.
  const outputs = result.newHistory.map((turn) =>
    getToolResponse(turn.parts?.[0]),
  );
  expect(outputs).toHaveLength(12);
  for (const masked of outputs.slice(0, 6)) {
    expect(masked).toContain(`<${MASKING_INDICATOR_TAG}`);
  }
  for (const untouched of outputs.slice(6)) {
    expect(untouched).toEqual('A'.repeat(10000));
  }
});
|
|
|
|
it('should verify tool-aware previews (shell vs generic)', async () => {
  // Builds a user turn holding a single tool response.
  const toolTurn = (name: string, output: string): Content => ({
    role: 'user',
    parts: [{ functionResponse: { name, response: { output } } }],
  });

  const shellHistory: Content[] = [
    toolTurn(
      SHELL_TOOL_NAME,
      'Output: line1\nline2\nline3\nline4\nline5\nError: failed\nExit Code: 1',
    ),
    // Protection buffer
    toolTurn('p', 'p'.repeat(60000)),
    // Latest turn
    toolTurn('l', 'l'),
  ];

  // Estimator: masked payloads cost 100 tokens; the shell output is reported
  // as 100k, the buffer as 60k, and anything else as 100.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const payload = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const text = (payload?.['output'] as string) ?? JSON.stringify(payload);
    if (text.includes(`<${MASKING_INDICATOR_TAG}`)) {
      return 100;
    }

    const toolName = parts[0].functionResponse?.name;
    if (toolName === SHELL_TOOL_NAME) {
      return 100000;
    }
    return toolName === 'p' ? 60000 : 100;
  });

  const outcome = await service.mask(shellHistory, mockConfig);
  const maskedBash = getToolResponse(outcome.newHistory[0].parts?.[0]);

  // The masked shell output must still contain the leading lines, the exit
  // code, and the error line.
  expect(maskedBash).toContain('Output: line1\nline2\nline3\nline4\nline5');
  expect(maskedBash).toContain('Exit Code: 1');
  expect(maskedBash).toContain('Error: failed');
});
|
|
|
|
it('should skip already masked content and not count it towards totals', async () => {
  // tool1 already carries the masking tag; tool2 is a large, unmasked output.
  const alreadyMasked = `<${MASKING_INDICATOR_TAG}>...</${MASKING_INDICATOR_TAG}>`;
  const maskedTurn: Content = {
    role: 'user',
    parts: [
      { functionResponse: { name: 'tool1', response: { output: alreadyMasked } } },
    ],
  };
  const bulkyTurn: Content = {
    role: 'user',
    parts: [
      {
        functionResponse: {
          name: 'tool2',
          response: { output: 'A'.repeat(60000) },
        },
      },
    ],
  };
  const history: Content[] = [maskedTurn, bulkyTurn];

  mockedEstimateTokenCountSync.mockReturnValue(60000);

  const { maskedCount } = await service.mask(history, mockConfig);

  // tool1 is skipped; tool2 is the latest turn, which is protected.
  expect(maskedCount).toBe(0);
});
|
|
|
|
it('should handle different response keys in masked update', async () => {
  // Builds a model turn whose tool response uses an arbitrary payload shape.
  const modelTurn = (
    name: string,
    response: Record<string, unknown>,
  ): Content => ({
    role: 'model',
    parts: [{ functionResponse: { name, response } }],
  });

  const history: Content[] = [
    // Payload stored under `result` instead of `output`.
    modelTurn('t1', { result: 'A'.repeat(60000) }),
    modelTurn('p', { output: 'P'.repeat(60000) }),
    { role: 'user', parts: [{ text: 'latest' }] },
  ];

  // Estimator: reads `output`, then `result`, then the serialized payload;
  // masked payloads cost 100 tokens and everything else 60k.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const payload = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const text =
      (payload?.['output'] as string) ??
      (payload?.['result'] as string) ??
      JSON.stringify(payload);
    return text.includes(`<${MASKING_INDICATOR_TAG}`) ? 100 : 60000;
  });

  const outcome = await service.mask(history, mockConfig);

  // Both t1 and p are prunable (cumulative 60k and 120k).
  expect(outcome.maskedCount).toBe(2);

  // After masking, the first response exposes only an `output` key.
  const maskedResponse = outcome.newHistory[0].parts?.[0].functionResponse
    ?.response as Record<string, unknown>;
  expect(Object.keys(maskedResponse)).toEqual(['output']);
});
|
|
|
|
it('should preserve multimodal parts while masking tool responses', async () => {
  // The first turn mixes a tool response with an inline image part.
  const mixedTurn: Content = {
    role: 'user',
    parts: [
      {
        functionResponse: {
          name: 't1',
          response: { output: 'A'.repeat(60000) },
        },
      },
      { inlineData: { data: 'base64data', mimeType: 'image/png' } },
    ],
  };
  // Protection buffer
  const bufferTurn: Content = {
    role: 'user',
    parts: [
      {
        functionResponse: {
          name: 'p',
          response: { output: 'p'.repeat(60000) },
        },
      },
    ],
  };
  // Latest turn
  const latestTurn: Content = { role: 'user', parts: [{ text: 'latest' }] };
  const history: Content[] = [mixedTurn, bufferTurn, latestTurn];

  // Estimator: masked payloads cost 100 tokens; t1 and p are 60k each;
  // anything else is 100.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const payload = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const text = (payload?.['output'] as string) ?? JSON.stringify(payload);
    if (text.includes(`<${MASKING_INDICATOR_TAG}`)) {
      return 100;
    }

    switch (parts[0].functionResponse?.name) {
      case 't1':
      case 'p':
        return 60000;
      default:
        return 100;
    }
  });

  const outcome = await service.mask(history, mockConfig);
  const firstTurnParts = outcome.newHistory[0].parts;

  // Both t1 and p are prunable: each is 60k, above the 50k protection window.
  expect(outcome.maskedCount).toBe(2);
  expect(firstTurnParts).toHaveLength(2);
  expect(firstTurnParts?.[0].functionResponse).toBeDefined();

  const maskedResponse = firstTurnParts?.[0].functionResponse
    ?.response as Record<string, unknown>;
  expect(maskedResponse['output']).toContain(`<${MASKING_INDICATOR_TAG}`);

  // The image part must come through unchanged.
  expect(firstTurnParts?.[1].inlineData).toEqual({
    data: 'base64data',
    mimeType: 'image/png',
  });
});
|
|
|
|
it('should match the expected snapshot for a masked tool output', async () => {
  const history: Content[] = [
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: SHELL_TOOL_NAME,
            response: {
              output: 'Line\n'.repeat(25),
              exitCode: 0,
            },
          },
        },
      ],
    },
    // Buffer to push shell_tool into prunable territory
    {
      role: 'user',
      parts: [
        {
          functionResponse: {
            name: 'padding',
            response: { output: 'B'.repeat(60000) },
          },
        },
      ],
    },
    { role: 'user', parts: [{ text: 'latest' }] },
  ];

  // Estimator: masked payloads cost 100 tokens; the shell output is 1k,
  // the padding 60k, and anything else 10.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const resp = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
    if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;

    if (parts[0].functionResponse?.name === SHELL_TOOL_NAME) return 1000;
    if (parts[0].functionResponse?.name === 'padding') return 60000;
    return 10;
  });

  const result = await service.mask(history, mockConfig);

  // Verify complete masking: only 'output' key should exist
  const responseObj = result.newHistory[0].parts?.[0].functionResponse
    ?.response as Record<string, unknown>;
  expect(Object.keys(responseObj)).toEqual(['output']);

  const response = responseObj['output'] as string;

  // We replace the random part of the filename for deterministic snapshots
  // and normalize path separators for cross-platform compatibility
  const normalizedResponse = response.replace(/\\/g, '/');
  const normalizedTempDir = testTempDir.replace(/\\/g, '/');
  const deterministicResponse = normalizedResponse
    // Literal split/join instead of `new RegExp(tempDir)`: the temp path is
    // arbitrary text and may contain regex metacharacters such as ( ) + [ ],
    // which would throw or mis-match if compiled as a pattern.
    .split(normalizedTempDir)
    .join('/mock/temp')
    .replace(
      new RegExp(`${SHELL_TOOL_NAME}_[^\\s"]+\\.txt`, 'g'),
      `${SHELL_TOOL_NAME}_deterministic.txt`,
    );

  expect(deterministicResponse).toMatchSnapshot();
});
|
|
|
|
it('should not mask if masking increases token count (due to overhead)', async () => {
  // Builds a user turn holding a single tool response.
  const toolTurn = (name: string, output: string): Content => ({
    role: 'user',
    parts: [{ functionResponse: { name, response: { output } } }],
  });

  const history: Content[] = [
    toolTurn('tiny_tool', 'tiny'),
    // Protection buffer to push tiny_tool into prunable territory
    toolTurn('padding', 'B'.repeat(60000)),
    { role: 'user', parts: [{ text: 'latest' }] },
  ];

  // Estimator keyed purely on tool name, regardless of payload content.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    switch (parts[0].functionResponse?.name) {
      case 'tiny_tool':
        return 5;
      case 'padding':
        return 60000;
      default:
        return 1000; // The masked version would be huge due to boilerplate
    }
  });

  const { maskedCount } = await service.mask(history, mockConfig);

  // No part may be rewritten; presumably because masking would not reduce
  // the reported token count for either tool (verify against the service).
  expect(maskedCount).toBe(0);
});
|
|
|
|
it('should never mask exempt tools (like activate_skill) even if they are deep in history', async () => {
  // Builds a user turn holding a single tool response.
  const toolTurn = (name: string, output: string): Content => ({
    role: 'user',
    parts: [{ functionResponse: { name, response: { output } } }],
  });

  const history: Content[] = [
    toolTurn(ACTIVATE_SKILL_TOOL_NAME, 'High value instructions for skill'),
    toolTurn(MEMORY_TOOL_NAME, 'Important user preference'),
    toolTurn('bulky_tool', 'A'.repeat(60000)),
    // Protection buffer
    toolTurn('padding', 'B'.repeat(60000)),
    { role: 'user', parts: [{ text: 'latest' }] },
  ];

  // Estimator: masked payloads cost 100 tokens; otherwise size is keyed by tool name.
  mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
    const payload = parts[0].functionResponse?.response as Record<
      string,
      unknown
    >;
    const text = (payload?.['output'] as string) ?? JSON.stringify(payload);
    if (text.includes(`<${MASKING_INDICATOR_TAG}`)) {
      return 100;
    }

    switch (parts[0].functionResponse?.name) {
      case ACTIVATE_SKILL_TOOL_NAME:
        return 1000;
      case MEMORY_TOOL_NAME:
        return 500;
      case 'bulky_tool':
      case 'padding':
        return 60000;
      default:
        return 10;
    }
  });

  const outcome = await service.mask(history, mockConfig);

  // Accessors for the first part of the turn at a given index.
  const nameAt = (index: number) =>
    outcome.newHistory[index].parts?.[0].functionResponse?.name;
  const outputAt = (index: number) =>
    (
      outcome.newHistory[index].parts?.[0].functionResponse
        ?.response as Record<string, unknown>
    )['output'];

  // Both 'bulky_tool' and 'padding' should be masked.
  // 'padding' (index 3) crosses the 50k protection boundary immediately.
  // ACTIVATE_SKILL and MEMORY are exempt and must keep their original output.
  expect(outcome.maskedCount).toBe(2);

  expect(nameAt(0)).toBe(ACTIVATE_SKILL_TOOL_NAME);
  expect(outputAt(0)).toBe('High value instructions for skill');

  expect(nameAt(1)).toBe(MEMORY_TOOL_NAME);
  expect(outputAt(1)).toBe('Important user preference');

  expect(nameAt(2)).toBe('bulky_tool');
  expect(outputAt(2)).toContain(MASKING_INDICATOR_TAG);
});
|
|
});
|