Files
gemini-cli/packages/core/src/context/toolOutputMaskingService.test.ts
T
jacob314 859c7c3a70 Checkpoint of shell optimization
fix(cli): Write shell command output to a file and limit memory buffered in UI

Fixes.

Checkpoint.

fix(core, cli): await outputStream.end() to prevent race conditions

This commit fixes a critical race condition where
was called synchronously without being awaited. This led to potential file
truncation or EBUSY errors on Windows when attempting to manipulate the file
immediately after the  call.

Additionally, this change removes fixed wait times (`setTimeout`) that
were previously used in test files as a band-aid.

fix(core): stream processed xterm output to file to remove spurious escape codes

test(core): update shell regression tests to use file_data events
2026-04-07 22:50:15 +00:00

743 lines
21 KiB
TypeScript

/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';
import {
ToolOutputMaskingService,
MASKING_INDICATOR_TAG,
} from './toolOutputMaskingService.js';
import {
SHELL_TOOL_NAME,
ACTIVATE_SKILL_TOOL_NAME,
MEMORY_TOOL_NAME,
} from '../tools/tool-names.js';
import { estimateTokenCountSync } from '../utils/tokenCalculation.js';
import type { Config } from '../config/config.js';
import type { Content, Part } from '@google/genai';
vi.mock('../utils/tokenCalculation.js', () => ({
estimateTokenCountSync: vi.fn(),
}));
describe('ToolOutputMaskingService', () => {
let service: ToolOutputMaskingService;
let mockConfig: Config;
let testTempDir: string;
const mockedEstimateTokenCountSync = vi.mocked(estimateTokenCountSync);
beforeEach(async () => {
testTempDir = await fs.promises.mkdtemp(
path.join(os.tmpdir(), 'tool-masking-test-'),
);
service = new ToolOutputMaskingService();
mockConfig = {
storage: {
getHistoryDir: () => path.join(testTempDir, 'history'),
getProjectTempDir: () => testTempDir,
},
getSessionId: () => 'mock-session',
getUsageStatisticsEnabled: () => false,
getToolOutputMaskingConfig: async () => ({
enabled: true,
protectionThresholdTokens: 50000,
minPrunableThresholdTokens: 30000,
protectLatestTurn: true,
}),
} as unknown as Config;
vi.clearAllMocks();
});
afterEach(async () => {
vi.restoreAllMocks();
if (testTempDir) {
await fs.promises.rm(testTempDir, { recursive: true, force: true });
}
});
it('should respect remote configuration overrides', async () => {
mockConfig.getToolOutputMaskingConfig = async () => ({
enabled: true,
protectionThresholdTokens: 100, // Very low threshold
minPrunableThresholdTokens: 50,
protectLatestTurn: false,
});
const history: Content[] = [
{
role: 'user',
parts: [
{
functionResponse: {
name: 'test_tool',
response: { output: 'A'.repeat(200) },
},
},
],
},
];
mockedEstimateTokenCountSync.mockImplementation((parts) => {
const resp = parts[0].functionResponse?.response as Record<
string,
unknown
>;
const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
return content.includes(MASKING_INDICATOR_TAG) ? 10 : 200;
});
const result = await service.mask(history, mockConfig);
// With low thresholds and protectLatestTurn=false, it should mask even the latest turn
expect(result.maskedCount).toBe(1);
expect(result.tokensSaved).toBeGreaterThan(0);
});
it('should not mask if total tool tokens are below protection threshold', async () => {
const history: Content[] = [
{
role: 'user',
parts: [
{
functionResponse: {
name: 'test_tool',
response: { output: 'small output' },
},
},
],
},
];
mockedEstimateTokenCountSync.mockReturnValue(100);
const result = await service.mask(history, mockConfig);
expect(result.maskedCount).toBe(0);
expect(result.newHistory).toEqual(history);
});
const getToolResponse = (part: Part | undefined): string => {
const resp = part?.functionResponse?.response as
| { output: string }
| undefined;
return resp?.output ?? (resp as unknown as string) ?? '';
};
it('should protect the latest turn and mask older outputs beyond 50k window if total > 30k', async () => {
// History:
// Turn 1: 60k (Oldest)
// Turn 2: 20k
// Turn 3: 10k (Latest) - Protected because PROTECT_LATEST_TURN is true
const history: Content[] = [
{
role: 'user',
parts: [
{
functionResponse: {
name: 't1',
response: { output: 'A'.repeat(60000) },
},
},
],
},
{
role: 'user',
parts: [
{
functionResponse: {
name: 't2',
response: { output: 'B'.repeat(20000) },
},
},
],
},
{
role: 'user',
parts: [
{
functionResponse: {
name: 't3',
response: { output: 'C'.repeat(10000) },
},
},
],
},
];
mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
const toolName = parts[0].functionResponse?.name;
const resp = parts[0].functionResponse?.response as Record<
string,
unknown
>;
const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
if (toolName === 't1') return 60000;
if (toolName === 't2') return 20000;
if (toolName === 't3') return 10000;
return 0;
});
// Scanned: Turn 2 (20k), Turn 1 (60k). Total = 80k.
// Turn 2: Cumulative = 20k. Protected (<= 50k).
// Turn 1: Cumulative = 80k. Crossed 50k boundary. Prunabled.
// Total Prunable = 60k (> 30k trigger).
const result = await service.mask(history, mockConfig);
expect(result.maskedCount).toBe(1);
expect(getToolResponse(result.newHistory[0].parts?.[0])).toContain(
`<${MASKING_INDICATOR_TAG}`,
);
expect(getToolResponse(result.newHistory[1].parts?.[0])).toEqual(
'B'.repeat(20000),
);
expect(getToolResponse(result.newHistory[2].parts?.[0])).toEqual(
'C'.repeat(10000),
);
});
it('should perform global aggregation for many small parts once boundary is hit', async () => {
// history.length = 12. Skip index 11 (latest).
// Indices 0-10: 10k each.
// Index 10: 10k (Sum 10k)
// Index 9: 10k (Sum 20k)
// Index 8: 10k (Sum 30k)
// Index 7: 10k (Sum 40k)
// Index 6: 10k (Sum 50k) - Boundary hit here?
// Actually, Boundary is 50k. So Index 6 crosses it.
// Index 6, 5, 4, 3, 2, 1, 0 are all prunable. (7 * 10k = 70k).
const history: Content[] = Array.from({ length: 12 }, (_, i) => ({
role: 'user',
parts: [
{
functionResponse: {
name: `tool${i}`,
response: { output: 'A'.repeat(10000) },
},
},
],
}));
mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
const resp = parts[0].functionResponse?.response as
| { output?: string; result?: string }
| string
| undefined;
const content =
typeof resp === 'string'
? resp
: resp?.output || resp?.result || JSON.stringify(resp);
if (content?.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
return content?.length || 0;
});
const result = await service.mask(history, mockConfig);
expect(result.maskedCount).toBe(6); // boundary at 50k protects 0-5
expect(result.tokensSaved).toBeGreaterThan(0);
});
it('should verify tool-aware previews (shell vs generic)', async () => {
const shellHistory: Content[] = [
{
role: 'user',
parts: [
{
functionResponse: {
name: SHELL_TOOL_NAME,
response: {
output:
'Output: line1\nline2\nline3\nline4\nline5\nError: failed\nExit Code: 1',
},
},
},
],
},
// Protection buffer
{
role: 'user',
parts: [
{
functionResponse: {
name: 'p',
response: { output: 'p'.repeat(60000) },
},
},
],
},
// Latest turn
{
role: 'user',
parts: [{ functionResponse: { name: 'l', response: { output: 'l' } } }],
},
];
mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
const name = parts[0].functionResponse?.name;
const resp = parts[0].functionResponse?.response as Record<
string,
unknown
>;
const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
if (name === SHELL_TOOL_NAME) return 100000;
if (name === 'p') return 60000;
return 100;
});
const result = await service.mask(shellHistory, mockConfig);
const maskedBash = getToolResponse(result.newHistory[0].parts?.[0]);
expect(maskedBash).toContain('Output: line1\nline2\nline3\nline4\nline5');
expect(maskedBash).toContain('Exit Code: 1');
expect(maskedBash).toContain('Error: failed');
});
it('should skip already masked content and not count it towards totals', async () => {
const history: Content[] = [
{
role: 'user',
parts: [
{
functionResponse: {
name: 'tool1',
response: {
output: `<${MASKING_INDICATOR_TAG}>...</${MASKING_INDICATOR_TAG}>`,
},
},
},
],
},
{
role: 'user',
parts: [
{
functionResponse: {
name: 'tool2',
response: { output: 'A'.repeat(60000) },
},
},
],
},
];
mockedEstimateTokenCountSync.mockReturnValue(60000);
const result = await service.mask(history, mockConfig);
expect(result.maskedCount).toBe(0); // tool1 skipped, tool2 is the "latest" which is protected
});
it('should handle different response keys in masked update', async () => {
const history: Content[] = [
{
role: 'model',
parts: [
{
functionResponse: {
name: 't1',
response: { result: 'A'.repeat(60000) },
},
},
],
},
{
role: 'model',
parts: [
{
functionResponse: {
name: 'p',
response: { output: 'P'.repeat(60000) },
},
},
],
},
{ role: 'user', parts: [{ text: 'latest' }] },
];
mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
const resp = parts[0].functionResponse?.response as Record<
string,
unknown
>;
const content =
(resp?.['output'] as string) ??
(resp?.['result'] as string) ??
JSON.stringify(resp);
if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
return 60000;
});
const result = await service.mask(history, mockConfig);
expect(result.maskedCount).toBe(2); // both t1 and p are prunable (cumulative 60k and 120k)
const responseObj = result.newHistory[0].parts?.[0].functionResponse
?.response as Record<string, unknown>;
expect(Object.keys(responseObj)).toEqual(['output']);
});
it('should preserve multimodal parts while masking tool responses', async () => {
const history: Content[] = [
{
role: 'user',
parts: [
{
functionResponse: {
name: 't1',
response: { output: 'A'.repeat(60000) },
},
},
{
inlineData: {
data: 'base64data',
mimeType: 'image/png',
},
},
],
},
// Protection buffer
{
role: 'user',
parts: [
{
functionResponse: {
name: 'p',
response: { output: 'p'.repeat(60000) },
},
},
],
},
// Latest turn
{ role: 'user', parts: [{ text: 'latest' }] },
];
mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
const resp = parts[0].functionResponse?.response as Record<
string,
unknown
>;
const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
if (parts[0].functionResponse?.name === 't1') return 60000;
if (parts[0].functionResponse?.name === 'p') return 60000;
return 100;
});
const result = await service.mask(history, mockConfig);
expect(result.maskedCount).toBe(2); //Both t1 and p are prunable (cumulative 60k each > 50k protection)
expect(result.newHistory[0].parts).toHaveLength(2);
expect(result.newHistory[0].parts?.[0].functionResponse).toBeDefined();
expect(
(
result.newHistory[0].parts?.[0].functionResponse?.response as Record<
string,
unknown
>
)['output'],
).toContain(`<${MASKING_INDICATOR_TAG}`);
expect(result.newHistory[0].parts?.[1].inlineData).toEqual({
data: 'base64data',
mimeType: 'image/png',
});
});
it('should match the expected snapshot for a masked tool output', async () => {
const history: Content[] = [
{
role: 'user',
parts: [
{
functionResponse: {
name: SHELL_TOOL_NAME,
response: {
output: 'Line\n'.repeat(25),
exitCode: 0,
},
},
},
],
},
// Buffer to push shell_tool into prunable territory
{
role: 'user',
parts: [
{
functionResponse: {
name: 'padding',
response: { output: 'B'.repeat(60000) },
},
},
],
},
{ role: 'user', parts: [{ text: 'latest' }] },
];
mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
const resp = parts[0].functionResponse?.response as Record<
string,
unknown
>;
const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
if (parts[0].functionResponse?.name === SHELL_TOOL_NAME) return 1000;
if (parts[0].functionResponse?.name === 'padding') return 60000;
return 10;
});
const result = await service.mask(history, mockConfig);
// Verify complete masking: only 'output' key should exist
const responseObj = result.newHistory[0].parts?.[0].functionResponse
?.response as Record<string, unknown>;
expect(Object.keys(responseObj)).toEqual(['output']);
const response = responseObj['output'] as string;
// We replace the random part of the filename for deterministic snapshots
// and normalize path separators for cross-platform compatibility
const normalizedResponse = response.replace(/\\/g, '/');
const deterministicResponse = normalizedResponse
.replace(new RegExp(testTempDir.replace(/\\/g, '/'), 'g'), '/mock/temp')
.replace(
new RegExp(`${SHELL_TOOL_NAME}_[^\\s"]+\\.txt`, 'g'),
`${SHELL_TOOL_NAME}_deterministic.txt`,
);
expect(deterministicResponse).toMatchSnapshot();
});
it('should not mask if masking increases token count (due to overhead)', async () => {
const history: Content[] = [
{
role: 'user',
parts: [
{
functionResponse: {
name: 'tiny_tool',
response: { output: 'tiny' },
},
},
],
},
// Protection buffer to push tiny_tool into prunable territory
{
role: 'user',
parts: [
{
functionResponse: {
name: 'padding',
response: { output: 'B'.repeat(60000) },
},
},
],
},
{ role: 'user', parts: [{ text: 'latest' }] },
];
mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
if (parts[0].functionResponse?.name === 'tiny_tool') return 5;
if (parts[0].functionResponse?.name === 'padding') return 60000;
return 1000; // The masked version would be huge due to boilerplate
});
const result = await service.mask(history, mockConfig);
expect(result.maskedCount).toBe(0); // padding is protected, tiny_tool would increase size
});
it('should never mask exempt tools (like activate_skill) even if they are deep in history', async () => {
const history: Content[] = [
{
role: 'user',
parts: [
{
functionResponse: {
name: ACTIVATE_SKILL_TOOL_NAME,
response: { output: 'High value instructions for skill' },
},
},
],
},
{
role: 'user',
parts: [
{
functionResponse: {
name: MEMORY_TOOL_NAME,
response: { output: 'Important user preference' },
},
},
],
},
{
role: 'user',
parts: [
{
functionResponse: {
name: 'bulky_tool',
response: { output: 'A'.repeat(60000) },
},
},
],
},
// Protection buffer
{
role: 'user',
parts: [
{
functionResponse: {
name: 'padding',
response: { output: 'B'.repeat(60000) },
},
},
],
},
{ role: 'user', parts: [{ text: 'latest' }] },
];
mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
const resp = parts[0].functionResponse?.response as Record<
string,
unknown
>;
const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
const name = parts[0].functionResponse?.name;
if (name === ACTIVATE_SKILL_TOOL_NAME) return 1000;
if (name === MEMORY_TOOL_NAME) return 500;
if (name === 'bulky_tool') return 60000;
if (name === 'padding') return 60000;
return 10;
});
const result = await service.mask(history, mockConfig);
// Both 'bulky_tool' and 'padding' should be masked.
// 'padding' (Index 3) crosses the 50k protection boundary immediately.
// ACTIVATE_SKILL and MEMORY are exempt.
expect(result.maskedCount).toBe(2);
expect(result.newHistory[0].parts?.[0].functionResponse?.name).toBe(
ACTIVATE_SKILL_TOOL_NAME,
);
expect(
(
result.newHistory[0].parts?.[0].functionResponse?.response as Record<
string,
unknown
>
)['output'],
).toBe('High value instructions for skill');
expect(result.newHistory[1].parts?.[0].functionResponse?.name).toBe(
MEMORY_TOOL_NAME,
);
expect(
(
result.newHistory[1].parts?.[0].functionResponse?.response as Record<
string,
unknown
>
)['output'],
).toBe('Important user preference');
expect(result.newHistory[2].parts?.[0].functionResponse?.name).toBe(
'bulky_tool',
);
expect(
(
result.newHistory[2].parts?.[0].functionResponse?.response as Record<
string,
unknown
>
)['output'],
).toContain(MASKING_INDICATOR_TAG);
});
it('should use existing outputFile if available in the tool response', async () => {
// Setup: Create a large history to trigger masking
const largeContent = 'a'.repeat(60000);
const existingOutputFile = path.join(testTempDir, 'truly_full_output.txt');
await fs.promises.writeFile(existingOutputFile, 'truly full content');
const history: Content[] = [
{
role: 'user',
parts: [{ text: 'Old turn' }],
},
{
role: 'model',
parts: [
{
functionResponse: {
name: 'shell',
id: 'call-1',
response: {
output: largeContent,
outputFile: existingOutputFile,
},
},
},
],
},
// Protection buffer
{
role: 'user',
parts: [
{
functionResponse: {
name: 'padding',
response: { output: 'B'.repeat(60000) },
},
},
],
},
{
role: 'user',
parts: [{ text: 'Newest turn' }],
},
];
mockedEstimateTokenCountSync.mockImplementation((parts: Part[]) => {
const resp = parts[0].functionResponse?.response as Record<
string,
unknown
>;
const content = (resp?.['output'] as string) ?? JSON.stringify(resp);
if (content.includes(`<${MASKING_INDICATOR_TAG}`)) return 100;
const name = parts[0].functionResponse?.name;
if (name === 'shell') return 60000;
if (name === 'padding') return 60000;
return 10;
});
// Trigger masking
const result = await service.mask(history, mockConfig);
expect(result.maskedCount).toBe(2);
const maskedPart = result.newHistory[1].parts![0];
const maskedResponse = maskedPart.functionResponse?.response as Record<
string,
unknown
>;
const maskedOutput = maskedResponse['output'] as string;
// Verify the masked snippet points to the existing file
expect(maskedOutput).toContain(
`Full output available at: ${existingOutputFile}`,
);
// Verify the path in maskedOutput is exactly the one we provided
expect(maskedOutput).toContain(existingOutputFile);
});
});