mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-06-10 11:12:35 -07:00
Proposal: deterministic encoding for child-process I/O (#27247)
This commit is contained in:
Generated
-7
@@ -6078,12 +6078,6 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/chardet": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/chardet/-/chardet-2.1.0.tgz",
|
||||
"integrity": "sha512-bNFETTG/pM5ryzQ9Ad0lJOTa6HWD/YsScAR3EnCPZRPlQh77JocYktSHOUHelyhm8IARL+o4c4F1bP5KVOjiRA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/check-error": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.1.tgz",
|
||||
@@ -18434,7 +18428,6 @@
|
||||
"@xterm/headless": "5.5.0",
|
||||
"ajv": "^8.17.1",
|
||||
"ajv-formats": "^3.0.0",
|
||||
"chardet": "^2.1.0",
|
||||
"chokidar": "^5.0.0",
|
||||
"command-exists": "^1.2.9",
|
||||
"diff": "^8.0.3",
|
||||
|
||||
@@ -54,7 +54,6 @@
|
||||
"@xterm/headless": "5.5.0",
|
||||
"ajv": "^8.17.1",
|
||||
"ajv-formats": "^3.0.0",
|
||||
"chardet": "^2.1.0",
|
||||
"chokidar": "^5.0.0",
|
||||
"command-exists": "^1.2.9",
|
||||
"diff": "^8.0.3",
|
||||
|
||||
@@ -103,7 +103,6 @@ export {
|
||||
export * from './utils/tool-utils.js';
|
||||
export * from './utils/tool-visibility.js';
|
||||
export * from './utils/terminalSerializer.js';
|
||||
export * from './utils/systemEncoding.js';
|
||||
export * from './utils/textUtils.js';
|
||||
export * from './utils/formatters.js';
|
||||
export * from './utils/generateContentResponseUtilities.js';
|
||||
|
||||
@@ -120,10 +120,6 @@ vi.mock('../utils/terminalSerializer.js', () => ({
|
||||
convertColorToHex: () => '#000000',
|
||||
ColorMode: { DEFAULT: 0, PALETTE: 1, RGB: 2 },
|
||||
}));
|
||||
vi.mock('../utils/systemEncoding.js', () => ({
|
||||
getCachedEncodingForBuffer: vi.fn().mockReturnValue('utf-8'),
|
||||
}));
|
||||
|
||||
const mockProcessKill = vi
|
||||
.spyOn(process, 'kill')
|
||||
.mockImplementation(() => true);
|
||||
@@ -1030,7 +1026,7 @@ describe('ShellExecutionService', () => {
|
||||
});
|
||||
|
||||
describe('Platform-Specific Behavior', () => {
|
||||
it('should use powershell.exe on Windows', async () => {
|
||||
it('should use powershell.exe on Windows and prefix the command with chcp 65001 for the PTY session', async () => {
|
||||
mockPlatform.mockReturnValue('win32');
|
||||
await simulateExecution('dir "foo bar"', (pty) =>
|
||||
pty.onExit.mock.calls[0][0]({ exitCode: 0, signal: null }),
|
||||
@@ -1038,7 +1034,7 @@ describe('ShellExecutionService', () => {
|
||||
|
||||
expect(mockPtySpawn).toHaveBeenCalledWith(
|
||||
'powershell.exe',
|
||||
['-NoProfile', '-Command', 'dir "foo bar"'],
|
||||
['-NoProfile', '-Command', 'chcp 65001 >$null;dir "foo bar"'],
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
|
||||
@@ -13,7 +13,6 @@ import os from 'node:os';
|
||||
import fs, { mkdirSync } from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import type { IPty } from '@lydell/node-pty';
|
||||
import { getCachedEncodingForBuffer } from '../utils/systemEncoding.js';
|
||||
import {
|
||||
getShellConfiguration,
|
||||
resolveExecutable,
|
||||
@@ -81,6 +80,40 @@ function ensurePromptvarsDisabled(command: string, shell: ShellType): string {
|
||||
return `${BASH_SHOPT_GUARD} ${command}`;
|
||||
}
|
||||
|
||||
/**
 * Prefixes a Windows PTY command so the new console session switches to UTF-8.
 *
 * Background: a freshly created ConPTY session takes its codepage from the
 * system OEMCP. microsoft/terminal defaults `_uCodePage` to `Globals.uiOEMCP`
 * (`src/host/settings.cpp:41`), which is set from `GetOEMCP()`
 * (`srvinit.cpp:44`). On locales without "Beta: Use Unicode UTF-8 for
 * worldwide language support" that OEMCP is a legacy codepage (e.g. 850, 866,
 * 936, 932). conhost converts every byte from the child via
 * `MultiByteToWideChar(gci.OutputCP, ...)` (`_stream.cpp:341-343`), so UTF-8
 * output from child processes (perl, python, node, ...) becomes mojibake.
 *
 * `CreatePseudoConsole` accepts no codepage argument (microsoft/terminal#9174
 * is open as a feature request). The codepage can therefore only be set from
 * inside the new session via `SetConsoleOutputCP` (intercepted by conhost in
 * `getset.cpp:1144`). Prefixing the command with `chcp 65001` makes that the
 * first thing the session does.
 *
 * @param command The command line that will be passed to the shell.
 * @param shell The shell that will run the command.
 * @param isWindows Whether the host platform is Windows.
 * @param usingPty Whether the command is being spawned through a PTY.
 * @returns `command` prefixed with a silenced `chcp 65001` when running
 *   PowerShell or cmd in a Windows PTY; otherwise `command` unchanged.
 */
function injectUtf8CodepageForPty(
  command: string,
  shell: ShellType,
  isWindows: boolean,
  usingPty: boolean,
): string {
  const needsCodepageSwitch = isWindows && usingPty;
  if (!needsCodepageSwitch) {
    return command;
  }

  // Each shell needs its own syntax for discarding chcp's output and for
  // chaining the real command after it.
  switch (shell) {
    case 'powershell':
      return `chcp 65001 >$null;${command}`;
    case 'cmd':
      return `chcp 65001>nul&${command}`;
    default:
      return command;
  }
}
|
||||
|
||||
/** A structured result from a shell command execution. */
|
||||
export type ShellExecutionResult = ExecutionResult;
|
||||
|
||||
@@ -389,6 +422,7 @@ export class ShellExecutionService {
|
||||
cwd: string,
|
||||
shellExecutionConfig: ShellExecutionConfig,
|
||||
isInteractive: boolean,
|
||||
usingPty: boolean,
|
||||
): Promise<{
|
||||
program: string;
|
||||
args: string[];
|
||||
@@ -417,7 +451,13 @@ export class ShellExecutionService {
|
||||
const resolvedExecutable = resolveExecutable(executable) ?? executable;
|
||||
|
||||
const guardedCommand = ensurePromptvarsDisabled(commandToExecute, shell);
|
||||
const spawnArgs = [...argsPrefix, guardedCommand];
|
||||
const finalCommand = injectUtf8CodepageForPty(
|
||||
guardedCommand,
|
||||
shell,
|
||||
isWindows,
|
||||
usingPty,
|
||||
);
|
||||
const spawnArgs = [...argsPrefix, finalCommand];
|
||||
|
||||
// 2. Prepare Environment
|
||||
const gitConfigKeys: string[] = [];
|
||||
@@ -520,6 +560,7 @@ export class ShellExecutionService {
|
||||
cwd,
|
||||
shellExecutionConfig,
|
||||
isInteractive,
|
||||
false,
|
||||
);
|
||||
cmdCleanup = prepared.cleanup;
|
||||
|
||||
@@ -620,14 +661,8 @@ export class ShellExecutionService {
|
||||
|
||||
const handleOutput = (data: Buffer, stream: 'stdout' | 'stderr') => {
|
||||
if (!stdoutDecoder || !stderrDecoder) {
|
||||
const encoding = getCachedEncodingForBuffer(data);
|
||||
try {
|
||||
stdoutDecoder = new TextDecoder(encoding);
|
||||
stderrDecoder = new TextDecoder(encoding);
|
||||
} catch {
|
||||
stdoutDecoder = new TextDecoder('utf-8');
|
||||
stderrDecoder = new TextDecoder('utf-8');
|
||||
}
|
||||
stdoutDecoder = new TextDecoder('utf-8');
|
||||
stderrDecoder = new TextDecoder('utf-8');
|
||||
}
|
||||
|
||||
if (isStreamingRawContent && sniffedBytes < MAX_SNIFF_SIZE) {
|
||||
@@ -900,6 +935,7 @@ export class ShellExecutionService {
|
||||
cwd,
|
||||
shellExecutionConfig,
|
||||
true,
|
||||
true,
|
||||
);
|
||||
cmdCleanup = prepared.cleanup;
|
||||
|
||||
@@ -1115,12 +1151,7 @@ export class ShellExecutionService {
|
||||
() =>
|
||||
new Promise<void>((resolveChunk) => {
|
||||
if (!decoder) {
|
||||
const encoding = getCachedEncodingForBuffer(data);
|
||||
try {
|
||||
decoder = new TextDecoder(encoding);
|
||||
} catch {
|
||||
decoder = new TextDecoder('utf-8');
|
||||
}
|
||||
decoder = new TextDecoder('utf-8');
|
||||
}
|
||||
|
||||
if (isStreamingRawContent && sniffedBytes < MAX_SNIFF_SIZE) {
|
||||
|
||||
@@ -1,497 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { vi, describe, it, expect, beforeEach, afterEach } from 'vitest';
|
||||
import { execSync } from 'node:child_process';
|
||||
import * as os from 'node:os';
|
||||
import { detect as chardetDetect } from 'chardet';
|
||||
import { debugLogger } from './debugLogger.js';
|
||||
|
||||
// Mock dependencies
|
||||
vi.mock('child_process');
|
||||
vi.mock('os');
|
||||
vi.mock('chardet');
|
||||
|
||||
// Import the functions we want to test after refactoring
|
||||
import {
|
||||
getCachedEncodingForBuffer,
|
||||
getSystemEncoding,
|
||||
windowsCodePageToEncoding,
|
||||
detectEncodingFromBuffer,
|
||||
resetEncodingCache,
|
||||
} from './systemEncoding.js';
|
||||
|
||||
describe('Shell Command Processor - Encoding Functions', () => {
|
||||
// Locale variables that getSystemEncoding consults on non-Windows platforms.
const LOCALE_ENV_VARS = ['LC_ALL', 'LC_CTYPE', 'LANG'] as const;

let consoleWarnSpy: ReturnType<typeof vi.spyOn>;
let mockedExecSync: ReturnType<typeof vi.mocked<typeof execSync>>;
let mockedOsPlatform: ReturnType<typeof vi.mocked<() => string>>;
let mockedChardetDetect: ReturnType<typeof vi.mocked<typeof chardetDetect>>;
// Values the locale variables held before the current test started.
let savedLocaleEnv: Array<[string, string | undefined]> = [];

beforeEach(() => {
  // Silence debugLogger.warn while keeping its calls observable.
  consoleWarnSpy = vi.spyOn(debugLogger, 'warn').mockImplementation(() => {});
  mockedExecSync = vi.mocked(execSync);
  mockedOsPlatform = vi.mocked(os.platform);
  mockedChardetDetect = vi.mocked(chardetDetect);

  // Reset the encoding cache before each test
  resetEncodingCache();

  // Run every test without locale variables, but remember what they were so
  // the real process environment is not permanently altered.
  savedLocaleEnv = LOCALE_ENV_VARS.map((name): [string, string | undefined] => [
    name,
    process.env[name],
  ]);
  for (const name of LOCALE_ENV_VARS) {
    delete process.env[name];
  }
});

afterEach(() => {
  vi.restoreAllMocks();
  resetEncodingCache();

  // Put the locale variables back. A variable that was originally unset must
  // be deleted: assigning `undefined` to process.env stores the string
  // "undefined" instead of removing the key.
  for (const [name, value] of savedLocaleEnv) {
    if (value === undefined) {
      delete process.env[name];
    } else {
      process.env[name] = value;
    }
  }
  savedLocaleEnv = [];
});
|
||||
|
||||
describe('windowsCodePageToEncoding', () => {
  it('should map common Windows code pages correctly', () => {
    // [code page, expected encoding label]
    const commonMappings: Array<[number, string]> = [
      [437, 'cp437'],
      [850, 'cp850'],
      [65001, 'utf-8'],
      [1252, 'windows-1252'],
      [932, 'shift_jis'],
      [936, 'gb2312'],
      [949, 'euc-kr'],
      [950, 'big5'],
      [1200, 'utf-16le'],
      [1201, 'utf-16be'],
    ];

    for (const [codePage, encoding] of commonMappings) {
      expect(windowsCodePageToEncoding(codePage)).toBe(encoding);
    }
  });

  it('should return null for unmapped code pages and warn', () => {
    const mapped = windowsCodePageToEncoding(99999);

    expect(mapped).toBeNull();
    expect(consoleWarnSpy).toHaveBeenCalledWith(
      'Unable to determine encoding for windows code page 99999.',
    );
  });

  it('should handle all Windows-specific code pages', () => {
    // Each of these is expected to map to the label `windows-<code page>`.
    const windowsCodePages = [
      874, 1250, 1251, 1253, 1254, 1255, 1256, 1257, 1258,
    ];

    for (const codePage of windowsCodePages) {
      expect(windowsCodePageToEncoding(codePage)).toBe(`windows-${codePage}`);
    }
  });
});
|
||||
|
||||
describe('detectEncodingFromBuffer', () => {
  // Every case analyzes equivalent bytes; only chardet's mocked answer varies.
  const sample = Buffer.from('test content', 'utf8');

  it('should detect encoding using chardet successfully', () => {
    mockedChardetDetect.mockReturnValue('UTF-8');

    const detected = detectEncodingFromBuffer(sample);

    expect(detected).toBe('utf-8');
    expect(mockedChardetDetect).toHaveBeenCalledWith(sample);
  });

  it('should handle chardet returning mixed case encoding', () => {
    mockedChardetDetect.mockReturnValue('ISO-8859-1');

    expect(detectEncodingFromBuffer(sample)).toBe('iso-8859-1');
  });

  it('should return null when chardet fails', () => {
    mockedChardetDetect.mockImplementation(() => {
      throw new Error('Detection failed');
    });

    const detected = detectEncodingFromBuffer(sample);

    expect(detected).toBeNull();
    expect(consoleWarnSpy).toHaveBeenCalledWith(
      'Failed to detect encoding with chardet:',
      expect.any(Error),
    );
  });

  it('should return null when chardet returns null', () => {
    mockedChardetDetect.mockReturnValue(null);

    expect(detectEncodingFromBuffer(sample)).toBeNull();
  });

  it('should return null when chardet returns non-string', () => {
    // Deliberately violate chardet's declared return type.
    const candidates = ['utf-8', 'iso-8859-1'] as unknown as string;
    mockedChardetDetect.mockReturnValue(candidates);

    expect(detectEncodingFromBuffer(sample)).toBeNull();
  });
});
|
||||
|
||||
describe('getSystemEncoding - Windows', () => {
  // Asserts that the generic "chcp failed" warning was logged.
  const expectChcpFailureWarning = (): void => {
    expect(consoleWarnSpy).toHaveBeenCalledWith(
      expect.stringContaining(
        "Failed to get Windows code page using 'chcp' command",
      ),
    );
  };

  beforeEach(() => {
    mockedOsPlatform.mockReturnValue('win32');
  });

  it('should parse Windows chcp output correctly', () => {
    mockedExecSync.mockReturnValue('Active code page: 65001');

    const encoding = getSystemEncoding();

    expect(encoding).toBe('utf-8');
    expect(mockedExecSync).toHaveBeenCalledWith('chcp', { encoding: 'utf8' });
  });

  it('should handle different chcp output formats', () => {
    mockedExecSync.mockReturnValue('Current code page: 1252');

    expect(getSystemEncoding()).toBe('windows-1252');
  });

  it('should handle chcp output with extra whitespace', () => {
    mockedExecSync.mockReturnValue('Active code page: 437 ');

    expect(getSystemEncoding()).toBe('cp437');
  });

  it('should return null when chcp command fails', () => {
    mockedExecSync.mockImplementation(() => {
      throw new Error('Command failed');
    });

    const encoding = getSystemEncoding();

    expect(encoding).toBeNull();
    expectChcpFailureWarning();
  });

  it('should return null when chcp output cannot be parsed', () => {
    mockedExecSync.mockReturnValue('Unexpected output format');

    const encoding = getSystemEncoding();

    expect(encoding).toBeNull();
    expectChcpFailureWarning();
  });

  it('should return null when code page is not a number', () => {
    mockedExecSync.mockReturnValue('Active code page: abc');

    const encoding = getSystemEncoding();

    expect(encoding).toBeNull();
    expectChcpFailureWarning();
  });

  it('should return null when code page maps to null', () => {
    mockedExecSync.mockReturnValue('Active code page: 99999');

    const encoding = getSystemEncoding();

    expect(encoding).toBeNull();
    // The warning for an unknown code page comes from windowsCodePageToEncoding.
    expect(consoleWarnSpy).toHaveBeenCalledWith(
      'Unable to determine encoding for windows code page 99999.',
    );
  });
});
|
||||
|
||||
describe('getSystemEncoding - Unix-like', () => {
  beforeEach(() => {
    mockedOsPlatform.mockReturnValue('linux');
  });

  it('should parse locale from LC_ALL environment variable', () => {
    process.env['LC_ALL'] = 'en_US.UTF-8';

    expect(getSystemEncoding()).toBe('utf-8');
  });

  it('should parse locale from LC_CTYPE when LC_ALL is not set', () => {
    process.env['LC_CTYPE'] = 'fr_FR.ISO-8859-1';

    expect(getSystemEncoding()).toBe('iso-8859-1');
  });

  it('should parse locale from LANG when LC_ALL and LC_CTYPE are not set', () => {
    process.env['LANG'] = 'de_DE.UTF-8';

    expect(getSystemEncoding()).toBe('utf-8');
  });

  it('should handle locale charmap command when environment variables are empty', () => {
    mockedExecSync.mockReturnValue('UTF-8\n');

    const encoding = getSystemEncoding();

    expect(encoding).toBe('utf-8');
    expect(mockedExecSync).toHaveBeenCalledWith('locale charmap', {
      encoding: 'utf8',
    });
  });

  it('should handle locale charmap with mixed case', () => {
    mockedExecSync.mockReturnValue('ISO-8859-1\n');

    expect(getSystemEncoding()).toBe('iso-8859-1');
  });

  it('should return null when locale charmap fails', () => {
    mockedExecSync.mockImplementation(() => {
      throw new Error('Command failed');
    });

    const encoding = getSystemEncoding();

    expect(encoding).toBeNull();
    expect(consoleWarnSpy).toHaveBeenCalledWith(
      'Failed to get locale charmap.',
    );
  });

  it('should handle locale without encoding (no dot)', () => {
    process.env['LANG'] = 'C';

    expect(getSystemEncoding()).toBe('c');
  });

  it('should handle empty locale environment variables', () => {
    // Empty strings count as "unset", so the charmap fallback is used.
    for (const name of ['LC_ALL', 'LC_CTYPE', 'LANG']) {
      process.env[name] = '';
    }
    mockedExecSync.mockReturnValue('UTF-8');

    expect(getSystemEncoding()).toBe('utf-8');
  });

  it('should return locale as-is when locale format has no dot', () => {
    process.env['LANG'] = 'invalid_format';

    expect(getSystemEncoding()).toBe('invalid_format');
  });

  it('should prioritize LC_ALL over other environment variables', () => {
    process.env['LC_ALL'] = 'en_US.UTF-8';
    process.env['LC_CTYPE'] = 'fr_FR.ISO-8859-1';
    process.env['LANG'] = 'de_DE.CP1252';

    expect(getSystemEncoding()).toBe('utf-8');
  });

  it('should prioritize LC_CTYPE over LANG', () => {
    process.env['LC_CTYPE'] = 'fr_FR.ISO-8859-1';
    process.env['LANG'] = 'de_DE.CP1252';

    expect(getSystemEncoding()).toBe('iso-8859-1');
  });
});
|
||||
|
||||
describe('getEncodingForBuffer', () => {
  // Makes system encoding detection fail: with no locale variables set, the
  // `locale charmap` fallback throws.
  const failLocaleCommand = (): void => {
    mockedExecSync.mockImplementation(() => {
      throw new Error('locale command failed');
    });
  };

  beforeEach(() => {
    mockedOsPlatform.mockReturnValue('linux');
  });

  it('should use cached system encoding on subsequent calls', () => {
    process.env['LANG'] = 'en_US.UTF-8';
    const sample = Buffer.from('test');

    // The first call populates the cache.
    expect(getCachedEncodingForBuffer(sample)).toBe('utf-8');

    // A later environment change must not be picked up.
    process.env['LANG'] = 'fr_FR.ISO-8859-1';

    expect(getCachedEncodingForBuffer(sample)).toBe('utf-8');
  });

  it('should fall back to buffer detection when system encoding fails', () => {
    failLocaleCommand();
    const sample = Buffer.from('test');
    mockedChardetDetect.mockReturnValue('ISO-8859-1');

    const encoding = getCachedEncodingForBuffer(sample);

    expect(encoding).toBe('iso-8859-1');
    expect(mockedChardetDetect).toHaveBeenCalledWith(sample);
  });

  it('should fall back to utf-8 when both system and buffer detection fail', () => {
    failLocaleCommand();
    mockedChardetDetect.mockImplementation(() => {
      throw new Error('chardet failed');
    });

    expect(getCachedEncodingForBuffer(Buffer.from('test'))).toBe('utf-8');
  });

  it('should not cache buffer detection results', () => {
    failLocaleCommand();
    const firstSample = Buffer.from('test1');
    const secondSample = Buffer.from('test2');
    mockedChardetDetect
      .mockReturnValueOnce('ISO-8859-1')
      .mockReturnValueOnce('UTF-16');

    const firstEncoding = getCachedEncodingForBuffer(firstSample);
    const secondEncoding = getCachedEncodingForBuffer(secondSample);

    expect(firstEncoding).toBe('iso-8859-1');
    expect(secondEncoding).toBe('utf-16');
    expect(mockedChardetDetect).toHaveBeenCalledTimes(2);
  });

  it('should handle Windows system encoding', () => {
    mockedOsPlatform.mockReturnValue('win32');
    mockedExecSync.mockReturnValue('Active code page: 1252');

    expect(getCachedEncodingForBuffer(Buffer.from('test'))).toBe(
      'windows-1252',
    );
  });

  it('should cache null system encoding result', () => {
    // Start from an empty cache on a Unix-like platform.
    resetEncodingCache();
    mockedOsPlatform.mockReturnValue('linux');

    // System detection fails, i.e. resolves to null.
    failLocaleCommand();

    const firstSample = Buffer.from('test1');
    const secondSample = Buffer.from('test2');
    mockedChardetDetect
      .mockReturnValueOnce('ISO-8859-1')
      .mockReturnValueOnce('UTF-16');

    // Drop call records left over from earlier setup.
    mockedExecSync.mockClear();

    const firstEncoding = getCachedEncodingForBuffer(firstSample);
    const secondEncoding = getCachedEncodingForBuffer(secondSample);

    // The null result is cached, so execSync runs only once.
    expect(mockedExecSync).toHaveBeenCalledTimes(1);
    expect(firstEncoding).toBe('iso-8859-1');
    expect(secondEncoding).toBe('utf-16');

    // A third lookup still relies on the cached null.
    const thirdSample = Buffer.from('test3');
    mockedChardetDetect.mockReturnValueOnce('UTF-32');
    const thirdEncoding = getCachedEncodingForBuffer(thirdSample);

    expect(mockedExecSync).toHaveBeenCalledTimes(1);
    expect(thirdEncoding).toBe('utf-32');
  });
});
|
||||
|
||||
describe('Cross-platform behavior', () => {
  // Every non-Windows platform is expected to take the locale-variable path.
  it('should work correctly on macOS', () => {
    mockedOsPlatform.mockReturnValue('darwin');
    process.env['LANG'] = 'en_US.UTF-8';

    expect(getSystemEncoding()).toBe('utf-8');
  });

  it('should work correctly on other Unix-like systems', () => {
    mockedOsPlatform.mockReturnValue('freebsd');
    process.env['LANG'] = 'en_US.UTF-8';

    expect(getSystemEncoding()).toBe('utf-8');
  });

  it('should handle unknown platforms as Unix-like', () => {
    const unrecognizedPlatform = 'unknown' as NodeJS.Platform;
    mockedOsPlatform.mockReturnValue(unrecognizedPlatform);
    process.env['LANG'] = 'en_US.UTF-8';

    expect(getSystemEncoding()).toBe('utf-8');
  });
});
|
||||
|
||||
describe('Edge cases and error handling', () => {
  it('should handle empty buffer gracefully', () => {
    mockedOsPlatform.mockReturnValue('linux');
    process.env['LANG'] = 'en_US.UTF-8';

    const emptyBuffer = Buffer.alloc(0);

    expect(getCachedEncodingForBuffer(emptyBuffer)).toBe('utf-8');
  });

  it('should handle very large buffers', () => {
    mockedOsPlatform.mockReturnValue('linux');
    process.env['LANG'] = 'en_US.UTF-8';

    // 1 MiB filled with the letter "a".
    const largeBuffer = Buffer.alloc(1024 * 1024, 'a');

    expect(getCachedEncodingForBuffer(largeBuffer)).toBe('utf-8');
  });

  it('should handle Unicode content', () => {
    mockedOsPlatform.mockReturnValue('linux');
    const unicodeText = '你好世界 🌍 ñoño';

    // With system detection failing, the mocked chardet result is used.
    mockedExecSync.mockImplementation(() => {
      throw new Error('locale command failed');
    });
    mockedChardetDetect.mockReturnValue('UTF-8');

    const unicodeBuffer = Buffer.from(unicodeText, 'utf8');

    expect(getCachedEncodingForBuffer(unicodeBuffer)).toBe('utf-8');
  });
});
|
||||
});
|
||||
@@ -1,167 +0,0 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import os from 'node:os';
|
||||
import { detect as chardetDetect } from 'chardet';
|
||||
import { debugLogger } from './debugLogger.js';
|
||||
|
||||
// Memoized system encoding, so detection is not repeated on every lookup.
// Three states: `undefined` = not checked yet, `null` = checked but detection
// failed, string = the detected encoding.
let cachedSystemEncoding: string | null | undefined = undefined;

/**
 * Forgets the memoized system encoding, returning the cache to its
 * "not yet checked" state. Useful for testing.
 */
export function resetEncodingCache(): void {
  cachedSystemEncoding = undefined;
}
|
||||
|
||||
/**
 * Picks the encoding to use for a chunk of output.
 *
 * The system-wide encoding is looked up once and memoized, including a failed
 * lookup (stored as `null`). When it is known, it is returned for every call
 * and the buffer is not inspected. Only when the system lookup failed is the
 * buffer itself analyzed; that happens on every call and is never cached,
 * because different buffers may have different encodings. `'utf-8'` is the
 * last resort.
 *
 * @param buffer A buffer to analyze if system detection fails.
 * @returns The name of the encoding to use.
 */
export function getCachedEncodingForBuffer(buffer: Buffer): string {
  // `undefined` means the system encoding has never been probed.
  if (cachedSystemEncoding === undefined) {
    cachedSystemEncoding = getSystemEncoding();
  }

  // System encoding first, then per-buffer detection, then the default.
  return cachedSystemEncoding || detectEncodingFromBuffer(buffer) || 'utf-8';
}
|
||||
|
||||
/**
 * Detects the encoding the operating system is configured to use.
 *
 * - Windows: runs `chcp` and maps the reported code page through
 *   `windowsCodePageToEncoding`.
 * - All other platforms: uses the first non-empty value of `LC_ALL`,
 *   `LC_CTYPE` and `LANG`. When all three are unset or empty, falls back to
 *   running `locale charmap`.
 *
 * POSIX locale names have the shape
 * `language[_territory][.codeset][@modifier]`. The codeset is returned
 * lowercased with any `@modifier` removed, so `de_DE.ISO-8859-15@euro` yields
 * `iso-8859-15`. A value without a `.` (for example `C`, or the bare charmap
 * name printed by `locale charmap`) is returned lowercased as-is.
 *
 * Failures are logged through `debugLogger.warn` and never thrown.
 *
 * @returns The system encoding as a lowercase string, or `null` if detection fails.
 */
export function getSystemEncoding(): string | null {
  // Windows
  if (os.platform() === 'win32') {
    try {
      const output = execSync('chcp', { encoding: 'utf8' });
      // Only the digits after the colon are parsed, whatever the label says.
      const match = output.match(/:\s*(\d+)/);
      if (match) {
        // An unmapped code page yields null here. windowsCodePageToEncoding
        // logs its own warning for that case, so none is added.
        return windowsCodePageToEncoding(parseInt(match[1], 10));
      }
      throw new Error(
        `Unable to parse Windows code page from 'chcp' output "${output.trim()}"`,
      );
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      debugLogger.warn(
        `Failed to get Windows code page using 'chcp' command: ${reason}. ` +
          `Will attempt to detect encoding from command output instead.`,
      );
    }
    return null;
  }

  // Unix-like
  // `||` rather than `??` on purpose: an empty variable counts as not set.
  const env = process.env;
  let locale = env['LC_ALL'] || env['LC_CTYPE'] || env['LANG'] || '';

  // Query the system directly when the environment gives no answer.
  if (!locale) {
    try {
      locale = execSync('locale charmap', { encoding: 'utf8' }).trim();
    } catch {
      debugLogger.warn('Failed to get locale charmap.');
      return null;
    }
  }

  // Drop a trailing "@modifier" (e.g. "@euro"); it is not part of the codeset.
  const modifierStart = locale.indexOf('@');
  const baseLocale =
    modifierStart === -1 ? locale : locale.slice(0, modifierStart);

  const codesetMatch = baseLocale.match(/\.(.+)/); // e.g., "en_US.UTF-8"
  if (codesetMatch && codesetMatch[1]) {
    return codesetMatch[1].toLowerCase();
  }

  // `locale charmap` prints just the encoding name (e.g., "UTF-8").
  if (baseLocale && !baseLocale.includes('.')) {
    return baseLocale.toLowerCase();
  }

  return null;
}
|
||||
|
||||
/**
 * Translates a Windows code page number into an encoding name.
 *
 * Only the most common code pages are listed; extend the table as needed.
 * A code page without an entry is reported through `debugLogger.warn`.
 *
 * @param cp The Windows code page number (e.g., 437, 850, 65001).
 * @returns The encoding name for `cp`, or `null` if no mapping exists.
 */
export function windowsCodePageToEncoding(cp: number): string | null {
  const encodingByCodePage: { [key: number]: string } = {
    437: 'cp437',
    850: 'cp850',
    852: 'cp852',
    866: 'cp866',
    874: 'windows-874',
    932: 'shift_jis',
    936: 'gb2312',
    949: 'euc-kr',
    950: 'big5',
    1200: 'utf-16le',
    1201: 'utf-16be',
    1250: 'windows-1250',
    1251: 'windows-1251',
    1252: 'windows-1252',
    1253: 'windows-1253',
    1254: 'windows-1254',
    1255: 'windows-1255',
    1256: 'windows-1256',
    1257: 'windows-1257',
    1258: 'windows-1258',
    65001: 'utf-8',
  };

  const encoding = encodingByCodePage[cp];
  if (!encoding) {
    debugLogger.warn(
      `Unable to determine encoding for windows code page ${cp}.`,
    );
    return null;
  }

  return encoding;
}
|
||||
|
||||
/**
 * Guesses the encoding of a buffer using chardet.
 *
 * Meant as a fallback for when system encoding detection fails. An exception
 * thrown by chardet is logged through `debugLogger.warn` and reported as
 * `null` rather than rethrown.
 *
 * @param buffer The buffer to analyze for encoding.
 * @returns The detected encoding as a lowercase string, or `null` when
 *   chardet throws or returns anything other than a non-empty string.
 */
export function detectEncodingFromBuffer(buffer: Buffer): string | null {
  let guess: unknown;
  try {
    guess = chardetDetect(buffer);
  } catch (error) {
    debugLogger.warn('Failed to detect encoding with chardet:', error);
    return null;
  }

  // Reject non-string and empty results.
  if (typeof guess !== 'string' || guess === '') {
    return null;
  }

  return guess.toLowerCase();
}
|
||||
Reference in New Issue
Block a user