From 6589cdf11b3c06d0764590564e5d5a8c590b34d0 Mon Sep 17 00:00:00 2001 From: kaluchi Date: Tue, 19 May 2026 22:41:04 +0300 Subject: [PATCH] Proposal: deterministic encoding for child-process I/O (#27247) --- package-lock.json | 7 - packages/core/package.json | 1 - packages/core/src/index.ts | 1 - .../services/shellExecutionService.test.ts | 8 +- .../src/services/shellExecutionService.ts | 63 ++- .../core/src/utils/systemEncoding.test.ts | 497 ------------------ packages/core/src/utils/systemEncoding.ts | 167 ------ 7 files changed, 49 insertions(+), 695 deletions(-) delete mode 100644 packages/core/src/utils/systemEncoding.test.ts delete mode 100644 packages/core/src/utils/systemEncoding.ts diff --git a/package-lock.json b/package-lock.json index 5895a8ac51..5334ca723a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6078,12 +6078,6 @@ "node": ">=8" } }, - "node_modules/chardet": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/chardet/-/chardet-2.1.0.tgz", - "integrity": "sha512-bNFETTG/pM5ryzQ9Ad0lJOTa6HWD/YsScAR3EnCPZRPlQh77JocYktSHOUHelyhm8IARL+o4c4F1bP5KVOjiRA==", - "license": "MIT" - }, "node_modules/check-error": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.1.tgz", @@ -18434,7 +18428,6 @@ "@xterm/headless": "5.5.0", "ajv": "^8.17.1", "ajv-formats": "^3.0.0", - "chardet": "^2.1.0", "chokidar": "^5.0.0", "command-exists": "^1.2.9", "diff": "^8.0.3", diff --git a/packages/core/package.json b/packages/core/package.json index 598aceae3c..14d0525d8a 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -54,7 +54,6 @@ "@xterm/headless": "5.5.0", "ajv": "^8.17.1", "ajv-formats": "^3.0.0", - "chardet": "^2.1.0", "chokidar": "^5.0.0", "command-exists": "^1.2.9", "diff": "^8.0.3", diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 091e59d619..092249c027 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -103,7 +103,6 @@ export { export * from './utils/tool-utils.js'; export * from './utils/tool-visibility.js'; export * from './utils/terminalSerializer.js'; -export * from './utils/systemEncoding.js'; export * from './utils/textUtils.js'; export * from './utils/formatters.js'; export * from './utils/generateContentResponseUtilities.js'; diff --git a/packages/core/src/services/shellExecutionService.test.ts b/packages/core/src/services/shellExecutionService.test.ts index aba20dff0e..bc448e57a7 100644 --- a/packages/core/src/services/shellExecutionService.test.ts +++ b/packages/core/src/services/shellExecutionService.test.ts @@ -120,10 +120,6 @@ vi.mock('../utils/terminalSerializer.js', () => ({ convertColorToHex: () => '#000000', ColorMode: { DEFAULT: 0, PALETTE: 1, RGB: 2 }, })); -vi.mock('../utils/systemEncoding.js', () => ({ - getCachedEncodingForBuffer: vi.fn().mockReturnValue('utf-8'), -})); - const mockProcessKill = vi .spyOn(process, 'kill') .mockImplementation(() => true); @@ -1030,7 +1026,7 @@ describe('ShellExecutionService', () => { }); describe('Platform-Specific Behavior', () => { - it('should use powershell.exe on Windows', async () => { + it('should use powershell.exe on Windows and prefix the command with chcp 65001 for the PTY session', async () => { mockPlatform.mockReturnValue('win32'); await simulateExecution('dir "foo bar"', (pty) => pty.onExit.mock.calls[0][0]({ exitCode: 0, signal: null }), @@ -1038,7 +1034,7 @@ describe('ShellExecutionService', () => { expect(mockPtySpawn).toHaveBeenCalledWith( 'powershell.exe', - ['-NoProfile', '-Command', 'dir "foo bar"'], + ['-NoProfile', '-Command', 'chcp 65001 >$null;dir "foo bar"'], expect.any(Object), ); }); diff --git a/packages/core/src/services/shellExecutionService.ts b/packages/core/src/services/shellExecutionService.ts index 5979add77f..26d2538f8d 100644 --- a/packages/core/src/services/shellExecutionService.ts +++ b/packages/core/src/services/shellExecutionService.ts @@ -13,7 +13,6 @@ import os from 'node:os'; import fs, { mkdirSync } from 'node:fs'; import path from 'node:path'; import type { IPty } from '@lydell/node-pty'; -import { getCachedEncodingForBuffer } from '../utils/systemEncoding.js'; import { getShellConfiguration, resolveExecutable, @@ -81,6 +80,40 @@ function ensurePromptvarsDisabled(command: string, shell: ShellType): string { return `${BASH_SHOPT_GUARD} ${command}`; } +// On Windows, a new ConPTY session inherits its codepage from the system +// OEMCP (microsoft/terminal `src/host/settings.cpp:41` defaults +// `_uCodePage` to `Globals.uiOEMCP`, set from `GetOEMCP()` in +// `srvinit.cpp:44`). On locales without "Beta: Use Unicode UTF-8 for +// worldwide language support" the OEMCP is a legacy codepage (e.g. 850, +// 866, 936, 932), and conhost converts every byte from the child via +// `MultiByteToWideChar(gci.OutputCP, ...)` in `_stream.cpp:341-343`, +// turning UTF-8 output from child processes (perl, python, node, ...) +// into mojibake. +// +// `CreatePseudoConsole` does not accept a codepage argument +// (microsoft/terminal#9174 — open as a feature request). The only way +// to set the ConPTY codepage is from inside the new session via +// `SetConsoleOutputCP` (intercepted by conhost in `getset.cpp:1144`). +// Prefix the command with `chcp 65001` so the first thing the new +// session does is switch its codepage to UTF-8. +function injectUtf8CodepageForPty( + command: string, + shell: ShellType, + isWindows: boolean, + usingPty: boolean, +): string { + if (!isWindows || !usingPty) { + return command; + } + if (shell === 'powershell') { + return `chcp 65001 >$null;${command}`; + } + if (shell === 'cmd') { + return `chcp 65001>nul&${command}`; + } + return command; +} + /** A structured result from a shell command execution. */ export type ShellExecutionResult = ExecutionResult; @@ -389,6 +422,7 @@ export class ShellExecutionService { cwd: string, shellExecutionConfig: ShellExecutionConfig, isInteractive: boolean, + usingPty: boolean, ): Promise<{ program: string; args: string[]; @@ -417,7 +451,13 @@ export class ShellExecutionService { const resolvedExecutable = resolveExecutable(executable) ?? executable; const guardedCommand = ensurePromptvarsDisabled(commandToExecute, shell); - const spawnArgs = [...argsPrefix, guardedCommand]; + const finalCommand = injectUtf8CodepageForPty( + guardedCommand, + shell, + isWindows, + usingPty, + ); + const spawnArgs = [...argsPrefix, finalCommand]; // 2. Prepare Environment const gitConfigKeys: string[] = []; @@ -520,6 +560,7 @@ export class ShellExecutionService { cwd, shellExecutionConfig, isInteractive, + false, ); cmdCleanup = prepared.cleanup; @@ -620,14 +661,8 @@ export class ShellExecutionService { const handleOutput = (data: Buffer, stream: 'stdout' | 'stderr') => { if (!stdoutDecoder || !stderrDecoder) { - const encoding = getCachedEncodingForBuffer(data); - try { - stdoutDecoder = new TextDecoder(encoding); - stderrDecoder = new TextDecoder(encoding); - } catch { - stdoutDecoder = new TextDecoder('utf-8'); - stderrDecoder = new TextDecoder('utf-8'); - } + stdoutDecoder = new TextDecoder('utf-8'); + stderrDecoder = new TextDecoder('utf-8'); } if (isStreamingRawContent && sniffedBytes < MAX_SNIFF_SIZE) { @@ -900,6 +935,7 @@ export class ShellExecutionService { cwd, shellExecutionConfig, true, + true, ); cmdCleanup = prepared.cleanup; @@ -1115,12 +1151,7 @@ export class ShellExecutionService { () => new Promise((resolveChunk) => { if (!decoder) { - const encoding = getCachedEncodingForBuffer(data); - try { - decoder = new TextDecoder(encoding); - } catch { - decoder = new TextDecoder('utf-8'); - } + decoder = new TextDecoder('utf-8'); } if (isStreamingRawContent && sniffedBytes < MAX_SNIFF_SIZE) { diff --git a/packages/core/src/utils/systemEncoding.test.ts b/packages/core/src/utils/systemEncoding.test.ts deleted file mode 100644 index d84e5914ea..0000000000 --- a/packages/core/src/utils/systemEncoding.test.ts +++ /dev/null @@ -1,497 +0,0 @@ -/** - * @license - * Copyright 2025 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { vi, describe, it, expect, beforeEach, afterEach } from 'vitest'; -import { execSync } from 'node:child_process'; -import * as os from 'node:os'; -import { detect as chardetDetect } from 'chardet'; -import { debugLogger } from './debugLogger.js'; - -// Mock dependencies -vi.mock('child_process'); -vi.mock('os'); -vi.mock('chardet'); - -// Import the functions we want to test after refactoring -import { - getCachedEncodingForBuffer, - getSystemEncoding, - windowsCodePageToEncoding, - detectEncodingFromBuffer, - resetEncodingCache, -} from './systemEncoding.js'; - -describe('Shell Command Processor - Encoding Functions', () => { - let consoleWarnSpy: ReturnType; - let mockedExecSync: ReturnType>; - let mockedOsPlatform: ReturnType string>>; - let mockedChardetDetect: ReturnType>; - - beforeEach(() => { - consoleWarnSpy = vi.spyOn(debugLogger, 'warn').mockImplementation(() => {}); - mockedExecSync = vi.mocked(execSync); - mockedOsPlatform = vi.mocked(os.platform); - mockedChardetDetect = vi.mocked(chardetDetect); - - // Reset the encoding cache before each test - resetEncodingCache(); - - // Clear environment variables that might affect tests - delete process.env['LC_ALL']; - delete process.env['LC_CTYPE']; - delete process.env['LANG']; - }); - - afterEach(() => { - vi.restoreAllMocks(); - resetEncodingCache(); - }); - - describe('windowsCodePageToEncoding', () => { - it('should map common Windows code pages correctly', () => { - expect(windowsCodePageToEncoding(437)).toBe('cp437'); - expect(windowsCodePageToEncoding(850)).toBe('cp850'); - expect(windowsCodePageToEncoding(65001)).toBe('utf-8'); - expect(windowsCodePageToEncoding(1252)).toBe('windows-1252'); - expect(windowsCodePageToEncoding(932)).toBe('shift_jis'); - expect(windowsCodePageToEncoding(936)).toBe('gb2312'); - expect(windowsCodePageToEncoding(949)).toBe('euc-kr'); - expect(windowsCodePageToEncoding(950)).toBe('big5'); - expect(windowsCodePageToEncoding(1200)).toBe('utf-16le'); - expect(windowsCodePageToEncoding(1201)).toBe('utf-16be'); - }); - - it('should return null for unmapped code pages and warn', () => { - expect(windowsCodePageToEncoding(99999)).toBe(null); - expect(consoleWarnSpy).toHaveBeenCalledWith( - 'Unable to determine encoding for windows code page 99999.', - ); - }); - - it('should handle all Windows-specific code pages', () => { - expect(windowsCodePageToEncoding(874)).toBe('windows-874'); - expect(windowsCodePageToEncoding(1250)).toBe('windows-1250'); - expect(windowsCodePageToEncoding(1251)).toBe('windows-1251'); - expect(windowsCodePageToEncoding(1253)).toBe('windows-1253'); - expect(windowsCodePageToEncoding(1254)).toBe('windows-1254'); - expect(windowsCodePageToEncoding(1255)).toBe('windows-1255'); - expect(windowsCodePageToEncoding(1256)).toBe('windows-1256'); - expect(windowsCodePageToEncoding(1257)).toBe('windows-1257'); - expect(windowsCodePageToEncoding(1258)).toBe('windows-1258'); - }); - }); - - describe('detectEncodingFromBuffer', () => { - it('should detect encoding using chardet successfully', () => { - const buffer = Buffer.from('test content', 'utf8'); - mockedChardetDetect.mockReturnValue('UTF-8'); - - const result = detectEncodingFromBuffer(buffer); - expect(result).toBe('utf-8'); - expect(mockedChardetDetect).toHaveBeenCalledWith(buffer); - }); - - it('should handle chardet returning mixed case encoding', () => { - const buffer = Buffer.from('test content', 'utf8'); - mockedChardetDetect.mockReturnValue('ISO-8859-1'); - - const result = detectEncodingFromBuffer(buffer); - expect(result).toBe('iso-8859-1'); - }); - - it('should return null when chardet fails', () => { - const buffer = Buffer.from('test content', 'utf8'); - mockedChardetDetect.mockImplementation(() => { - throw new Error('Detection failed'); - }); - - const result = detectEncodingFromBuffer(buffer); - expect(result).toBe(null); - expect(consoleWarnSpy).toHaveBeenCalledWith( - 'Failed to detect encoding with chardet:', - expect.any(Error), - ); - }); - - it('should return null when chardet returns null', () => { - const buffer = Buffer.from('test content', 'utf8'); - mockedChardetDetect.mockReturnValue(null); - - const result = detectEncodingFromBuffer(buffer); - expect(result).toBe(null); - }); - - it('should return null when chardet returns non-string', () => { - const buffer = Buffer.from('test content', 'utf8'); - mockedChardetDetect.mockReturnValue([ - 'utf-8', - 'iso-8859-1', - ] as unknown as string); - - const result = detectEncodingFromBuffer(buffer); - expect(result).toBe(null); - }); - }); - - describe('getSystemEncoding - Windows', () => { - beforeEach(() => { - mockedOsPlatform.mockReturnValue('win32'); - }); - - it('should parse Windows chcp output correctly', () => { - mockedExecSync.mockReturnValue('Active code page: 65001'); - - const result = getSystemEncoding(); - expect(result).toBe('utf-8'); - expect(mockedExecSync).toHaveBeenCalledWith('chcp', { encoding: 'utf8' }); - }); - - it('should handle different chcp output formats', () => { - mockedExecSync.mockReturnValue('Current code page: 1252'); - - const result = getSystemEncoding(); - expect(result).toBe('windows-1252'); - }); - - it('should handle chcp output with extra whitespace', () => { - mockedExecSync.mockReturnValue('Active code page: 437 '); - - const result = getSystemEncoding(); - expect(result).toBe('cp437'); - }); - - it('should return null when chcp command fails', () => { - mockedExecSync.mockImplementation(() => { - throw new Error('Command failed'); - }); - - const result = getSystemEncoding(); - expect(result).toBe(null); - expect(consoleWarnSpy).toHaveBeenCalledWith( - expect.stringContaining( - "Failed to get Windows code page using 'chcp' command", - ), - ); - }); - - it('should return null when chcp output cannot be parsed', () => { - mockedExecSync.mockReturnValue('Unexpected output format'); - - const result = getSystemEncoding(); - expect(result).toBe(null); - expect(consoleWarnSpy).toHaveBeenCalledWith( - expect.stringContaining( - "Failed to get Windows code page using 'chcp' command", - ), - ); - }); - - it('should return null when code page is not a number', () => { - mockedExecSync.mockReturnValue('Active code page: abc'); - - const result = getSystemEncoding(); - expect(result).toBe(null); - expect(consoleWarnSpy).toHaveBeenCalledWith( - expect.stringContaining( - "Failed to get Windows code page using 'chcp' command", - ), - ); - }); - - it('should return null when code page maps to null', () => { - mockedExecSync.mockReturnValue('Active code page: 99999'); - - const result = getSystemEncoding(); - expect(result).toBe(null); - // Should warn about unknown code page from windowsCodePageToEncoding - expect(consoleWarnSpy).toHaveBeenCalledWith( - 'Unable to determine encoding for windows code page 99999.', - ); - }); - }); - - describe('getSystemEncoding - Unix-like', () => { - beforeEach(() => { - mockedOsPlatform.mockReturnValue('linux'); - }); - - it('should parse locale from LC_ALL environment variable', () => { - process.env['LC_ALL'] = 'en_US.UTF-8'; - - const result = getSystemEncoding(); - expect(result).toBe('utf-8'); - }); - - it('should parse locale from LC_CTYPE when LC_ALL is not set', () => { - process.env['LC_CTYPE'] = 'fr_FR.ISO-8859-1'; - - const result = getSystemEncoding(); - expect(result).toBe('iso-8859-1'); - }); - - it('should parse locale from LANG when LC_ALL and LC_CTYPE are not set', () => { - process.env['LANG'] = 'de_DE.UTF-8'; - - const result = getSystemEncoding(); - expect(result).toBe('utf-8'); - }); - - it('should handle locale charmap command when environment variables are empty', () => { - mockedExecSync.mockReturnValue('UTF-8\n'); - - const result = getSystemEncoding(); - expect(result).toBe('utf-8'); - expect(mockedExecSync).toHaveBeenCalledWith('locale charmap', { - encoding: 'utf8', - }); - }); - - it('should handle locale charmap with mixed case', () => { - mockedExecSync.mockReturnValue('ISO-8859-1\n'); - - const result = getSystemEncoding(); - expect(result).toBe('iso-8859-1'); - }); - - it('should return null when locale charmap fails', () => { - mockedExecSync.mockImplementation(() => { - throw new Error('Command failed'); - }); - - const result = getSystemEncoding(); - expect(result).toBe(null); - expect(consoleWarnSpy).toHaveBeenCalledWith( - 'Failed to get locale charmap.', - ); - }); - - it('should handle locale without encoding (no dot)', () => { - process.env['LANG'] = 'C'; - - const result = getSystemEncoding(); - expect(result).toBe('c'); - }); - - it('should handle empty locale environment variables', () => { - process.env['LC_ALL'] = ''; - process.env['LC_CTYPE'] = ''; - process.env['LANG'] = ''; - mockedExecSync.mockReturnValue('UTF-8'); - - const result = getSystemEncoding(); - expect(result).toBe('utf-8'); - }); - - it('should return locale as-is when locale format has no dot', () => { - process.env['LANG'] = 'invalid_format'; - - const result = getSystemEncoding(); - expect(result).toBe('invalid_format'); - }); - - it('should prioritize LC_ALL over other environment variables', () => { - process.env['LC_ALL'] = 'en_US.UTF-8'; - process.env['LC_CTYPE'] = 'fr_FR.ISO-8859-1'; - process.env['LANG'] = 'de_DE.CP1252'; - - const result = getSystemEncoding(); - expect(result).toBe('utf-8'); - }); - - it('should prioritize LC_CTYPE over LANG', () => { - process.env['LC_CTYPE'] = 'fr_FR.ISO-8859-1'; - process.env['LANG'] = 'de_DE.CP1252'; - - const result = getSystemEncoding(); - expect(result).toBe('iso-8859-1'); - }); - }); - - describe('getEncodingForBuffer', () => { - beforeEach(() => { - mockedOsPlatform.mockReturnValue('linux'); - }); - - it('should use cached system encoding on subsequent calls', () => { - process.env['LANG'] = 'en_US.UTF-8'; - const buffer = Buffer.from('test'); - - // First call - const result1 = getCachedEncodingForBuffer(buffer); - expect(result1).toBe('utf-8'); - - // Change environment (should not affect cached result) - process.env['LANG'] = 'fr_FR.ISO-8859-1'; - - // Second call should use cached value - const result2 = getCachedEncodingForBuffer(buffer); - expect(result2).toBe('utf-8'); - }); - - it('should fall back to buffer detection when system encoding fails', () => { - // No environment variables set - mockedExecSync.mockImplementation(() => { - throw new Error('locale command failed'); - }); - - const buffer = Buffer.from('test'); - mockedChardetDetect.mockReturnValue('ISO-8859-1'); - - const result = getCachedEncodingForBuffer(buffer); - expect(result).toBe('iso-8859-1'); - expect(mockedChardetDetect).toHaveBeenCalledWith(buffer); - }); - - it('should fall back to utf-8 when both system and buffer detection fail', () => { - // System encoding fails - mockedExecSync.mockImplementation(() => { - throw new Error('locale command failed'); - }); - - // Buffer detection fails - mockedChardetDetect.mockImplementation(() => { - throw new Error('chardet failed'); - }); - - const buffer = Buffer.from('test'); - const result = getCachedEncodingForBuffer(buffer); - expect(result).toBe('utf-8'); - }); - - it('should not cache buffer detection results', () => { - // System encoding fails initially - mockedExecSync.mockImplementation(() => { - throw new Error('locale command failed'); - }); - - const buffer1 = Buffer.from('test1'); - const buffer2 = Buffer.from('test2'); - - mockedChardetDetect - .mockReturnValueOnce('ISO-8859-1') - .mockReturnValueOnce('UTF-16'); - - const result1 = getCachedEncodingForBuffer(buffer1); - const result2 = getCachedEncodingForBuffer(buffer2); - - expect(result1).toBe('iso-8859-1'); - expect(result2).toBe('utf-16'); - expect(mockedChardetDetect).toHaveBeenCalledTimes(2); - }); - - it('should handle Windows system encoding', () => { - mockedOsPlatform.mockReturnValue('win32'); - mockedExecSync.mockReturnValue('Active code page: 1252'); - - const buffer = Buffer.from('test'); - const result = getCachedEncodingForBuffer(buffer); - - expect(result).toBe('windows-1252'); - }); - - it('should cache null system encoding result', () => { - // Reset the cache specifically for this test - resetEncodingCache(); - - // Ensure we're on Unix-like for this test - mockedOsPlatform.mockReturnValue('linux'); - - // System encoding detection returns null - mockedExecSync.mockImplementation(() => { - throw new Error('locale command failed'); - }); - - const buffer1 = Buffer.from('test1'); - const buffer2 = Buffer.from('test2'); - - mockedChardetDetect - .mockReturnValueOnce('ISO-8859-1') - .mockReturnValueOnce('UTF-16'); - - // Clear any previous calls from beforeEach setup or previous tests - mockedExecSync.mockClear(); - - const result1 = getCachedEncodingForBuffer(buffer1); - const result2 = getCachedEncodingForBuffer(buffer2); - - // Should call execSync only once due to caching (null result is cached) - expect(mockedExecSync).toHaveBeenCalledTimes(1); - expect(result1).toBe('iso-8859-1'); - expect(result2).toBe('utf-16'); - - // Call a third time to verify cache is still used - const buffer3 = Buffer.from('test3'); - mockedChardetDetect.mockReturnValueOnce('UTF-32'); - const result3 = getCachedEncodingForBuffer(buffer3); - - // Still should be only one call to execSync - expect(mockedExecSync).toHaveBeenCalledTimes(1); - expect(result3).toBe('utf-32'); - }); - }); - - describe('Cross-platform behavior', () => { - it('should work correctly on macOS', () => { - mockedOsPlatform.mockReturnValue('darwin'); - process.env['LANG'] = 'en_US.UTF-8'; - - const result = getSystemEncoding(); - expect(result).toBe('utf-8'); - }); - - it('should work correctly on other Unix-like systems', () => { - mockedOsPlatform.mockReturnValue('freebsd'); - process.env['LANG'] = 'en_US.UTF-8'; - - const result = getSystemEncoding(); - expect(result).toBe('utf-8'); - }); - - it('should handle unknown platforms as Unix-like', () => { - mockedOsPlatform.mockReturnValue('unknown' as NodeJS.Platform); - process.env['LANG'] = 'en_US.UTF-8'; - - const result = getSystemEncoding(); - expect(result).toBe('utf-8'); - }); - }); - - describe('Edge cases and error handling', () => { - it('should handle empty buffer gracefully', () => { - mockedOsPlatform.mockReturnValue('linux'); - process.env['LANG'] = 'en_US.UTF-8'; - - const buffer = Buffer.alloc(0); - const result = getCachedEncodingForBuffer(buffer); - expect(result).toBe('utf-8'); - }); - - it('should handle very large buffers', () => { - mockedOsPlatform.mockReturnValue('linux'); - process.env['LANG'] = 'en_US.UTF-8'; - - const buffer = Buffer.alloc(1024 * 1024, 'a'); - const result = getCachedEncodingForBuffer(buffer); - expect(result).toBe('utf-8'); - }); - - it('should handle Unicode content', () => { - mockedOsPlatform.mockReturnValue('linux'); - const unicodeText = '你好世界 🌍 ñoño'; - - // System encoding fails - mockedExecSync.mockImplementation(() => { - throw new Error('locale command failed'); - }); - - mockedChardetDetect.mockReturnValue('UTF-8'); - - const buffer = Buffer.from(unicodeText, 'utf8'); - const result = getCachedEncodingForBuffer(buffer); - expect(result).toBe('utf-8'); - }); - }); -}); diff --git a/packages/core/src/utils/systemEncoding.ts b/packages/core/src/utils/systemEncoding.ts deleted file mode 100644 index 2ceed59287..0000000000 --- a/packages/core/src/utils/systemEncoding.ts +++ /dev/null @@ -1,167 +0,0 @@ -/** - * @license - * Copyright 2025 Google LLC - * SPDX-License-Identifier: Apache-2.0 - */ - -import { execSync } from 'node:child_process'; -import os from 'node:os'; -import { detect as chardetDetect } from 'chardet'; -import { debugLogger } from './debugLogger.js'; - -// Cache for system encoding to avoid repeated detection -// Use undefined to indicate "not yet checked" vs null meaning "checked but failed" -let cachedSystemEncoding: string | null | undefined = undefined; - -/** - * Reset the encoding cache - useful for testing - */ -export function resetEncodingCache(): void { - cachedSystemEncoding = undefined; -} - -/** - * Returns the system encoding, caching the result to avoid repeated system calls. - * If system encoding detection fails, falls back to detecting from the provided buffer. - * Note: Only the system encoding is cached - buffer-based detection runs for each buffer - * since different buffers may have different encodings. - * @param buffer A buffer to use for detecting encoding if system detection fails. - */ -export function getCachedEncodingForBuffer(buffer: Buffer): string { - // Cache system encoding detection since it's system-wide - if (cachedSystemEncoding === undefined) { - cachedSystemEncoding = getSystemEncoding(); - } - - // If we have a cached system encoding, use it - if (cachedSystemEncoding) { - return cachedSystemEncoding; - } - - // Otherwise, detect from this specific buffer (don't cache this result) - return detectEncodingFromBuffer(buffer) || 'utf-8'; -} - -/** - * Detects the system encoding based on the platform. - * For Windows, it uses the 'chcp' command to get the current code page. - * For Unix-like systems, it checks environment variables like LC_ALL, LC_CTYPE, and LANG. - * If those are not set, it tries to run 'locale charmap' to get the encoding. - * If detection fails, it returns null. - * @returns The system encoding as a string, or null if detection fails. - */ -export function getSystemEncoding(): string | null { - // Windows - if (os.platform() === 'win32') { - try { - const output = execSync('chcp', { encoding: 'utf8' }); - const match = output.match(/:\s*(\d+)/); - if (match) { - const codePage = parseInt(match[1], 10); - if (!isNaN(codePage)) { - return windowsCodePageToEncoding(codePage); - } - } - // Only warn if we can't parse the output format, not if windowsCodePageToEncoding fails - throw new Error( - `Unable to parse Windows code page from 'chcp' output "${output.trim()}". `, - ); - } catch (error) { - debugLogger.warn( - `Failed to get Windows code page using 'chcp' command: ${error instanceof Error ? error.message : String(error)}. ` + - `Will attempt to detect encoding from command output instead.`, - ); - } - return null; - } - - // Unix-like - // Use environment variables LC_ALL, LC_CTYPE, and LANG to determine the - // system encoding. However, these environment variables might not always - // be set or accurate. Handle cases where none of these variables are set. - const env = process.env; - let locale = env['LC_ALL'] || env['LC_CTYPE'] || env['LANG'] || ''; - - // Fallback to querying the system directly when environment variables are missing - if (!locale) { - try { - locale = execSync('locale charmap', { encoding: 'utf8' }) - .toString() - .trim(); - } catch { - debugLogger.warn('Failed to get locale charmap.'); - return null; - } - } - - const match = locale.match(/\.(.+)/); // e.g., "en_US.UTF-8" - if (match && match[1]) { - return match[1].toLowerCase(); - } - - // Handle cases where locale charmap returns just the encoding name (e.g., "UTF-8") - if (locale && !locale.includes('.')) { - return locale.toLowerCase(); - } - - return null; -} - -/** - * Converts a Windows code page number to a corresponding encoding name. - * @param cp The Windows code page number (e.g., 437, 850, etc.) - * @returns The corresponding encoding name as a string, or null if no mapping exists. - */ -export function windowsCodePageToEncoding(cp: number): string | null { - // Most common mappings; extend as needed - const map: { [key: number]: string } = { - 437: 'cp437', - 850: 'cp850', - 852: 'cp852', - 866: 'cp866', - 874: 'windows-874', - 932: 'shift_jis', - 936: 'gb2312', - 949: 'euc-kr', - 950: 'big5', - 1200: 'utf-16le', - 1201: 'utf-16be', - 1250: 'windows-1250', - 1251: 'windows-1251', - 1252: 'windows-1252', - 1253: 'windows-1253', - 1254: 'windows-1254', - 1255: 'windows-1255', - 1256: 'windows-1256', - 1257: 'windows-1257', - 1258: 'windows-1258', - 65001: 'utf-8', - }; - - if (map[cp]) { - return map[cp]; - } - - debugLogger.warn(`Unable to determine encoding for windows code page ${cp}.`); - return null; // Return null if no mapping found -} - -/** - * Attempts to detect encoding from a buffer using chardet. - * This is useful when system encoding detection fails. - * Returns the detected encoding in lowercase, or null if detection fails. - * @param buffer The buffer to analyze for encoding. - * @return The detected encoding as a lowercase string, or null if detection fails. - */ -export function detectEncodingFromBuffer(buffer: Buffer): string | null { - try { - const detected = chardetDetect(buffer); - if (detected && typeof detected === 'string') { - return detected.toLowerCase(); - } - } catch (error) { - debugLogger.warn('Failed to detect encoding with chardet:', error); - } - - return null; -}