mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-10 14:10:37 -07:00
fix(core): treat UTF16/32 BOM files as text and decode correctly (#6081)
Co-authored-by: Gal Zahavi <38544478+galz10@users.noreply.github.com> Co-authored-by: jacob314 <jacob314@gmail.com>
This commit is contained in:
132
integration-tests/utf-bom-encoding.test.ts
Normal file
132
integration-tests/utf-bom-encoding.test.ts
Normal file
@@ -0,0 +1,132 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
||||
import { writeFileSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { TestRig } from './test-helper.js';
|
||||
|
||||
// The whole suite is skipped on Windows (original note: "Option A: avoid infra scope").
// `d` stands in for `describe` so the suite declaration below stays unconditional.
const d = process.platform !== 'win32' ? describe : describe.skip;
|
||||
|
||||
// BOM encoders
|
||||
/** Encodes `s` as UTF-8 and prefixes the UTF-8 byte order mark (EF BB BF). */
const utf8BOM = (s: string) => {
  const marker = Buffer.from([0xef, 0xbb, 0xbf]);
  const body = Buffer.from(s, 'utf8');
  return Buffer.concat([marker, body]);
};
|
||||
/** Encodes `s` as UTF-16 little-endian and prefixes the UTF-16 LE byte order mark (FF FE). */
const utf16LE = (s: string) => {
  const marker = Buffer.from([0xff, 0xfe]);
  const body = Buffer.from(s, 'utf16le');
  return Buffer.concat([marker, body]);
};
|
||||
/**
 * Encodes `s` as UTF-16 big-endian and prefixes the UTF-16 BE byte order mark (FE FF).
 * The text is encoded as UTF-16 LE first and each 16-bit unit is then byte-swapped in place.
 */
const utf16BE = (s: string) => {
  // swap16() mutates the freshly created buffer and returns that same buffer.
  const bigEndianBody = Buffer.from(s, 'utf16le').swap16();
  return Buffer.concat([Buffer.from([0xfe, 0xff]), bigEndianBody]);
};
|
||||
/**
 * Encodes `s` as UTF-32 little-endian and prefixes the UTF-32 LE byte order mark (FF FE 00 00).
 * Every code point of the string becomes one 4-byte word, least significant byte first.
 */
const utf32LE = (s: string) => {
  const BOM_BYTES = [0xff, 0xfe, 0x00, 0x00];
  const codePoints = Array.from(s, (ch) => ch.codePointAt(0)!);
  const out = Buffer.alloc(BOM_BYTES.length + codePoints.length * 4);
  out.set(BOM_BYTES, 0);
  let offset = BOM_BYTES.length;
  for (const cp of codePoints) {
    out.writeUInt32LE(cp, offset);
    offset += 4;
  }
  return out;
};
|
||||
/**
 * Encodes `s` as UTF-32 big-endian and prefixes the UTF-32 BE byte order mark (00 00 FE FF).
 * Every code point of the string becomes one 4-byte word, most significant byte first.
 */
const utf32BE = (s: string) => {
  const BOM_BYTES = [0x00, 0x00, 0xfe, 0xff];
  const codePoints = Array.from(s, (ch) => ch.codePointAt(0)!);
  const out = Buffer.alloc(BOM_BYTES.length + codePoints.length * 4);
  out.set(BOM_BYTES, 0);
  let offset = BOM_BYTES.length;
  for (const cp of codePoints) {
    out.writeUInt32BE(cp, offset);
    offset += 4;
  }
  return out;
};
|
||||
|
||||
// Binary sentinel: just the 8-byte PNG signature, with no chunks after it.
// A fresh buffer is produced on every call.
const fakePng = () => Buffer.from('89504e470d0a1a0a', 'hex');
|
||||
|
||||
// Suite-wide state, assigned in beforeAll.
let rig: TestRig;
let dir: string;

d('BOM end-to-end integration', () => {
  beforeAll(async () => {
    rig = new TestRig();
    await rig.setup('bom-integration');
    dir = rig.testDir!;
  });

  afterAll(async () => {
    await rig.cleanup();
  });

  /**
   * Writes `content` to `filename` inside the rig's test directory, prompts for the
   * file to be read back, waits for a `read_file` tool call, and checks the output.
   *
   * @param filename Name of the fixture file, relative to the test directory.
   * @param content Raw bytes to write, including any BOM.
   * @param expectedText Text that must appear in the output, or `null` when the
   *   file is expected to be reported as binary instead of being displayed.
   */
  async function runAndAssert(
    filename: string,
    content: Buffer,
    expectedText: string | null,
  ) {
    writeFileSync(join(dir, filename), content);
    const prompt = `read the file ${filename} and output its exact contents`;
    const output = await rig.run(prompt);
    // NOTE(review): the value resolved by waitForToolCall is not inspected here;
    // confirm whether it reports success and should be asserted.
    await rig.waitForToolCall('read_file');
    const lower = output.toLowerCase();

    if (expectedText === null) {
      // 'binary' also covers the longer phrase 'skipped binary file', so a separate
      // check for that phrase would never change the outcome.
      expect(lower).toMatch(/binary|cannot display/);
    } else {
      // toContain / not.toContain print the actual output when the check fails.
      expect(output).toContain(expectedText);
      expect(lower).not.toContain('skipped binary file');
    }
  }

  // One row per BOM flavour: the marker text must survive the round trip.
  const textCases: Array<{
    title: string;
    filename: string;
    encode: (s: string) => Buffer;
    text: string;
  }> = [
    { title: 'UTF-8 BOM', filename: 'utf8.txt', encode: utf8BOM, text: 'BOM_OK UTF-8' },
    { title: 'UTF-16 LE BOM', filename: 'utf16le.txt', encode: utf16LE, text: 'BOM_OK UTF-16LE' },
    { title: 'UTF-16 BE BOM', filename: 'utf16be.txt', encode: utf16BE, text: 'BOM_OK UTF-16BE' },
    { title: 'UTF-32 LE BOM', filename: 'utf32le.txt', encode: utf32LE, text: 'BOM_OK UTF-32LE' },
    { title: 'UTF-32 BE BOM', filename: 'utf32be.txt', encode: utf32BE, text: 'BOM_OK UTF-32BE' },
  ];

  for (const { title, filename, encode, text } of textCases) {
    it(title, async () => {
      await runAndAssert(filename, encode(text), text);
    });
  }

  it('Binary sentinel', async () => {
    await runAndAssert('image.png', fakePng(), null);
  });
});
|
||||
@@ -25,6 +25,8 @@ import {
|
||||
isBinaryFile,
|
||||
detectFileType,
|
||||
processSingleFileContent,
|
||||
detectBOM,
|
||||
readFileWithEncoding,
|
||||
} from './fileUtils.js';
|
||||
import { StandardFileSystemService } from '../services/fileSystemService.js';
|
||||
|
||||
@@ -181,6 +183,367 @@ describe('fileUtils', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('BOM detection and encoding', () => {
  // Scratch directory, created before and removed after every test.
  let testDir: string;

  // Mixed ASCII / CJK / astral-plane text; the emoji needs a surrogate pair in UTF-16.
  const richText = 'Hello, 世界! 🌍';

  // Byte order marks for each supported encoding.
  const BOM = {
    utf8: [0xef, 0xbb, 0xbf],
    utf16le: [0xff, 0xfe],
    utf16be: [0xfe, 0xff],
    utf32le: [0xff, 0xfe, 0x00, 0x00],
    utf32be: [0x00, 0x00, 0xfe, 0xff],
  };

  /** Prefixes `payload` with the given BOM bytes. */
  const withBOM = (bom: number[], payload: Buffer): Buffer =>
    Buffer.concat([Buffer.from(bom), payload]);

  /** Hand-rolled UTF-16 BE encoder: every UTF-16 code unit, high byte first. */
  const toUtf16BE = (text: string): Buffer => {
    const bytes: number[] = [];
    for (let i = 0; i < text.length; i++) {
      const unit = text.charCodeAt(i);
      bytes.push((unit >> 8) & 0xff, unit & 0xff);
    }
    return Buffer.from(bytes);
  };

  /** Hand-rolled UTF-32 encoder: one 4-byte word per code point in the requested byte order. */
  const toUtf32 = (text: string, littleEndian: boolean): Buffer => {
    const bytes: number[] = [];
    for (const ch of Array.from(text)) {
      const cp = ch.codePointAt(0)!;
      const bigEndianWord = [
        (cp >> 24) & 0xff,
        (cp >> 16) & 0xff,
        (cp >> 8) & 0xff,
        cp & 0xff,
      ];
      bytes.push(...(littleEndian ? bigEndianWord.reverse() : bigEndianWord));
    }
    return Buffer.from(bytes);
  };

  /** Writes raw bytes to `name` inside the scratch directory and returns the full path. */
  const writeFixture = async (name: string, bytes: Buffer): Promise<string> => {
    const target = path.join(testDir, name);
    await fsPromises.writeFile(target, bytes);
    return target;
  };

  beforeEach(async () => {
    const tmpRoot = await fsPromises.realpath(os.tmpdir());
    testDir = await fsPromises.mkdtemp(
      path.join(tmpRoot, 'fileUtils-bom-test-'),
    );
  });

  afterEach(async () => {
    if (testDir) {
      await fsPromises.rm(testDir, { recursive: true, force: true });
    }
  });

  describe('detectBOM', () => {
    it('should detect UTF-8 BOM', () => {
      const probe = Buffer.from([
        0xef, 0xbb, 0xbf, 0x48, 0x65, 0x6c, 0x6c, 0x6f,
      ]);
      expect(detectBOM(probe)).toEqual({ encoding: 'utf8', bomLength: 3 });
    });

    it('should detect UTF-16 LE BOM', () => {
      const probe = Buffer.from([0xff, 0xfe, 0x48, 0x00, 0x65, 0x00]);
      expect(detectBOM(probe)).toEqual({ encoding: 'utf16le', bomLength: 2 });
    });

    it('should detect UTF-16 BE BOM', () => {
      const probe = Buffer.from([0xfe, 0xff, 0x00, 0x48, 0x00, 0x65]);
      expect(detectBOM(probe)).toEqual({ encoding: 'utf16be', bomLength: 2 });
    });

    it('should detect UTF-32 LE BOM', () => {
      const probe = Buffer.from([
        0xff, 0xfe, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00,
      ]);
      expect(detectBOM(probe)).toEqual({ encoding: 'utf32le', bomLength: 4 });
    });

    it('should detect UTF-32 BE BOM', () => {
      const probe = Buffer.from([
        0x00, 0x00, 0xfe, 0xff, 0x00, 0x00, 0x00, 0x48,
      ]);
      expect(detectBOM(probe)).toEqual({ encoding: 'utf32be', bomLength: 4 });
    });

    it('should return null for no BOM', () => {
      const probe = Buffer.from([0x48, 0x65, 0x6c, 0x6c, 0x6f]);
      expect(detectBOM(probe)).toBeNull();
    });

    it('should return null for empty buffer', () => {
      expect(detectBOM(Buffer.alloc(0))).toBeNull();
    });

    it('should return null for partial BOM', () => {
      // Only the first two bytes of the three-byte UTF-8 BOM.
      const probe = Buffer.from([0xef, 0xbb]);
      expect(detectBOM(probe)).toBeNull();
    });
  });

  describe('readFileWithEncoding', () => {
    /** Writes `bytes` as a fixture and checks that it decodes to `expected`. */
    const expectDecoded = async (
      name: string,
      bytes: Buffer,
      expected: string,
    ): Promise<void> => {
      const fixture = await writeFixture(name, bytes);
      const decoded = await readFileWithEncoding(fixture);
      expect(decoded).toBe(expected);
    };

    it('should read UTF-8 BOM file correctly', async () => {
      await expectDecoded(
        'utf8-bom.txt',
        withBOM(BOM.utf8, Buffer.from(richText, 'utf8')),
        richText,
      );
    });

    it('should read UTF-16 LE BOM file correctly', async () => {
      await expectDecoded(
        'utf16le-bom.txt',
        withBOM(BOM.utf16le, Buffer.from(richText, 'utf16le')),
        richText,
      );
    });

    it('should read UTF-16 BE BOM file correctly', async () => {
      // Encoded by hand, code unit by code unit (the emoji yields a surrogate pair).
      await expectDecoded(
        'utf16be-bom.txt',
        withBOM(BOM.utf16be, toUtf16BE(richText)),
        richText,
      );
    });

    it('should read UTF-32 LE BOM file correctly', async () => {
      await expectDecoded(
        'utf32le-bom.txt',
        withBOM(BOM.utf32le, toUtf32(richText, true)),
        richText,
      );
    });

    it('should read UTF-32 BE BOM file correctly', async () => {
      await expectDecoded(
        'utf32be-bom.txt',
        withBOM(BOM.utf32be, toUtf32(richText, false)),
        richText,
      );
    });

    it('should read file without BOM as UTF-8', async () => {
      const plain = 'Hello, 世界!';
      const fixture = path.join(testDir, 'no-bom.txt');
      await fsPromises.writeFile(fixture, plain, 'utf8');

      expect(await readFileWithEncoding(fixture)).toBe(plain);
    });

    it('should handle empty file', async () => {
      const fixture = path.join(testDir, 'empty.txt');
      await fsPromises.writeFile(fixture, '');

      expect(await readFileWithEncoding(fixture)).toBe('');
    });
  });

  describe('isBinaryFile with BOM awareness', () => {
    /** Writes `bytes` as a fixture and checks the isBinaryFile verdict for it. */
    const expectVerdict = async (
      name: string,
      bytes: Buffer,
      binary: boolean,
    ): Promise<void> => {
      const fixture = await writeFixture(name, bytes);
      const verdict = await isBinaryFile(fixture);
      expect(verdict).toBe(binary);
    };

    it('should not treat UTF-8 BOM file as binary', async () => {
      await expectVerdict(
        'utf8-bom-test.txt',
        withBOM(BOM.utf8, Buffer.from('Hello, world!', 'utf8')),
        false,
      );
    });

    it('should not treat UTF-16 LE BOM file as binary', async () => {
      await expectVerdict(
        'utf16le-bom-test.txt',
        withBOM(BOM.utf16le, Buffer.from('Hello, world!', 'utf16le')),
        false,
      );
    });

    it('should not treat UTF-16 BE BOM file as binary', async () => {
      // Plain ASCII in UTF-16 BE: every character is 0x00 followed by its ASCII byte.
      await expectVerdict(
        'utf16be-bom-test.txt',
        withBOM(BOM.utf16be, toUtf16BE('Hello, world!')),
        false,
      );
    });

    it('should not treat UTF-32 LE BOM file as binary', async () => {
      // "Hello" in UTF-32 LE: the ASCII byte followed by three zero bytes.
      await expectVerdict(
        'utf32le-bom-test.txt',
        withBOM(BOM.utf32le, toUtf32('Hello', true)),
        false,
      );
    });

    it('should not treat UTF-32 BE BOM file as binary', async () => {
      // "Hello" in UTF-32 BE: three zero bytes followed by the ASCII byte.
      await expectVerdict(
        'utf32be-bom-test.txt',
        withBOM(BOM.utf32be, toUtf32('Hello', false)),
        false,
      );
    });

    it('should still treat actual binary file as binary', async () => {
      // PNG signature followed by the start of an IHDR chunk, which contains null bytes.
      const pngSignature = Buffer.from([
        0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
      ]);
      const ihdrStart = Buffer.from([
        0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52,
      ]);
      await expectVerdict(
        'test.png',
        Buffer.concat([pngSignature, ihdrStart]),
        true,
      );
    });

    it('should treat file with null bytes (no BOM) as binary', async () => {
      // "Hello\0world" with no BOM.
      const bytes = Buffer.from([
        0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x00, 0x77, 0x6f, 0x72, 0x6c, 0x64,
      ]);
      await expectVerdict('null-bytes.bin', bytes, true);
    });
  });
});
|
||||
|
||||
describe('detectFileType', () => {
|
||||
let filePathForDetectTest: string;
|
||||
|
||||
|
||||
@@ -19,6 +19,138 @@ const MAX_LINE_LENGTH_TEXT_FILE = 2000;
|
||||
// Default values for encoding and separator format
|
||||
export const DEFAULT_ENCODING: BufferEncoding = 'utf-8';
|
||||
|
||||
// --- Unicode BOM detection & decoding helpers --------------------------------
|
||||
|
||||
/** Unicode encodings that can be identified from a byte order mark. */
type UnicodeEncoding = 'utf8' | 'utf16le' | 'utf16be' | 'utf32le' | 'utf32be';

/** Result of BOM detection: the encoding and how many leading bytes the BOM occupies. */
interface BOMInfo {
  encoding: UnicodeEncoding;
  bomLength: number;
}

/**
 * Known BOM byte signatures, longest first.
 * Order matters: the UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM
 * (FF FE), so the 4-byte signatures must be tried before the 2-byte ones.
 */
const BOM_SIGNATURES: ReadonlyArray<{
  encoding: UnicodeEncoding;
  bytes: readonly number[];
}> = [
  { encoding: 'utf32le', bytes: [0xff, 0xfe, 0x00, 0x00] },
  { encoding: 'utf32be', bytes: [0x00, 0x00, 0xfe, 0xff] },
  { encoding: 'utf8', bytes: [0xef, 0xbb, 0xbf] },
  { encoding: 'utf16le', bytes: [0xff, 0xfe] },
  { encoding: 'utf16be', bytes: [0xfe, 0xff] },
];

/**
 * Detect a Unicode BOM (Byte Order Mark) at the start of a buffer.
 * Inspects at most the first 4 bytes.
 *
 * @param buf Bytes to inspect; may be shorter than a full BOM or empty.
 * @returns The detected encoding and BOM length, or null when no complete BOM is present.
 */
export function detectBOM(buf: Buffer): BOMInfo | null {
  for (const { encoding, bytes } of BOM_SIGNATURES) {
    // A signature only matches when the buffer holds all of its bytes.
    if (
      buf.length >= bytes.length &&
      bytes.every((expected, i) => buf[i] === expected)
    ) {
      // A fresh object per call, so callers never share mutable state.
      return { encoding, bomLength: bytes.length };
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Convert a UTF-16 BE buffer (BOM already stripped) to a JS string.
 * Node has a 'utf16le' decoder but no 'utf16be' one, so the bytes of every
 * 16-bit unit are swapped on a copy and the copy is decoded as UTF-16 LE.
 *
 * A dangling trailing byte (odd-length input, e.g. a truncated file) is ignored.
 *
 * @param buf UTF-16 BE encoded bytes; never modified.
 * @returns The decoded text, or '' when the buffer holds no complete 16-bit unit.
 */
function decodeUTF16BE(buf: Buffer): string {
  // swap16() throws ERR_INVALID_BUFFER_SIZE unless the length is a multiple of 2,
  // so only the even-length prefix is decoded.
  const evenLength = buf.length - (buf.length % 2);
  if (evenLength === 0) return '';
  // Buffer.from(<buffer>) copies the bytes; swap16 mutates in place, and the
  // caller's buffer must stay untouched.
  const swapped = Buffer.from(buf.subarray(0, evenLength));
  swapped.swap16();
  return swapped.toString('utf16le');
}
|
||||
|
||||
/**
 * Decode a UTF-32 buffer (BOM already stripped) into a JS string.
 *
 * Each 4-byte word is read as an unsigned 32-bit code point. Words that are not
 * valid Unicode scalar values (above U+10FFFF, or in the surrogate range
 * U+D800..U+DFFF) are replaced with U+FFFD. A trailing partial word of 1-3 bytes
 * is ignored.
 *
 * @param buf UTF-32 encoded bytes.
 * @param littleEndian true for UTF-32 LE, false for UTF-32 BE.
 * @returns The decoded text ('' when the buffer holds no complete word).
 */
function decodeUTF32(buf: Buffer, littleEndian: boolean): string {
  // Only whole 4-byte words can be decoded.
  const usable = buf.length - (buf.length % 4);
  // Choose the reader once rather than re-testing the byte order for every word.
  const readWord = littleEndian
    ? (offset: number): number => buf.readUInt32LE(offset)
    : (offset: number): number => buf.readUInt32BE(offset);

  let out = '';
  for (let offset = 0; offset < usable; offset += 4) {
    const cp = readWord(offset);
    const isSurrogate = cp >= 0xd800 && cp <= 0xdfff;
    out += cp <= 0x10ffff && !isSurrogate ? String.fromCodePoint(cp) : '\uFFFD';
  }
  return out;
}
|
||||
|
||||
/**
 * Read a file as text, choosing the decoder from its Unicode BOM (UTF-8/16/32).
 * The BOM itself is removed from the returned text. Files without a BOM are
 * decoded as UTF-8.
 *
 * @param filePath Path of the file to read.
 * @returns The decoded file contents ('' for an empty file).
 */
export async function readFileWithEncoding(filePath: string): Promise<string> {
  // The file is read once; detection and decoding both work on this buffer.
  const raw = await fs.promises.readFile(filePath);
  if (raw.length === 0) {
    return '';
  }

  const detected = detectBOM(raw);
  if (detected === null) {
    // No BOM: decode the whole buffer as UTF-8.
    return raw.toString('utf8');
  }

  // Skip the BOM bytes and decode the remainder with the matching decoder.
  const { encoding, bomLength } = detected;
  const payload = raw.subarray(bomLength);

  if (encoding === 'utf16le') {
    return payload.toString('utf16le');
  }
  if (encoding === 'utf16be') {
    return decodeUTF16BE(payload);
  }
  if (encoding === 'utf32le') {
    return decodeUTF32(payload, true);
  }
  if (encoding === 'utf32be') {
    return decodeUTF32(payload, false);
  }
  // 'utf8', plus a defensive fallback for any value outside the union.
  return payload.toString('utf8');
}
|
||||
|
||||
/**
|
||||
* Looks up the specific MIME type for a file path.
|
||||
* @param filePath Path to the file.
|
||||
@@ -57,59 +189,52 @@ export function isWithinRoot(
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if a file is likely binary based on content sampling.
|
||||
* @param filePath Path to the file.
|
||||
* @returns Promise that resolves to true if the file appears to be binary.
|
||||
* Heuristic: determine if a file is likely binary.
|
||||
* Now BOM-aware: if a Unicode BOM is detected, we treat it as text.
|
||||
* For non-BOM files, retain the existing null-byte and non-printable ratio checks.
|
||||
*/
|
||||
export async function isBinaryFile(filePath: string): Promise<boolean> {
|
||||
let fileHandle: fs.promises.FileHandle | undefined;
|
||||
let fh: fs.promises.FileHandle | null = null;
|
||||
try {
|
||||
fileHandle = await fs.promises.open(filePath, 'r');
|
||||
|
||||
// Read up to 4KB or file size, whichever is smaller
|
||||
const stats = await fileHandle.stat();
|
||||
fh = await fs.promises.open(filePath, 'r');
|
||||
const stats = await fh.stat();
|
||||
const fileSize = stats.size;
|
||||
if (fileSize === 0) {
|
||||
// Empty file is not considered binary for content checking
|
||||
return false;
|
||||
}
|
||||
const bufferSize = Math.min(4096, fileSize);
|
||||
const buffer = Buffer.alloc(bufferSize);
|
||||
const result = await fileHandle.read(buffer, 0, buffer.length, 0);
|
||||
const bytesRead = result.bytesRead;
|
||||
if (fileSize === 0) return false; // empty is not binary
|
||||
|
||||
// Sample up to 4KB from the head (previous behavior)
|
||||
const sampleSize = Math.min(4096, fileSize);
|
||||
const buf = Buffer.alloc(sampleSize);
|
||||
const { bytesRead } = await fh.read(buf, 0, sampleSize, 0);
|
||||
if (bytesRead === 0) return false;
|
||||
|
||||
// BOM → text (avoid false positives for UTF‑16/32 with nulls)
|
||||
const bom = detectBOM(buf.subarray(0, Math.min(4, bytesRead)));
|
||||
if (bom) return false;
|
||||
|
||||
let nonPrintableCount = 0;
|
||||
for (let i = 0; i < bytesRead; i++) {
|
||||
if (buffer[i] === 0) return true; // Null byte is a strong indicator
|
||||
if (buffer[i] < 9 || (buffer[i] > 13 && buffer[i] < 32)) {
|
||||
if (buf[i] === 0) return true; // strong indicator of binary when no BOM
|
||||
if (buf[i] < 9 || (buf[i] > 13 && buf[i] < 32)) {
|
||||
nonPrintableCount++;
|
||||
}
|
||||
}
|
||||
// If >30% non-printable characters, consider it binary
|
||||
return nonPrintableCount / bytesRead > 0.3;
|
||||
} catch (error) {
|
||||
// Log error for debugging while maintaining existing behavior
|
||||
console.warn(
|
||||
`Failed to check if file is binary: ${filePath}`,
|
||||
error instanceof Error ? error.message : String(error),
|
||||
);
|
||||
// If any error occurs (e.g. file not found, permissions),
|
||||
// treat as not binary here; let higher-level functions handle existence/access errors.
|
||||
return false;
|
||||
} finally {
|
||||
// Safely close the file handle if it was successfully opened
|
||||
if (fileHandle) {
|
||||
if (fh) {
|
||||
try {
|
||||
await fileHandle.close();
|
||||
await fh.close();
|
||||
} catch (closeError) {
|
||||
// Log close errors for debugging while continuing with cleanup
|
||||
console.warn(
|
||||
`Failed to close file handle for: ${filePath}`,
|
||||
closeError instanceof Error ? closeError.message : String(closeError),
|
||||
);
|
||||
// The important thing is that we attempted to clean up
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -244,14 +369,15 @@ export async function processSingleFileContent(
|
||||
returnDisplay: `Skipped large SVG file (>1MB): ${relativePathForDisplay}`,
|
||||
};
|
||||
}
|
||||
const content = await fileSystemService.readTextFile(filePath);
|
||||
const content = await readFileWithEncoding(filePath);
|
||||
return {
|
||||
llmContent: content,
|
||||
returnDisplay: `Read SVG as text: ${relativePathForDisplay}`,
|
||||
};
|
||||
}
|
||||
case 'text': {
|
||||
const content = await fileSystemService.readTextFile(filePath);
|
||||
// Use BOM-aware reader to avoid leaving a BOM character in content and to support UTF-16/32 transparently
|
||||
const content = await readFileWithEncoding(filePath);
|
||||
const lines = content.split('\n');
|
||||
const originalLineCount = lines.length;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user