perf(ui): optimize stripUnsafeCharacters with regex (#18413)

This commit is contained in:
Gaurav
2026-02-05 17:46:23 -08:00
committed by GitHub
parent 4ffc349c18
commit 289769f544
2 changed files with 289 additions and 26 deletions

View File

@@ -58,9 +58,289 @@ describe('textUtils', () => {
});
describe('stripUnsafeCharacters', () => {
it('should not strip tab characters', () => {
const input = 'hello world';
expect(stripUnsafeCharacters(input)).toBe('hello world');
describe('preserved characters', () => {
  // Each entry is a [test title, text] pair. The text is fed to
  // stripUnsafeCharacters and is expected to come back unchanged.
  const untouched: ReadonlyArray<readonly [string, string]> = [
    ['should preserve TAB (0x09)', 'hello\tworld'],
    ['should preserve LF/newline (0x0A)', 'hello\nworld'],
    ['should preserve CR (0x0D)', 'hello\rworld'],
    ['should preserve CRLF (0x0D 0x0A)', 'hello\r\nworld'],
    ['should preserve DEL (0x7F)', 'hello\x7Fworld'],
    [
      'should preserve all printable ASCII (0x20-0x7E)',
      ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~',
    ],
    ['should preserve Unicode characters above 0x9F', 'Hello κόσμε 世界 🌍'],
    ['should preserve emojis', '🎉 Celebration! 🚀 Launch! 💯'],
    [
      'should preserve complex emoji sequences (ZWJ)',
      'Family: 👨‍👩‍👧‍👦 Flag: 🏳️‍🌈',
    ],
  ];

  for (const [title, text] of untouched) {
    it(title, () => {
      expect(stripUnsafeCharacters(text)).toBe(text);
    });
  }
});
describe('stripped C0 control characters (0x00-0x1F except TAB/LF/CR)', () => {
  // [label, code point] for every C0 control character covered by this suite.
  // 0x09, 0x0A, 0x0D and 0x1B have no entry in this table.
  const c0Controls: ReadonlyArray<readonly [string, number]> = [
    ['NULL', 0x00],
    ['SOH', 0x01],
    ['STX', 0x02],
    ['ETX', 0x03],
    ['EOT', 0x04],
    ['ENQ', 0x05],
    ['ACK', 0x06],
    ['BELL', 0x07],
    ['BACKSPACE', 0x08],
    ['VT/Vertical Tab', 0x0b],
    ['FF/Form Feed', 0x0c],
    ['SO', 0x0e],
    ['SI', 0x0f],
    ['DLE', 0x10],
    ['DC1', 0x11],
    ['DC2', 0x12],
    ['DC3', 0x13],
    ['DC4', 0x14],
    ['NAK', 0x15],
    ['SYN', 0x16],
    ['ETB', 0x17],
    ['CAN', 0x18],
    ['EM', 0x19],
    ['SUB', 0x1a],
    ['FS', 0x1c],
    ['GS', 0x1d],
    ['RS', 0x1e],
    ['US', 0x1f],
  ];

  for (const [label, code] of c0Controls) {
    // Two-digit upper-case hex, e.g. 0x0b -> "0B", so titles read "(0x0B)".
    const hex = code.toString(16).toUpperCase().padStart(2, '0');
    it(`should strip ${label} (0x${hex})`, () => {
      const input = 'hello' + String.fromCharCode(code) + 'world';
      expect(stripUnsafeCharacters(input)).toBe('helloworld');
    });
  }
});
describe('stripped C1 control characters (0x80-0x9F)', () => {
  it('should strip all C1 control characters', () => {
    // Check every code point in the C1 block (0x80-0x9F, which includes NEL
    // at 0x85) rather than a sample, so the test covers what its title says.
    // Code points that are not removed are collected so that a failure
    // reports exactly which ones survived.
    const survivors: string[] = [];
    for (let code = 0x80; code <= 0x9f; code++) {
      const input = 'hello' + String.fromCharCode(code) + 'world';
      if (stripUnsafeCharacters(input) !== 'helloworld') {
        survivors.push('0x' + code.toString(16).toUpperCase());
      }
    }
    expect(survivors).toEqual([]);
  });
  it('should preserve characters at 0xA0 and above (non-C1)', () => {
    // 0xA0 is non-breaking space, the first code point after the C1 block;
    // it should be preserved.
    expect(stripUnsafeCharacters('hello\xA0world')).toBe('hello\xA0world');
  });
});
describe('ANSI escape sequence stripping', () => {
  // [test title, raw text containing escape sequences, expected plain text]
  const ansiCases: ReadonlyArray<readonly [string, string, string]> = [
    ['should strip ANSI color codes', '\x1b[31mRed\x1b[0m text', 'Red text'],
    [
      'should strip ANSI cursor movement codes',
      'hello\x1b[9D\x1b[Kworld',
      'helloworld',
    ],
    [
      'should strip complex ANSI sequences',
      '\x1b[1;32;40mBold Green on Black\x1b[0m',
      'Bold Green on Black',
    ],
  ];

  for (const [title, raw, plain] of ansiCases) {
    it(title, () => {
      expect(stripUnsafeCharacters(raw)).toBe(plain);
    });
  }
});
describe('multiple control characters', () => {
  // Control characters interleaved with ordinary letters.
  it('should strip multiple different control characters', () => {
    const cleaned = stripUnsafeCharacters('a\x00b\x01c\x02d\x07e\x08f');
    expect(cleaned).toBe('abcdef');
  });

  // A run of adjacent control characters between two words.
  it('should handle consecutive control characters', () => {
    const cleaned = stripUnsafeCharacters('hello\x00\x01\x02\x03\x04world');
    expect(cleaned).toBe('helloworld');
  });

  // LF, TAB and CRLF must survive while NUL and BELL are dropped.
  it('should handle mixed preserved and stripped chars', () => {
    const cleaned = stripUnsafeCharacters('line1\n\x00line2\t\x07line3\r\n');
    expect(cleaned).toBe('line1\nline2\tline3\r\n');
  });
});
describe('edge cases', () => {
  it('should handle empty string', () => {
    const cleaned = stripUnsafeCharacters('');
    expect(cleaned).toBe('');
  });

  it('should handle string with only control characters', () => {
    const cleaned = stripUnsafeCharacters('\x00\x01\x02\x03');
    expect(cleaned).toBe('');
  });

  it('should handle string with only preserved whitespace', () => {
    const whitespace = '\t\n\r';
    expect(stripUnsafeCharacters(whitespace)).toBe(whitespace);
  });

  it('should handle very long strings efficiently', () => {
    // 10000 'a's and 10000 'b's separated by a single NUL that must vanish.
    const head = 'a'.repeat(10000);
    const tail = 'b'.repeat(10000);
    const cleaned = stripUnsafeCharacters(head + '\x00' + tail);
    expect(cleaned).toBe(head + tail);
    expect(cleaned.length).toBe(20000);
  });

  it('should handle surrogate pairs correctly', () => {
    // 𝌆 is outside BMP (U+1D306)
    const astral = '𝌆hello𝌆';
    expect(stripUnsafeCharacters(astral)).toBe(astral);
  });

  it('should handle mixed BMP and non-BMP characters', () => {
    const mixed = 'Hello 世界 🌍 привет';
    expect(stripUnsafeCharacters(mixed)).toBe(mixed);
  });
});
describe('performance: regex vs array-based', () => {
  it('should handle real-world terminal output with control chars', () => {
    // Terminal-style output: colour codes around "Success:", a BELL after
    // the message, and a "\x1b[?25h" sequence before "Done".
    const raw = '\x1b[32mSuccess:\x1b[0m File saved\x07\n\x1b[?25hDone';
    const cleaned = stripUnsafeCharacters(raw);
    expect(cleaned).toBe('Success: File saved\nDone');
  });
});
});
describe('escapeAnsiCtrlCodes', () => {

View File

@@ -104,7 +104,7 @@ export function cpSlice(str: string, start: number, end?: number): string {
* Characters stripped:
* - ANSI escape sequences (via strip-ansi)
* - VT control sequences (via Node.js util.stripVTControlCharacters)
* - C0 control chars (0x00-0x1F) except CR/LF which are handled elsewhere
* - C0 control chars (0x00-0x1F) except TAB(0x09), LF(0x0A), CR(0x0D)
* - C1 control chars (0x80-0x9F) that can cause display issues
*
* Characters preserved:
@@ -117,28 +117,11 @@ export function stripUnsafeCharacters(str: string): string {
const strippedAnsi = stripAnsi(str);
const strippedVT = stripVTControlCharacters(strippedAnsi);
return toCodePoints(strippedVT)
.filter((char) => {
const code = char.codePointAt(0);
if (code === undefined) return false;
// Preserve CR/LF/TAB for line handling
if (code === 0x0a || code === 0x0d || code === 0x09) return true;
// Remove C0 control chars (except CR/LF) that can break display
// Examples: BELL(0x07) makes noise, BS(0x08) moves cursor, VT(0x0B), FF(0x0C)
if (code >= 0x00 && code <= 0x1f) return false;
// Remove C1 control chars (0x80-0x9f) - legacy 8-bit control codes
if (code >= 0x80 && code <= 0x9f) return false;
// Preserve DEL (0x7f) - it's handled functionally by applyOperations as backspace
// and doesn't cause rendering issues when displayed
// Preserve all other characters including Unicode/emojis
return true;
})
.join('');
// Use a regex to strip remaining unsafe control characters
// C0: 0x00-0x1F except 0x09 (TAB), 0x0A (LF), 0x0D (CR)
// C1: 0x80-0x9F
// eslint-disable-next-line no-control-regex
return strippedVT.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\x9F]/g, '');
}
/**