mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-12 07:01:09 -07:00
perf(ui): optimize stripUnsafeCharacters with regex (#18413)
This commit is contained in:
@@ -58,9 +58,289 @@ describe('textUtils', () => {
|
||||
});
|
||||
|
||||
describe('stripUnsafeCharacters', () => {
|
||||
it('should not strip tab characters', () => {
|
||||
const input = 'hello world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('hello world');
|
||||
describe('preserved characters', () => {
|
||||
it('should preserve TAB (0x09)', () => {
|
||||
const input = 'hello\tworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('hello\tworld');
|
||||
});
|
||||
|
||||
it('should preserve LF/newline (0x0A)', () => {
|
||||
const input = 'hello\nworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('hello\nworld');
|
||||
});
|
||||
|
||||
it('should preserve CR (0x0D)', () => {
|
||||
const input = 'hello\rworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('hello\rworld');
|
||||
});
|
||||
|
||||
it('should preserve CRLF (0x0D 0x0A)', () => {
|
||||
const input = 'hello\r\nworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('hello\r\nworld');
|
||||
});
|
||||
|
||||
it('should preserve DEL (0x7F)', () => {
|
||||
const input = 'hello\x7Fworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('hello\x7Fworld');
|
||||
});
|
||||
|
||||
it('should preserve all printable ASCII (0x20-0x7E)', () => {
|
||||
const printableAscii =
|
||||
' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~';
|
||||
expect(stripUnsafeCharacters(printableAscii)).toBe(printableAscii);
|
||||
});
|
||||
|
||||
it('should preserve Unicode characters above 0x9F', () => {
|
||||
const input = 'Hello κόσμε 世界 🌍';
|
||||
expect(stripUnsafeCharacters(input)).toBe('Hello κόσμε 世界 🌍');
|
||||
});
|
||||
|
||||
it('should preserve emojis', () => {
|
||||
const input = '🎉 Celebration! 🚀 Launch! 💯';
|
||||
expect(stripUnsafeCharacters(input)).toBe(
|
||||
'🎉 Celebration! 🚀 Launch! 💯',
|
||||
);
|
||||
});
|
||||
|
||||
it('should preserve complex emoji sequences (ZWJ)', () => {
|
||||
const input = 'Family: 👨👩👧👦 Flag: 🏳️🌈';
|
||||
expect(stripUnsafeCharacters(input)).toBe('Family: 👨👩👧👦 Flag: 🏳️🌈');
|
||||
});
|
||||
});
|
||||
|
||||
describe('stripped C0 control characters (0x00-0x1F except TAB/LF/CR)', () => {
|
||||
it('should strip NULL (0x00)', () => {
|
||||
const input = 'hello\x00world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip SOH (0x01)', () => {
|
||||
const input = 'hello\x01world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip STX (0x02)', () => {
|
||||
const input = 'hello\x02world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip ETX (0x03)', () => {
|
||||
const input = 'hello\x03world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip EOT (0x04)', () => {
|
||||
const input = 'hello\x04world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip ENQ (0x05)', () => {
|
||||
const input = 'hello\x05world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip ACK (0x06)', () => {
|
||||
const input = 'hello\x06world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip BELL (0x07)', () => {
|
||||
const input = 'hello\x07world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip BACKSPACE (0x08)', () => {
|
||||
const input = 'hello\x08world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip VT/Vertical Tab (0x0B)', () => {
|
||||
const input = 'hello\x0Bworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip FF/Form Feed (0x0C)', () => {
|
||||
const input = 'hello\x0Cworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip SO (0x0E)', () => {
|
||||
const input = 'hello\x0Eworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip SI (0x0F)', () => {
|
||||
const input = 'hello\x0Fworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip DLE (0x10)', () => {
|
||||
const input = 'hello\x10world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip DC1 (0x11)', () => {
|
||||
const input = 'hello\x11world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip DC2 (0x12)', () => {
|
||||
const input = 'hello\x12world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip DC3 (0x13)', () => {
|
||||
const input = 'hello\x13world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip DC4 (0x14)', () => {
|
||||
const input = 'hello\x14world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip NAK (0x15)', () => {
|
||||
const input = 'hello\x15world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip SYN (0x16)', () => {
|
||||
const input = 'hello\x16world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip ETB (0x17)', () => {
|
||||
const input = 'hello\x17world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip CAN (0x18)', () => {
|
||||
const input = 'hello\x18world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip EM (0x19)', () => {
|
||||
const input = 'hello\x19world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip SUB (0x1A)', () => {
|
||||
const input = 'hello\x1Aworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip FS (0x1C)', () => {
|
||||
const input = 'hello\x1Cworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip GS (0x1D)', () => {
|
||||
const input = 'hello\x1Dworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip RS (0x1E)', () => {
|
||||
const input = 'hello\x1Eworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip US (0x1F)', () => {
|
||||
const input = 'hello\x1Fworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
});
|
||||
|
||||
describe('stripped C1 control characters (0x80-0x9F)', () => {
|
||||
it('should strip all C1 control characters', () => {
|
||||
// Test a few representative C1 control chars
|
||||
expect(stripUnsafeCharacters('hello\x80world')).toBe('helloworld');
|
||||
expect(stripUnsafeCharacters('hello\x85world')).toBe('helloworld'); // NEL
|
||||
expect(stripUnsafeCharacters('hello\x8Aworld')).toBe('helloworld');
|
||||
expect(stripUnsafeCharacters('hello\x90world')).toBe('helloworld');
|
||||
expect(stripUnsafeCharacters('hello\x9Fworld')).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should preserve characters at 0xA0 and above (non-C1)', () => {
|
||||
// 0xA0 is non-breaking space, should be preserved
|
||||
expect(stripUnsafeCharacters('hello\xA0world')).toBe('hello\xA0world');
|
||||
});
|
||||
});
|
||||
|
||||
describe('ANSI escape sequence stripping', () => {
|
||||
it('should strip ANSI color codes', () => {
|
||||
const input = '\x1b[31mRed\x1b[0m text';
|
||||
expect(stripUnsafeCharacters(input)).toBe('Red text');
|
||||
});
|
||||
|
||||
it('should strip ANSI cursor movement codes', () => {
|
||||
const input = 'hello\x1b[9D\x1b[Kworld';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should strip complex ANSI sequences', () => {
|
||||
const input = '\x1b[1;32;40mBold Green on Black\x1b[0m';
|
||||
expect(stripUnsafeCharacters(input)).toBe('Bold Green on Black');
|
||||
});
|
||||
});
|
||||
|
||||
describe('multiple control characters', () => {
|
||||
it('should strip multiple different control characters', () => {
|
||||
const input = 'a\x00b\x01c\x02d\x07e\x08f';
|
||||
expect(stripUnsafeCharacters(input)).toBe('abcdef');
|
||||
});
|
||||
|
||||
it('should handle consecutive control characters', () => {
|
||||
const input = 'hello\x00\x01\x02\x03\x04world';
|
||||
expect(stripUnsafeCharacters(input)).toBe('helloworld');
|
||||
});
|
||||
|
||||
it('should handle mixed preserved and stripped chars', () => {
|
||||
const input = 'line1\n\x00line2\t\x07line3\r\n';
|
||||
expect(stripUnsafeCharacters(input)).toBe('line1\nline2\tline3\r\n');
|
||||
});
|
||||
});
|
||||
|
||||
describe('edge cases', () => {
|
||||
it('should handle empty string', () => {
|
||||
expect(stripUnsafeCharacters('')).toBe('');
|
||||
});
|
||||
|
||||
it('should handle string with only control characters', () => {
|
||||
expect(stripUnsafeCharacters('\x00\x01\x02\x03')).toBe('');
|
||||
});
|
||||
|
||||
it('should handle string with only preserved whitespace', () => {
|
||||
expect(stripUnsafeCharacters('\t\n\r')).toBe('\t\n\r');
|
||||
});
|
||||
|
||||
it('should handle very long strings efficiently', () => {
|
||||
const longString = 'a'.repeat(10000) + '\x00' + 'b'.repeat(10000);
|
||||
const result = stripUnsafeCharacters(longString);
|
||||
expect(result).toBe('a'.repeat(10000) + 'b'.repeat(10000));
|
||||
expect(result.length).toBe(20000);
|
||||
});
|
||||
|
||||
it('should handle surrogate pairs correctly', () => {
|
||||
// 𝌆 is outside BMP (U+1D306)
|
||||
const input = '𝌆hello𝌆';
|
||||
expect(stripUnsafeCharacters(input)).toBe('𝌆hello𝌆');
|
||||
});
|
||||
|
||||
it('should handle mixed BMP and non-BMP characters', () => {
|
||||
const input = 'Hello 世界 🌍 привет';
|
||||
expect(stripUnsafeCharacters(input)).toBe('Hello 世界 🌍 привет');
|
||||
});
|
||||
});
|
||||
|
||||
describe('performance: regex vs array-based', () => {
|
||||
it('should handle real-world terminal output with control chars', () => {
|
||||
// Simulate terminal output with various control sequences
|
||||
const terminalOutput =
|
||||
'\x1b[32mSuccess:\x1b[0m File saved\x07\n\x1b[?25hDone';
|
||||
expect(stripUnsafeCharacters(terminalOutput)).toBe(
|
||||
'Success: File saved\nDone',
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
describe('escapeAnsiCtrlCodes', () => {
|
||||
|
||||
@@ -104,7 +104,7 @@ export function cpSlice(str: string, start: number, end?: number): string {
|
||||
* Characters stripped:
|
||||
* - ANSI escape sequences (via strip-ansi)
|
||||
* - VT control sequences (via Node.js util.stripVTControlCharacters)
|
||||
* - C0 control chars (0x00-0x1F) except CR/LF which are handled elsewhere
|
||||
* - C0 control chars (0x00-0x1F) except TAB(0x09), LF(0x0A), CR(0x0D)
|
||||
* - C1 control chars (0x80-0x9F) that can cause display issues
|
||||
*
|
||||
* Characters preserved:
|
||||
@@ -117,28 +117,11 @@ export function stripUnsafeCharacters(str: string): string {
|
||||
const strippedAnsi = stripAnsi(str);
|
||||
const strippedVT = stripVTControlCharacters(strippedAnsi);
|
||||
|
||||
return toCodePoints(strippedVT)
|
||||
.filter((char) => {
|
||||
const code = char.codePointAt(0);
|
||||
if (code === undefined) return false;
|
||||
|
||||
// Preserve CR/LF/TAB for line handling
|
||||
if (code === 0x0a || code === 0x0d || code === 0x09) return true;
|
||||
|
||||
// Remove C0 control chars (except CR/LF) that can break display
|
||||
// Examples: BELL(0x07) makes noise, BS(0x08) moves cursor, VT(0x0B), FF(0x0C)
|
||||
if (code >= 0x00 && code <= 0x1f) return false;
|
||||
|
||||
// Remove C1 control chars (0x80-0x9f) - legacy 8-bit control codes
|
||||
if (code >= 0x80 && code <= 0x9f) return false;
|
||||
|
||||
// Preserve DEL (0x7f) - it's handled functionally by applyOperations as backspace
|
||||
// and doesn't cause rendering issues when displayed
|
||||
|
||||
// Preserve all other characters including Unicode/emojis
|
||||
return true;
|
||||
})
|
||||
.join('');
|
||||
// Use a regex to strip remaining unsafe control characters
|
||||
// C0: 0x00-0x1F except 0x09 (TAB), 0x0A (LF), 0x0D (CR)
|
||||
// C1: 0x80-0x9F
|
||||
// eslint-disable-next-line no-control-regex
|
||||
return strippedVT.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\x9F]/g, '');
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user