mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-04-15 07:41:03 -07:00
fix(cli): resolve text sanitization data loss due to C1 control characters (#22624)
This commit is contained in:
@@ -266,6 +266,29 @@ describe('textUtils', () => {
|
||||
// 0xA0 is non-breaking space, should be preserved
|
||||
expect(stripUnsafeCharacters('hello\xA0world')).toBe('hello\xA0world');
|
||||
});
|
||||
|
||||
it('should not lose text after DCS (0x90) — regression for data loss', () => {
|
||||
// 0x90 (DCS) starts a Device Control String that stripVTControlCharacters
|
||||
// treats as an unterminated sequence, swallowing all subsequent text.
|
||||
// Stripping C1 chars before VT processing prevents this data loss.
|
||||
expect(stripUnsafeCharacters('important\x90data after DCS')).toBe(
|
||||
'importantdata after DCS',
|
||||
);
|
||||
});
|
||||
|
||||
it('should fully strip 8-bit CSI (0x9B) sequences', () => {
|
||||
// 0x9B (CSI) is equivalent to ESC[. stripAnsi should handle the
|
||||
// whole sequence including parameters.
|
||||
expect(stripUnsafeCharacters('keep\x9B42mthis text')).toBe(
|
||||
'keepthis text',
|
||||
);
|
||||
});
|
||||
|
||||
it('should not lose text when multiple C1 chars precede valid content', () => {
|
||||
expect(stripUnsafeCharacters('start\x90\x9B\x85middle\x80end')).toBe(
|
||||
'startmiddleend',
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('ANSI escape sequence stripping', () => {
|
||||
|
||||
@@ -98,8 +98,16 @@ export function cpSlice(str: string, start: number, end?: number): string {
|
||||
/**
|
||||
* Strip characters that can break terminal rendering.
|
||||
*
|
||||
* Uses Node.js built-in stripVTControlCharacters to handle VT sequences,
|
||||
* then filters remaining control characters that can disrupt display.
|
||||
* This is a strict sanitization function intended for general display
|
||||
* contexts. It strips all C1 control characters (0x80-0x9F) and VT
|
||||
* control sequences. For list display contexts where a more lenient
|
||||
* approach is needed (preserving C1 characters and only stripping ANSI
|
||||
* codes and newlines/tabs), use a separate function instead.
|
||||
*
|
||||
* Processing order:
|
||||
* 1. stripAnsi removes ANSI escape sequences (including 8-bit CSI 0x9B)
|
||||
* 2. Regex strips C0, C1, BiDi, and zero-width control characters
|
||||
* 3. stripVTControlCharacters removes any remaining VT sequences
|
||||
*
|
||||
* Characters stripped:
|
||||
* - ANSI escape sequences (via strip-ansi)
|
||||
@@ -119,18 +127,20 @@ export function cpSlice(str: string, start: number, end?: number): string {
|
||||
*/
|
||||
export function stripUnsafeCharacters(str: string): string {
|
||||
const strippedAnsi = stripAnsi(str);
|
||||
const strippedVT = stripVTControlCharacters(strippedAnsi);
|
||||
|
||||
// Use a regex to strip remaining unsafe control characters
|
||||
// C0: 0x00-0x1F except 0x09 (TAB), 0x0A (LF), 0x0D (CR)
|
||||
// C1: 0x80-0x9F
|
||||
// BiDi: U+200E (LRM), U+200F (RLM), U+202A-U+202E, U+2066-U+2069
|
||||
// Zero-width: U+200B (ZWSP), U+FEFF (BOM)
|
||||
return strippedVT.replace(
|
||||
// After stripAnsi, strip C0, C1, and other unsafe characters via regex.
|
||||
// This is more efficient than multiple replaces and crucially removes C1
|
||||
// characters (e.g., 0x90 DCS) before they can be misinterpreted by
|
||||
// stripVTControlCharacters, which could otherwise cause data loss.
|
||||
const strippedWithRegex = strippedAnsi.replace(
|
||||
// eslint-disable-next-line no-control-regex
|
||||
/[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\x9F\u200E\u200F\u202A-\u202E\u2066-\u2069\u200B\uFEFF]/g,
|
||||
'',
|
||||
);
|
||||
|
||||
// Finally, use stripVTControlCharacters for any remaining VT sequences
|
||||
// that the regex might not cover.
|
||||
return stripVTControlCharacters(strippedWithRegex);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user