fix(cli): resolve text sanitization data loss due to C1 control characters (#22624)

2026-06-05 00:22:52 -07:00 · 2026-04-10 14:44:28 -04:00
parent 7bba7f437c
commit 565eafc1ec
2 changed files with 42 additions and 9 deletions
@@ -266,6 +266,29 @@ describe('textUtils', () => {
        // 0xA0 is non-breaking space, should be preserved
        expect(stripUnsafeCharacters('hello\xA0world')).toBe('hello\xA0world');
      });
+
+      it('should not lose text after DCS (0x90) — regression for data loss', () => {
+        // 0x90 (DCS) starts a Device Control String that stripVTControlCharacters
+        // treats as an unterminated sequence, swallowing all subsequent text.
+        // Stripping C1 chars before VT processing prevents this data loss.
+        expect(stripUnsafeCharacters('important\x90data after DCS')).toBe(
+          'importantdata after DCS',
+        );
+      });
+
+      it('should fully strip 8-bit CSI (0x9B) sequences', () => {
+        // 0x9B (CSI) is equivalent to ESC[. stripAnsi should handle the
+        // whole sequence including parameters.
+        expect(stripUnsafeCharacters('keep\x9B42mthis text')).toBe(
+          'keepthis text',
+        );
+      });
+
+      it('should not lose text when multiple C1 chars precede valid content', () => {
+        expect(stripUnsafeCharacters('start\x90\x9B\x85middle\x80end')).toBe(
+          'startmiddleend',
+        );
+      });
    });

    describe('ANSI escape sequence stripping', () => {
@@ -98,8 +98,16 @@ export function cpSlice(str: string, start: number, end?: number): string {
 /**
 * Strip characters that can break terminal rendering.
 *
- * Uses Node.js built-in stripVTControlCharacters to handle VT sequences,
- * then filters remaining control characters that can disrupt display.
+ * This is a strict sanitization function intended for general display
+ * contexts. It strips all C1 control characters (0x80-0x9F) and VT
+ * control sequences. For list display contexts where a more lenient
+ * approach is needed (preserving C1 characters and only stripping ANSI
+ * codes and newlines/tabs), use a separate function instead.
+ *
+ * Processing order:
+ * 1. stripAnsi removes ANSI escape sequences (including 8-bit CSI 0x9B)
+ * 2. Regex strips C0, C1, BiDi, and zero-width control characters
+ * 3. stripVTControlCharacters removes any remaining VT sequences
 *
 * Characters stripped:
 * - ANSI escape sequences (via strip-ansi)
@@ -119,18 +127,20 @@ export function cpSlice(str: string, start: number, end?: number): string {
 */
 export function stripUnsafeCharacters(str: string): string {
  const strippedAnsi = stripAnsi(str);
-  const strippedVT = stripVTControlCharacters(strippedAnsi);

-  // Use a regex to strip remaining unsafe control characters
-  // C0: 0x00-0x1F except 0x09 (TAB), 0x0A (LF), 0x0D (CR)
-  // C1: 0x80-0x9F
-  // BiDi: U+200E (LRM), U+200F (RLM), U+202A-U+202E, U+2066-U+2069
-  // Zero-width: U+200B (ZWSP), U+FEFF (BOM)
-  return strippedVT.replace(
+  // Strip unsafe single characters with one regex before the VT pass: C0
+  // (TAB, LF and CR are kept), C1 (0x80-0x9F), BiDi controls, and zero-width
+  // U+200B/U+FEFF. Removing C1 here keeps bytes such as 0x90 (DCS) from
+  // reaching stripVTControlCharacters, which could otherwise cause data loss.
+  const strippedWithRegex = strippedAnsi.replace(
    // eslint-disable-next-line no-control-regex
    /[\x00-\x08\x0B\x0C\x0E-\x1F\x80-\x9F\u200E\u200F\u202A-\u202E\u2066-\u2069\u200B\uFEFF]/g,
    '',
  );
+
+  // Last pass for leftover VT sequences. NOTE(review): the regex above already
+  // removed ESC (0x1B) and 0x9B, so this may be a no-op; confirm it is needed.
+  return stripVTControlCharacters(strippedWithRegex);
 }

 /**