From fc51e50bc6b180d9fd7e5ceda4fc9b898b2a233e Mon Sep 17 00:00:00 2001
From: Sehoon Shon
Date: Tue, 17 Mar 2026 01:41:19 -0400
Subject: [PATCH] fix(core): handle surrogate pairs in truncateString (#22754)

---
 packages/core/src/utils/textUtils.test.ts | 45 +++++++++++++++++++++++
 packages/core/src/utils/textUtils.ts      | 34 ++++++++++++++++-
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/packages/core/src/utils/textUtils.test.ts b/packages/core/src/utils/textUtils.test.ts
index 00143b99e3..c1c572a170 100644
--- a/packages/core/src/utils/textUtils.test.ts
+++ b/packages/core/src/utils/textUtils.test.ts
@@ -102,6 +102,51 @@ describe('truncateString', () => {
   it('should handle empty string', () => {
     expect(truncateString('', 5)).toBe('');
   });
+
+  it('should not slice surrogate pairs', () => {
+    const emoji = '😭'; // \uD83D\uDE2D, length 2
+    const str = 'a' + emoji; // length 3
+
+    // We expect 'a' (len 1). Adding the emoji (len 2) would make it 3, exceeding maxLength 2.
+    expect(truncateString(str, 2, '')).toBe('a');
+    expect(truncateString(str, 1, '')).toBe('a');
+    expect(truncateString(emoji, 1, '')).toBe('');
+    expect(truncateString(emoji, 2, '')).toBe(emoji);
+  });
+
+  it('should handle pre-existing dangling high surrogates at the cut point', () => {
+    // \uD83D is a high surrogate without a following low surrogate
+    const str = 'a\uD83Db';
+    // 'a' (1) + '\uD83D' (1) = 2.
+    // BUT our function should strip the dangling surrogate for safety.
+    expect(truncateString(str, 2, '')).toBe('a');
+  });
+
+  it('should handle multi-code-point grapheme clusters like combining marks', () => {
+    // FORCE Decomposed form (NFD) to ensure 'e' + 'accent' are separate code units
+    // This ensures the test behaves the same on Linux and Mac.
+    const combinedChar = 'e\u0301'.normalize('NFD');
+
+    // In NFD, combinedChar.length is 2.
+    const str = 'a' + combinedChar; // 'a' + 'e' + '\u0301' (length 3)
+
+    // Truncating at 2: 'a' (1) + 'e\u0301' (2) = 3. Too long, should stay at 'a'.
+    expect(truncateString(str, 2, '')).toBe('a');
+    expect(truncateString(str, 1, '')).toBe('a');
+
+    // Truncating combinedChar (len 2) at maxLength 1: too long, should be empty.
+    expect(truncateString(combinedChar, 1, '')).toBe('');
+
+    // Truncating combinedChar (len 2) at maxLength 2: fits perfectly.
+    expect(truncateString(combinedChar, 2, '')).toBe(combinedChar);
+  });
+
+  it('should preserve line terminators inside the kept prefix', () => {
+    // Line terminators are ordinary characters here: they must be kept and
+    // counted towards maxLength, not skipped by the cluster scan.
+    expect(truncateString('a\nb\ncdef', 4, '')).toBe('a\nb\n');
+    expect(truncateString('line1\nline2', 8, '...')).toBe('line1\nli...');
+  });
 });
 
 describe('safeTemplateReplace', () => {
diff --git a/packages/core/src/utils/textUtils.ts b/packages/core/src/utils/textUtils.ts
index 1066896bc4..8d4cbfa6d5 100644
--- a/packages/core/src/utils/textUtils.ts
+++ b/packages/core/src/utils/textUtils.ts
@@ -80,7 +80,39 @@ export function truncateString(
   if (str.length <= maxLength) {
     return str;
   }
-  return str.slice(0, maxLength) + suffix;
+
+  // This regex matches a "Grapheme Cluster" manually:
+  // 1. A surrogate pair OR a single character...
+  // 2. Followed by any number of "Combining Marks" (\p{M})
+  // 'u' flag is required for Unicode property escapes.
+  // 's' flag lets '.' match line terminators too; without it exec() skips
+  // over \n, \r, \u2028 and \u2029, silently deleting them from the output.
+  const graphemeRegex = /(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|.)\p{M}*/gsu;
+
+  let truncatedStr = '';
+  let match: RegExpExecArray | null;
+
+  while ((match = graphemeRegex.exec(str)) !== null) {
+    const segment = match[0];
+
+    // If adding the whole cluster (base char + accent) exceeds maxLength, stop.
+    if (truncatedStr.length + segment.length > maxLength) {
+      break;
+    }
+
+    truncatedStr += segment;
+    if (truncatedStr.length >= maxLength) break;
+  }
+
+  // Final safety check for dangling high surrogates
+  if (truncatedStr.length > 0) {
+    const lastCode = truncatedStr.charCodeAt(truncatedStr.length - 1);
+    if (lastCode >= 0xd800 && lastCode <= 0xdbff) {
+      truncatedStr = truncatedStr.slice(0, -1);
+    }
+  }
+
+  return truncatedStr + suffix;
 }
 
 /**