mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-03-18 01:51:20 -07:00
fix(core): handle surrogate pairs in truncateString (#22754)
This commit is contained in:
@@ -102,6 +102,44 @@ describe('truncateString', () => {
|
||||
it('should handle empty string', () => {
|
||||
expect(truncateString('', 5)).toBe('');
|
||||
});
|
||||
|
||||
it('should not slice surrogate pairs', () => {
|
||||
const emoji = '😭'; // \uD83D\uDE2D, length 2
|
||||
const str = 'a' + emoji; // length 3
|
||||
|
||||
// We expect 'a' (len 1). Adding the emoji (len 2) would make it 3, exceeding maxLength 2.
|
||||
expect(truncateString(str, 2, '')).toBe('a');
|
||||
expect(truncateString(str, 1, '')).toBe('a');
|
||||
expect(truncateString(emoji, 1, '')).toBe('');
|
||||
expect(truncateString(emoji, 2, '')).toBe(emoji);
|
||||
});
|
||||
|
||||
it('should handle pre-existing dangling high surrogates at the cut point', () => {
|
||||
// \uD83D is a high surrogate without a following low surrogate
|
||||
const str = 'a\uD83Db';
|
||||
// 'a' (1) + '\uD83D' (1) = 2.
|
||||
// BUT our function should strip the dangling surrogate for safety.
|
||||
expect(truncateString(str, 2, '')).toBe('a');
|
||||
});
|
||||
|
||||
it('should handle multi-code-point grapheme clusters like combining marks', () => {
|
||||
// FORCE Decomposed form (NFD) to ensure 'e' + 'accent' are separate code units
|
||||
// This ensures the test behaves the same on Linux and Mac.
|
||||
const combinedChar = 'e\u0301'.normalize('NFD');
|
||||
|
||||
// In NFD, combinedChar.length is 2.
|
||||
const str = 'a' + combinedChar; // 'a' + 'e' + '\u0301' (length 3)
|
||||
|
||||
// Truncating at 2: 'a' (1) + 'e\u0301' (2) = 3. Too long, should stay at 'a'.
|
||||
expect(truncateString(str, 2, '')).toBe('a');
|
||||
expect(truncateString(str, 1, '')).toBe('a');
|
||||
|
||||
// Truncating combinedChar (len 2) at maxLength 1: too long, should be empty.
|
||||
expect(truncateString(combinedChar, 1, '')).toBe('');
|
||||
|
||||
// Truncating combinedChar (len 2) at maxLength 2: fits perfectly.
|
||||
expect(truncateString(combinedChar, 2, '')).toBe(combinedChar);
|
||||
});
|
||||
});
|
||||
|
||||
describe('safeTemplateReplace', () => {
|
||||
|
||||
@@ -80,7 +80,37 @@ export function truncateString(
|
||||
if (str.length <= maxLength) {
|
||||
return str;
|
||||
}
|
||||
return str.slice(0, maxLength) + suffix;
|
||||
|
||||
// This regex approximates a grapheme cluster by hand (ZWJ emoji sequences and regional-indicator flag pairs are NOT kept together):
|
||||
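// 1. A surrogate pair OR a single character (note: without the 's' flag, '.' does not match line terminators such as \n, so they are skipped by the match loop)...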
|
||||
// 2. Followed by any number of "Combining Marks" (\p{M})
|
||||
// 'u' flag is required for Unicode property escapes
|
||||
const graphemeRegex = /(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|.)\p{M}*/gu;
|
||||
|
||||
let truncatedStr = '';
|
||||
let match: RegExpExecArray | null;
|
||||
|
||||
while ((match = graphemeRegex.exec(str)) !== null) {
|
||||
const segment = match[0];
|
||||
|
||||
// If adding the whole cluster (base char + accent) exceeds maxLength, stop.
|
||||
if (truncatedStr.length + segment.length > maxLength) {
|
||||
break;
|
||||
}
|
||||
|
||||
truncatedStr += segment;
|
||||
if (truncatedStr.length >= maxLength) break;
|
||||
}
|
||||
|
||||
// Final safety check for dangling high surrogates
|
||||
if (truncatedStr.length > 0) {
|
||||
const lastCode = truncatedStr.charCodeAt(truncatedStr.length - 1);
|
||||
if (lastCode >= 0xd800 && lastCode <= 0xdbff) {
|
||||
truncatedStr = truncatedStr.slice(0, -1);
|
||||
}
|
||||
}
|
||||
|
||||
return truncatedStr + suffix;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user