Files
gemini-cli/packages/cli/src/ui/utils/textUtils.ts
2026-01-22 16:41:51 +00:00

256 lines
7.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import stripAnsi from 'strip-ansi';
import ansiRegex from 'ansi-regex';
import { stripVTControlCharacters } from 'node:util';
import stringWidth from 'string-width';
import { LRUCache } from 'mnemonist';
import { LRU_BUFFER_PERF_CACHE_LIMIT } from '../constants.js';
/**
 * Calculates the maximum width of a multi-line ASCII art string.
 *
 * Lines are separated by `\n` and each line is measured with
 * `String.prototype.length` (UTF-16 code units).
 *
 * @param asciiArt The ASCII art string.
 * @returns The length of the longest line in the ASCII art, or 0 when the
 * input is empty.
 */
export const getAsciiArtWidth = (asciiArt: string): number => {
  if (!asciiArt) {
    return 0;
  }
  // Track the maximum in a loop instead of `Math.max(...lengths)`: spreading
  // passes one call argument per line, which throws a RangeError once the
  // input has more lines than the engine's argument limit allows.
  let widest = 0;
  for (const line of asciiArt.split('\n')) {
    if (line.length > widest) {
      widest = line.length;
    }
  }
  return widest;
};
/*
* -------------------------------------------------------------------------
* Unicode-aware helpers (work at the code-point level rather than UTF-16
* code units so that surrogate-pair emoji count as one "column".)
* ---------------------------------------------------------------------- */
// Cache for code points.
// toCodePoints only stores (and looks up) strings whose length is at most
// MAX_STRING_LENGTH_TO_CACHE UTF-16 code units; longer strings are always
// re-split.
const MAX_STRING_LENGTH_TO_CACHE = 1000;
// Maps a source string to its code-point array. Constructed with
// LRU_BUFFER_PERF_CACHE_LIMIT from ../constants.js (presumably the maximum
// number of entries kept - confirm against mnemonist's LRUCache).
const codePointsCache = new LRUCache<string, string[]>(
LRU_BUFFER_PERF_CACHE_LIMIT,
);
/**
 * Splits a string into an array with one entry per Unicode code point.
 *
 * Pure-ASCII input is split directly, since every UTF-16 code unit is then a
 * whole code point. Any other input is expanded with `Array.from`, and the
 * result is memoised in `codePointsCache` when the string is no longer than
 * MAX_STRING_LENGTH_TO_CACHE.
 *
 * Note: on a cache hit the stored array itself is returned, so callers
 * should treat the result as read-only.
 *
 * @param str The string to split.
 * @returns The code points of `str`, in order.
 */
export function toCodePoints(str: string): string[] {
  // ASCII fast path: stop scanning at the first code unit above 127.
  let index = 0;
  while (index < str.length && str.charCodeAt(index) <= 127) {
    index++;
  }
  if (index === str.length) {
    return str.split('');
  }

  // Only short strings take part in caching.
  const cacheable = str.length <= MAX_STRING_LENGTH_TO_CACHE;
  if (cacheable) {
    const hit = codePointsCache.get(str);
    if (hit !== undefined) {
      return hit;
    }
  }

  const codePoints = Array.from(str);
  if (cacheable) {
    codePointsCache.set(str, codePoints);
  }
  return codePoints;
}
/**
 * Returns the length of a string measured in code points rather than UTF-16
 * code units, so a surrogate pair counts once.
 *
 * @param str The string to measure.
 * @returns The number of entries produced by `toCodePoints(str)`.
 */
export function cpLen(str: string): number {
  const { length } = toCodePoints(str);
  return length;
}
/**
 * Extracts part of a string using code-point indices instead of UTF-16
 * code-unit indices.
 *
 * `start` and `end` are forwarded to `Array.prototype.slice` on the
 * code-point array, so they follow its rules (e.g. negative values count
 * from the end, `end` is exclusive and optional).
 *
 * @param str The source string.
 * @param start Code-point index of the first element to keep.
 * @param end Code-point index to stop before; defaults to the end.
 * @returns The selected code points joined back into a string.
 */
export function cpSlice(str: string, start: number, end?: number): string {
  const codePoints = toCodePoints(str);
  const selected = codePoints.slice(start, end);
  return selected.join('');
}
/**
 * Removes characters that can break terminal rendering.
 *
 * The input is first passed through strip-ansi and then through Node.js
 * `util.stripVTControlCharacters`; whatever survives is filtered one code
 * point at a time.
 *
 * Removed:
 * - ANSI escape sequences (strip-ansi)
 * - VT control sequences (util.stripVTControlCharacters)
 * - C0 control characters (0x00-0x1F), except TAB, LF and CR
 * - C1 control characters (0x80-0x9F), legacy 8-bit control codes
 *
 * Kept:
 * - TAB (0x09), LF (0x0A) and CR (0x0D), needed for line handling
 * - DEL (0x7F) - handled functionally by applyOperations as backspace and
 *   not a rendering problem when displayed
 * - Every other character, including printable Unicode and emojis
 *
 * @param str The raw text.
 * @returns `str` with the unsafe characters removed.
 */
export function stripUnsafeCharacters(str: string): string {
  const withoutSequences = stripVTControlCharacters(stripAnsi(str));

  let safe = '';
  for (const char of toCodePoints(withoutSequences)) {
    const code = char.codePointAt(0);
    if (code === undefined) {
      continue;
    }
    // TAB / LF / CR are always kept for line handling.
    const isTabOrLineBreak = code === 0x09 || code === 0x0a || code === 0x0d;
    // C0 examples: BELL (0x07) makes noise, BS (0x08) moves the cursor,
    // VT (0x0B), FF (0x0C). Code points are never negative, so an upper
    // bound is all that is needed.
    const isC0Control = code <= 0x1f;
    const isC1Control = code >= 0x80 && code <= 0x9f;
    if (isTabOrLineBreak || !(isC0Control || isC1Control)) {
      safe += char;
    }
  }
  return safe;
}
/**
 * Sanitize a string for display in list-like UI components (e.g. Help, Suggestions).
 *
 * Removes ANSI codes, collapses every run of whitespace characters into a
 * single space, and optionally truncates the result, marking the cut with a
 * trailing "...".
 *
 * @param str The raw text. Empty input yields ''.
 * @param maxLength Optional maximum length of the result, in UTF-16 code
 * units. When omitted (or 0) no truncation happens. When it is too small to
 * hold any text next to the ellipsis, the ellipsis itself is shortened so the
 * result never exceeds `maxLength`.
 * @returns The sanitized, possibly truncated, string.
 */
export function sanitizeForListDisplay(
  str: string,
  maxLength?: number,
): string {
  if (!str) {
    return '';
  }
  const ellipsis = '...';
  let sanitized = stripAnsi(str).replace(/\s+/g, ' ');
  if (maxLength && sanitized.length > maxLength) {
    if (maxLength <= ellipsis.length) {
      // No room for text plus a full ellipsis: shorten the ellipsis rather
      // than returning something longer than the caller asked for.
      return ellipsis.substring(0, maxLength);
    }
    let cut = maxLength - ellipsis.length;
    // Do not cut between the two halves of a surrogate pair; a dangling high
    // surrogate would render as a replacement glyph before the ellipsis.
    const lastKept = sanitized.charCodeAt(cut - 1);
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
      cut--;
    }
    sanitized = sanitized.substring(0, cut) + ellipsis;
  }
  return sanitized;
}
// Cache of computed display widths keyed by the measured string. Filled by
// getCachedStringWidth and emptied by clearStringWidthCache. Constructed with
// LRU_BUFFER_PERF_CACHE_LIMIT (presumably the maximum number of entries -
// confirm against mnemonist's LRUCache).
const stringWidthCache = new LRUCache<string, number>(
LRU_BUFFER_PERF_CACHE_LIMIT,
);
/**
 * Memoised wrapper around `stringWidth`.
 *
 * A single printable ASCII character short-circuits to 1 without touching
 * the cache. Every other string is looked up in `stringWidthCache` and
 * measured only on a miss.
 *
 * @param str The string to measure.
 * @returns The display width of `str`.
 */
export const getCachedStringWidth = (str: string): number => {
  // Very hot path: one printable ASCII char (32-126) always has width 1, so
  // answer it with a plain numeric comparison.
  if (str.length === 1) {
    const unit = str.charCodeAt(0);
    const isPrintableAscii = unit >= 0x20 && unit <= 0x7e;
    if (isPrintableAscii) {
      return 1;
    }
  }

  const known = stringWidthCache.get(str);
  if (known !== undefined) {
    return known;
  }

  const measure = (): number => {
    try {
      return stringWidth(str);
    } catch {
      // Fallback for characters that cause string-width to crash (e.g. U+0602)
      // See: https://github.com/google-gemini/gemini-cli/issues/16418
      return toCodePoints(stripAnsi(str)).length;
    }
  };

  const width = measure();
  stringWidthCache.set(str, width);
  return width;
};
/**
* Clear the string width cache.
*
* Removes every entry from `stringWidthCache`, so later calls to
* getCachedStringWidth measure strings again instead of reusing stored widths.
*/
export const clearStringWidthCache = (): void => {
stringWidthCache.clear();
};
// ANSI escape-sequence matcher, built once at module load and shared by all
// escapeAnsiCtrlCodes calls. That function treats it as a global regex and
// resets `lastIndex` before replacing.
const regex = ansiRegex();
/**
 * Walks a JSON-like value (objects, arrays, primitives) and escapes the ANSI
 * control sequences found in any string it contains.
 *
 * Each matched sequence is replaced by its JSON string-escaped form (the
 * output of `JSON.stringify` without the surrounding quotes), so it becomes
 * visible text instead of being interpreted by the terminal.
 *
 * Containers are copied lazily: an array or object is shallow-copied only
 * once one of its entries actually changes, and an untouched input is
 * returned by reference. Only own enumerable string keys of objects are
 * visited.
 *
 * @param obj The JSON-like value (object, array, string, etc.) to traverse.
 * @returns A value with all nested strings escaped, or the original `obj`
 * reference if nothing needed escaping.
 */
export function escapeAnsiCtrlCodes<T>(obj: T): T {
  if (typeof obj === 'string') {
    const firstMatch = obj.search(regex);
    if (firstMatch === -1) {
      return obj; // Nothing to escape: hand back the original string.
    }
    regex.lastIndex = 0; // needed for global regex
    const escaped = obj.replace(regex, (sequence) => {
      const quoted = JSON.stringify(sequence);
      return quoted.slice(1, -1);
    });
    return escaped as T;
  }

  // Remaining primitives (and null) carry no strings to escape.
  if (typeof obj !== 'object' || obj === null) {
    return obj;
  }

  if (Array.isArray(obj)) {
    let copy: unknown[] | null = null;
    for (const [index, item] of obj.entries()) {
      const escapedItem = escapeAnsiCtrlCodes(item);
      if (escapedItem === item) {
        continue;
      }
      // First changed element: clone the array before writing into it.
      if (copy === null) {
        copy = [...obj];
      }
      copy[index] = escapedItem;
    }
    return (copy ?? obj) as T;
  }

  const source = obj as Record<string, unknown>;
  let clone: Record<string, unknown> | null = null;
  for (const key of Object.keys(source)) {
    const current = source[key];
    const escapedValue = escapeAnsiCtrlCodes(current);
    if (escapedValue === current) {
      continue;
    }
    // First changed property: clone the object before writing into it.
    if (clone === null) {
      clone = { ...source };
    }
    clone[key] = escapedValue;
  }
  return (clone ?? obj) as T;
}