fix(cli): render LaTeX-style output as Unicode in the TUI (#25802)

Co-authored-by: cynthialong0-0 <82900738+cynthialong0-0@users.noreply.github.com>
This commit is contained in:
Aryan Singh
2026-05-04 23:35:06 +05:30
committed by GitHub
parent 0da1a2026a
commit 77f4be1f3d
4 changed files with 990 additions and 1 deletions
@@ -0,0 +1,304 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect } from 'vitest';
import { convertLatexToUnicode } from './latexToUnicode.js';
describe('convertLatexToUnicode', () => {
describe('fast path', () => {
  // Inputs with neither a backslash nor a dollar sign must come back exactly
  // as they went in.
  const expectUntouched = (sample: string): void => {
    expect(convertLatexToUnicode(sample)).toBe(sample);
  };

  it('returns empty string unchanged', () => {
    expectUntouched('');
  });

  it('returns text without backslash or dollar unchanged', () => {
    expectUntouched('hello world 123');
  });

  it('short-circuits plain ASCII identically', () => {
    expectUntouched('The quick brown fox jumps over the lazy dog.');
  });
});
describe('issue #25656 examples', () => {
  // The two snippets quoted in the bug report, checked end to end.
  it('converts the set-of-processes example', () => {
    const rendered = convertLatexToUnicode(
      'A set of processes $\\{P_0, P_1, \\dots, P_n\\}$ exists',
    );
    expect(rendered).toBe('A set of processes {P₀, P₁, …, Pₙ} exists');
  });

  it('converts the deadlock arrow example', () => {
    const rendered = convertLatexToUnicode(
      'If the graph contains no cycles $\\to$ No Deadlock.',
    );
    expect(rendered).toBe('If the graph contains no cycles → No Deadlock.');
  });
});
describe('math delimiters', () => {
  // Asserts that the converter returns `sample` without any modification.
  const expectVerbatim = (sample: string): void => {
    expect(convertLatexToUnicode(sample)).toBe(sample);
  };

  it('strips $...$ when the content contains LaTeX markers', () => {
    const rendered = convertLatexToUnicode('see $\\alpha$ here');
    expect(rendered).toBe('see α here');
  });

  it('strips $...$ around single variables', () => {
    const rendered = convertLatexToUnicode('let $x$ be a value');
    expect(rendered).toBe('let x be a value');
  });

  it('strips $$...$$ display math', () => {
    const rendered = convertLatexToUnicode('$$\\alpha + \\beta$$');
    expect(rendered).toBe('α + β');
  });

  it('leaves currency $5.99 alone', () => {
    expectVerbatim('It costs $5.99 total');
  });

  it('leaves two dollar amounts alone', () => {
    // `$5 to $` is matched as a candidate pair, but its inner text is neither
    // LaTeX-looking nor a single variable, so the pair is kept as written.
    expectVerbatim('prices range $5 to $10');
  });

  it('leaves shell-style $ interpolation alone', () => {
    expectVerbatim('echo $USER $HOME');
  });

  it('does not strip dollars across newlines', () => {
    expectVerbatim('price $5\nfee $3');
  });
});
describe('greek letters', () => {
  it('converts lowercase greek', () => {
    const rendered = convertLatexToUnicode('\\alpha \\beta \\gamma');
    expect(rendered).toBe('α β γ');
  });

  it('converts uppercase greek', () => {
    const rendered = convertLatexToUnicode('\\Omega \\Delta');
    expect(rendered).toBe('Ω Δ');
  });

  it('does not mangle a prefix match', () => {
    // `\alphabet` merely starts with a known command name; it is not one
    // itself and has to survive unchanged.
    const sample = '\\alphabet';
    expect(convertLatexToUnicode(sample)).toBe('\\alphabet');
  });
});
describe('named commands', () => {
  it('converts arrows', () => {
    expect(convertLatexToUnicode('\\to \\rightarrow \\Rightarrow')).toBe(
      '→ → ⇒',
    );
  });
  it('converts relations', () => {
    expect(convertLatexToUnicode('\\leq \\geq \\neq \\approx')).toBe(
      '≤ ≥ ≠ ≈',
    );
  });
  it('converts set theory', () => {
    // Four commands in, four glyphs out: ∈ (in), ∉ (notin), ∪ (cup), ∩ (cap).
    // The expectation previously listed only three glyphs, dropping the union
    // sign for `\cup`.
    expect(convertLatexToUnicode('\\in \\notin \\cup \\cap')).toBe(
      '∈ ∉ ∪ ∩',
    );
  });
  it('converts logic', () => {
    expect(convertLatexToUnicode('\\forall x \\exists y')).toBe('∀ x ∃ y');
  });
  it('converts large operators', () => {
    expect(convertLatexToUnicode('\\sum \\prod \\int')).toBe('∑ ∏ ∫');
  });
  it('converts ellipses', () => {
    expect(convertLatexToUnicode('a, b, \\dots, z')).toBe('a, b, …, z');
  });
  it('converts infty', () => {
    expect(convertLatexToUnicode('\\infty')).toBe('∞');
  });
  it('leaves unknown commands untouched', () => {
    expect(convertLatexToUnicode('\\thisIsNotReal')).toBe('\\thisIsNotReal');
  });
});
describe('escaped specials', () => {
  it('unescapes braces and underscore', () => {
    const rendered = convertLatexToUnicode('\\{ \\} \\_');
    expect(rendered).toBe('{ } _');
  });

  it('unescapes percent, ampersand, hash, dollar, pipe', () => {
    const rendered = convertLatexToUnicode('\\% \\& \\# \\$ \\|');
    expect(rendered).toBe('% & # $ |');
  });

  it('unescapes backslash-space as a regular space', () => {
    const rendered = convertLatexToUnicode('word\\ boundary');
    expect(rendered).toBe('word boundary');
  });

  it('converts \\\\ to a newline inside math mode', () => {
    // A double backslash is a LaTeX line break in math/tabular contexts. It is
    // only rewritten inside `$...$`; in prose it would corrupt Windows UNC
    // paths (`\\server\share`) and escaped backslashes in code-like text.
    const rendered = convertLatexToUnicode('$a\\\\b$');
    expect(rendered).toBe('a\nb');
  });

  it('leaves \\\\ alone outside math mode', () => {
    const rendered = convertLatexToUnicode('line1\\\\line2');
    expect(rendered).toBe('line1\\\\line2');
  });
});
describe('text formatting', () => {
  // Bold/italic commands become markdown markers; purely typographic wrappers
  // are dropped and only their argument is kept.
  it('wraps textbf in markdown bold', () => {
    const rendered = convertLatexToUnicode('\\textbf{hello}');
    expect(rendered).toBe('**hello**');
  });

  it('wraps textit in markdown italic', () => {
    const rendered = convertLatexToUnicode('\\textit{hello}');
    expect(rendered).toBe('*hello*');
  });

  it('strips \\text wrapper', () => {
    const rendered = convertLatexToUnicode('\\text{plain}');
    expect(rendered).toBe('plain');
  });

  it('strips \\mathrm', () => {
    const rendered = convertLatexToUnicode('\\mathrm{foo}');
    expect(rendered).toBe('foo');
  });

  it('handles \\emph as italic', () => {
    const rendered = convertLatexToUnicode('\\emph{emphasized}');
    expect(rendered).toBe('*emphasized*');
  });
});
describe('fractions and roots', () => {
  it('converts \\frac', () => {
    const rendered = convertLatexToUnicode('\\frac{a}{b}');
    expect(rendered).toBe('(a)/(b)');
  });

  it('converts \\sqrt', () => {
    const rendered = convertLatexToUnicode('\\sqrt{x}');
    expect(rendered).toBe('√(x)');
  });

  it('converts \\sqrt with index', () => {
    const rendered = convertLatexToUnicode('\\sqrt[3]{x}');
    expect(rendered).toBe('3√(x)');
  });

  it('converts \\frac combined with greek', () => {
    // The fraction is rewritten first; the Greek commands inside the
    // parentheses are converted by a later pass.
    const rendered = convertLatexToUnicode('\\frac{\\alpha}{\\beta}');
    expect(rendered).toBe('(α)/(β)');
  });
});
describe('subscripts and superscripts', () => {
  // Sub/superscript rewriting is restricted to math delimiters so that
  // identifiers such as `file_name` and `foo_bar` in ordinary prose are never
  // touched.
  const expectVerbatim = (sample: string): void => {
    expect(convertLatexToUnicode(sample)).toBe(sample);
  };

  it('converts digit subscripts inside math', () => {
    const rendered = convertLatexToUnicode('$x_0 + x_1 + x_2$');
    expect(rendered).toBe('x₀ + x₁ + x₂');
  });

  it('converts digit superscripts inside math', () => {
    const rendered = convertLatexToUnicode('$E = mc^2$');
    expect(rendered).toBe('E = mc²');
  });

  it('converts letter subscripts where available', () => {
    const rendered = convertLatexToUnicode('$P_n$ and $x_i$');
    expect(rendered).toBe('Pₙ and xᵢ');
  });

  it('converts braced digit subscripts', () => {
    const rendered = convertLatexToUnicode('$x_{12}$');
    expect(rendered).toBe('x₁₂');
  });

  it('leaves subscripts with no unicode mapping alone', () => {
    // Unicode has no subscript `q`, so the whole braced operand is kept rather
    // than producing a half-converted result. Only the `$` delimiters go.
    const rendered = convertLatexToUnicode('$x_{abq}$');
    expect(rendered).toBe('x_{abq}');
  });

  it('does not subscript identifiers in prose', () => {
    // Outside math delimiters `_` is never interpreted, which keeps
    // snake_case identifiers and file paths intact. The cost is that
    // subscripts the model emits without `$...$` stay unconverted.
    expectVerbatim('the file_name variable');
    expectVerbatim('_private');
  });

  it('does not superscript when character is unmapped in sup', () => {
    // `Q` has no superscript glyph, so `^Q` stays as typed even in math mode.
    const rendered = convertLatexToUnicode('$x^Q$');
    expect(rendered).toBe('x^Q');
  });

  it('leaves bare x_0 alone outside math', () => {
    // In free prose `P_0` (a subscript) cannot be told apart from `my_0` (an
    // identifier), so identifiers win.
    expectVerbatim('x_0 is fine');
  });
});
describe('protection of non-LaTeX content', () => {
  // Every sample here contains `\` or `$` for reasons unrelated to LaTeX and
  // must be returned byte-for-byte.
  const expectVerbatim = (sample: string): void => {
    expect(convertLatexToUnicode(sample)).toBe(sample);
  };

  it('leaves Windows paths alone', () => {
    expectVerbatim('C:\\Users\\foo\\bar');
  });

  it('leaves Windows UNC paths alone (no line-break rewrite in prose)', () => {
    // `\\server\share\file` must NOT have its leading `\\` turned into a
    // newline; line-break conversion only happens in math mode. See PR #25802.
    expectVerbatim('\\\\server\\share\\file');
  });

  it('leaves regex backslash escapes alone', () => {
    expectVerbatim('\\d+\\w*');
  });

  it('leaves $ in code-like prose alone', () => {
    expectVerbatim('run $(command)$ to see output');
  });
});
describe('combined scenarios', () => {
  it('handles complex math in prose', () => {
    const rendered = convertLatexToUnicode(
      'The complexity is $O(n \\log n)$ for sorting $n$ elements.',
    );
    expect(rendered).toBe(
      'The complexity is O(n log n) for sorting n elements.',
    );
  });

  it('handles multiple constructs in one line', () => {
    const rendered = convertLatexToUnicode(
      'Let $\\alpha \\in \\mathbb{R}$ and $\\beta \\geq 0$.',
    );
    expect(rendered).toBe('Let α ∈ R and β ≥ 0.');
  });

  it('preserves surrounding text exactly', () => {
    const rendered = convertLatexToUnicode('Before $\\to$ after.');
    expect(rendered).toBe('Before → after.');
  });

  it('idempotency — running twice yields the same result', () => {
    // Feeding the converter its own output must be a no-op.
    const firstPass = convertLatexToUnicode('$\\{P_0, \\dots, P_n\\}$');
    const secondPass = convertLatexToUnicode(firstPass);
    expect(secondPass).toBe(firstPass);
  });
});
});
+599
View File
@@ -0,0 +1,599 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* Converts common LaTeX-style syntax in model output into terminal-friendly
* Unicode (and lightweight markdown where appropriate).
*
* Terminals cannot natively render LaTeX, but model responses — especially for
* math, CS, and algorithms — frequently include constructs like `$\{P_0,
* \dots, P_n\}$` or `$\to$`. Left as-is, the raw backslash commands show up
* verbatim and make the output look broken.
*
* This function is a conservative, lossy post-processor that handles the
* common cases and leaves anything it does not recognise untouched, so that
* legitimate backslash content (e.g. Windows paths, regex examples) is not
* mangled.
*
* See issue #25656.
*/
// Lookup table from LaTeX Greek-letter command names (without the leading
// backslash) to the corresponding Unicode letter. Rows are grouped as
// lowercase, uppercase, and the "var" variants; the resulting object is frozen.
const GREEK_LETTERS: Readonly<Record<string, string>> = Object.freeze(
  Object.fromEntries<string>([
    // Lowercase
    ['alpha', 'α'], ['beta', 'β'], ['gamma', 'γ'], ['delta', 'δ'],
    ['epsilon', 'ε'], ['zeta', 'ζ'], ['eta', 'η'], ['theta', 'θ'],
    ['iota', 'ι'], ['kappa', 'κ'], ['lambda', 'λ'], ['mu', 'μ'],
    ['nu', 'ν'], ['xi', 'ξ'], ['omicron', 'ο'], ['pi', 'π'],
    ['rho', 'ρ'], ['sigma', 'σ'], ['tau', 'τ'], ['upsilon', 'υ'],
    ['phi', 'φ'], ['chi', 'χ'], ['psi', 'ψ'], ['omega', 'ω'],
    // Uppercase
    ['Alpha', 'Α'], ['Beta', 'Β'], ['Gamma', 'Γ'], ['Delta', 'Δ'],
    ['Epsilon', 'Ε'], ['Zeta', 'Ζ'], ['Eta', 'Η'], ['Theta', 'Θ'],
    ['Iota', 'Ι'], ['Kappa', 'Κ'], ['Lambda', 'Λ'], ['Mu', 'Μ'],
    ['Nu', 'Ν'], ['Xi', 'Ξ'], ['Omicron', 'Ο'], ['Pi', 'Π'],
    ['Rho', 'Ρ'], ['Sigma', 'Σ'], ['Tau', 'Τ'], ['Upsilon', 'Υ'],
    ['Phi', 'Φ'], ['Chi', 'Χ'], ['Psi', 'Ψ'], ['Omega', 'Ω'],
    // "var" variants
    ['varepsilon', 'ε'], ['vartheta', 'ϑ'], ['varphi', 'φ'],
    ['varrho', 'ϱ'], ['varsigma', 'ς'], ['varpi', 'ϖ'],
  ]),
);
// Named LaTeX commands → Unicode. Covers arrows, relations, set theory,
// logic, large operators, and a handful of common decorations. Anything not
// listed here is deliberately left untouched.
//
// Every alphabetic entry must map to a non-empty string: an empty value makes
// the command vanish from the rendered output instead of being converted.
// Eleven entries (`ast`, `sim`, `cup`, `setminus`, `lor`, `vee`, `ell`, `Re`,
// `Im`, `prime`, `top`) had lost their glyphs and are restored below.
const LATEX_COMMANDS: Readonly<Record<string, string>> = Object.freeze({
  // Arrows
  to: '→',
  rightarrow: '→',
  Rightarrow: '⇒',
  leftarrow: '←',
  Leftarrow: '⇐',
  leftrightarrow: '↔',
  Leftrightarrow: '⇔',
  mapsto: '↦',
  longrightarrow: '⟶',
  longleftarrow: '⟵',
  longleftrightarrow: '⟷',
  uparrow: '↑',
  downarrow: '↓',
  Uparrow: '⇑',
  Downarrow: '⇓',
  hookrightarrow: '↪',
  hookleftarrow: '↩',
  // Ellipses
  dots: '…',
  ldots: '…',
  cdots: '⋯',
  vdots: '⋮',
  ddots: '⋱',
  // Arithmetic / comparison
  times: '×',
  cdot: '·',
  div: '÷',
  pm: '±',
  mp: '∓',
  ast: '∗',
  leq: '≤',
  le: '≤',
  geq: '≥',
  ge: '≥',
  neq: '≠',
  ne: '≠',
  ll: '≪',
  gg: '≫',
  approx: '≈',
  equiv: '≡',
  sim: '∼',
  simeq: '≃',
  cong: '≅',
  propto: '∝',
  // Set theory
  in: '∈',
  notin: '∉',
  ni: '∋',
  subset: '⊂',
  supset: '⊃',
  subseteq: '⊆',
  supseteq: '⊇',
  cup: '∪',
  cap: '∩',
  setminus: '∖',
  emptyset: '∅',
  varnothing: '∅',
  // Logic
  forall: '∀',
  exists: '∃',
  nexists: '∄',
  neg: '¬',
  lnot: '¬',
  land: '∧',
  wedge: '∧',
  lor: '∨',
  vee: '∨',
  oplus: '⊕',
  otimes: '⊗',
  implies: '⟹',
  iff: '⟺',
  // Large operators
  sum: '∑',
  prod: '∏',
  coprod: '∐',
  int: '∫',
  iint: '∬',
  iiint: '∭',
  oint: '∮',
  // Calculus
  partial: '∂',
  nabla: '∇',
  infty: '∞',
  // Misc letters / constants
  ell: 'ℓ',
  hbar: 'ℏ',
  Re: 'ℜ',
  Im: 'ℑ',
  aleph: 'ℵ',
  beth: 'ℶ',
  // Brackets / delimiters
  lbrace: '{',
  rbrace: '}',
  lbrack: '[',
  rbrack: ']',
  langle: '⟨',
  rangle: '⟩',
  lceil: '⌈',
  rceil: '⌉',
  lfloor: '⌊',
  rfloor: '⌋',
  // Geometry / misc
  perp: '⊥',
  parallel: '∥',
  angle: '∠',
  triangle: '△',
  square: '□',
  circ: '∘',
  bullet: '•',
  star: '⋆',
  prime: '′',
  dag: '†',
  ddag: '‡',
  therefore: '∴',
  because: '∵',
  top: '⊤',
  bot: '⊥',
  // Operator names (`\log`, `\sin`, …) render in LaTeX as upright text. In a
  // terminal the closest equivalent is the lowercase word itself.
  log: 'log',
  ln: 'ln',
  lg: 'lg',
  exp: 'exp',
  sin: 'sin',
  cos: 'cos',
  tan: 'tan',
  cot: 'cot',
  sec: 'sec',
  csc: 'csc',
  arcsin: 'arcsin',
  arccos: 'arccos',
  arctan: 'arctan',
  sinh: 'sinh',
  cosh: 'cosh',
  tanh: 'tanh',
  max: 'max',
  min: 'min',
  sup: 'sup',
  inf: 'inf',
  lim: 'lim',
  limsup: 'lim sup',
  liminf: 'lim inf',
  arg: 'arg',
  det: 'det',
  dim: 'dim',
  ker: 'ker',
  gcd: 'gcd',
  deg: 'deg',
  hom: 'hom',
  mod: 'mod',
  bmod: 'mod',
  pmod: 'mod',
  // Whitespace commands — render as visible space so layout is roughly right.
  quad: ' ',
  qquad: ' ',
  // These are all "thin-space" style commands in LaTeX; render as a single
  // space so the surrounding tokens don't jam together. `\!` is a negative
  // space and is the one entry that intentionally maps to the empty string.
  ',': ' ',
  ';': ' ',
  ':': ' ',
  '!': '',
});
// ASCII character → Unicode subscript glyph. Contains the digits, a few
// operators/parentheses, and only those letters for which the table provides
// a subscript form; characters absent from the table have no mapping.
const SUBSCRIPT_MAP: Readonly<Record<string, string>> = Object.freeze(
  Object.fromEntries<string>([
    // Digits
    ['0', '₀'], ['1', '₁'], ['2', '₂'], ['3', '₃'], ['4', '₄'],
    ['5', '₅'], ['6', '₆'], ['7', '₇'], ['8', '₈'], ['9', '₉'],
    // Operators and parentheses
    ['+', '₊'], ['-', '₋'], ['=', '₌'], ['(', '₍'], [')', '₎'],
    // Letters
    ['a', 'ₐ'], ['e', 'ₑ'], ['h', 'ₕ'], ['i', 'ᵢ'], ['j', 'ⱼ'],
    ['k', 'ₖ'], ['l', 'ₗ'], ['m', 'ₘ'], ['n', 'ₙ'], ['o', 'ₒ'],
    ['p', 'ₚ'], ['r', 'ᵣ'], ['s', 'ₛ'], ['t', 'ₜ'], ['u', 'ᵤ'],
    ['v', 'ᵥ'], ['x', 'ₓ'],
  ]),
);
// ASCII character → Unicode superscript glyph. Covers the same digits and
// operators as the subscript table plus every lowercase letter except `q`.
const SUPERSCRIPT_MAP: Readonly<Record<string, string>> = Object.freeze(
  Object.fromEntries<string>([
    // Digits
    ['0', '⁰'], ['1', '¹'], ['2', '²'], ['3', '³'], ['4', '⁴'],
    ['5', '⁵'], ['6', '⁶'], ['7', '⁷'], ['8', '⁸'], ['9', '⁹'],
    // Operators and parentheses
    ['+', '⁺'], ['-', '⁻'], ['=', '⁼'], ['(', '⁽'], [')', '⁾'],
    // Letters
    ['a', 'ᵃ'], ['b', 'ᵇ'], ['c', 'ᶜ'], ['d', 'ᵈ'], ['e', 'ᵉ'],
    ['f', 'ᶠ'], ['g', 'ᵍ'], ['h', 'ʰ'], ['i', 'ⁱ'], ['j', 'ʲ'],
    ['k', 'ᵏ'], ['l', 'ˡ'], ['m', 'ᵐ'], ['n', 'ⁿ'], ['o', 'ᵒ'],
    ['p', 'ᵖ'], ['r', 'ʳ'], ['s', 'ˢ'], ['t', 'ᵗ'], ['u', 'ᵘ'],
    ['v', 'ᵛ'], ['w', 'ʷ'], ['x', 'ˣ'], ['y', 'ʸ'], ['z', 'ᶻ'],
  ]),
);
/**
 * Strips `$...$` and `$$...$$` math delimiters when the inner content looks
 * like math, applying the full set of math-mode conversions (including
 * sub/superscripts) to the inner text. The goal is to handle model output
 * without eating dollar signs that appear in ordinary prose (prices,
 * shell examples, etc.).
 *
 * A pair of `$...$` is treated as math when the inner text either:
 *   - contains a LaTeX marker (a backslash, `_`, or `^`), or
 *   - is a single letter, possibly with whitespace padding (e.g. `$x$`,
 *     `$ n $`). Shell-style variables like `$USER` are LEFT intact because
 *     multi-letter all-caps sequences look much more like shell vars than
 *     math in practice.
 *
 * `$$...$$` display regions are always converted and may span lines.
 *
 * An escaped dollar (`\$`) is a literal dollar sign, never a delimiter: it
 * cannot open a region, and inside a region it is consumed as part of the
 * content. Without this rule `costs \$5 and \$10` was read as the math region
 * `$5 and \$` and came out as `costs \5 and \10`. The check is a single
 * preceding backslash, so a `$` directly after an escaped backslash (`\\$`)
 * is conservatively not treated as an opener either.
 *
 * A currency expression like `$5.99` (single `$`) never matches the pair
 * regex. `From $5 to $10` matches `$5 to $` as a pair but the inner text is
 * neither mathy nor a single variable, so it is left intact.
 *
 * @param text Raw text that may contain dollar-delimited math.
 * @returns The text with recognised math regions replaced by their converted
 *   content; everything else is unchanged.
 */
function stripMathDelimiters(text: string): string {
  // Display math first. The body is a run of escape pairs (`\` + any char)
  // and characters that are neither `$` nor `\`, so `\$` inside the region
  // does not terminate it. The two alternatives start with different
  // characters, which keeps matching linear.
  let out = text.replace(
    /(?<!\\)\$\$((?:\\[\s\S]|[^$\\])+)\$\$/g,
    (_, inner: string) => applyMathModeConversions(inner),
  );
  // Inline math: single-line only, to avoid eating across paragraphs. Same
  // escape-pair body as above, but `.` and the negated class both exclude
  // newlines.
  out = out.replace(
    /(?<!\\)\$((?:\\.|[^$\\\n])+?)\$/g,
    (match, inner: string) => {
      // Any backslash, underscore, or caret marks the content as LaTeX.
      const hasLatexMarkers = /[\\_^]/.test(inner);
      const isSingleVariable = /^\s*[A-Za-z]\s*$/.test(inner);
      if (hasLatexMarkers || isSingleVariable) {
        return applyMathModeConversions(inner);
      }
      // Looks like prose (prices, shell variables): keep the dollars.
      return match;
    },
  );
  return out;
}
/**
 * Rewrites single-argument LaTeX text-styling commands:
 *   - `\textbf{..}` / `\mathbf{..}`            → `**..**` (markdown bold)
 *   - `\textit{..}` / `\emph{..}` / `\mathit{..}` → `*..*` (markdown italic)
 *   - `\text`, `\mathrm`, `\mathsf`, `\mathtt`, `\mathbb`, `\mathcal`,
 *     `\mathfrak`, `\operatorname`              → the bare argument
 *
 * The argument may not contain braces, which keeps each pattern bounded.
 * Rules run one after another in the order above, each over the previous
 * rule's output, so a wrapper around an already-converted inner command is
 * still resolved (`\text{\textbf{x}}` ends up as `**x**`).
 */
function convertTextFormatting(text: string): string {
  const rules: ReadonlyArray<readonly [RegExp, (body: string) => string]> = [
    [/\\(?:textbf|mathbf)\{([^{}]*)\}/g, (body) => `**${body}**`],
    [/\\(?:textit|emph|mathit)\{([^{}]*)\}/g, (body) => `*${body}*`],
    [
      /\\(?:text|mathrm|mathsf|mathtt|mathbb|mathcal|mathfrak|operatorname)\{([^{}]*)\}/g,
      (body) => body,
    ],
  ];
  return rules.reduce(
    (acc, [pattern, wrap]) =>
      // A replacer function is used so `$` in the argument is never
      // interpreted as a replacement pattern.
      acc.replace(pattern, (_whole, body: string) => wrap(body)),
    text,
  );
}
/**
 * Flattens fractions and roots into inline text:
 *   - `\frac{a}{b}`  → `(a)/(b)`
 *   - `\sqrt[n]{x}`  → `n√(x)`
 *   - `\sqrt{x}`     → `√(x)`
 *
 * Arguments may not contain braces, so only one nesting level is handled; a
 * fraction whose argument still contains braces is left as written.
 */
function convertFractionsAndRoots(text: string): string {
  return (
    text
      .replace(/\\frac\{([^{}]*)\}\{([^{}]*)\}/g, '($1)/($2)')
      // Indexed roots go first so the plain-root rule never sees `\sqrt[`.
      .replace(/\\sqrt\[([^\]]*)\]\{([^{}]*)\}/g, '$1√($2)')
      .replace(/\\sqrt\{([^{}]*)\}/g, '√($1)')
  );
}
/**
 * Drops the backslash from escaped single-character specials (`\{` → `{`,
 * `\_` → `_`, and likewise for `}`, `[`, `]`, `%`, `&`, `#`, `$`, `|`), and
 * turns backslash-space into an ordinary space.
 *
 * The character set is deliberately small. A double backslash (LaTeX line
 * break) is not matched by either pattern and is handled by a separate
 * function.
 */
function convertEscapedSpecials(text: string): string {
  return (
    text
      // `$1` re-inserts the captured special; the captured text itself is
      // copied literally even when it is a `$`.
      .replace(/\\([{}[\]_%&#$|])/g, '$1')
      // `\ ` is an explicit inter-word space in LaTeX; a plain space is
      // enough to keep the neighbouring words apart.
      .replace(/\\ /g, ' ')
  );
}
/**
 * Converts named commands (alphabetic control sequences such as `\alpha` or
 * `\to`) to Unicode. The Greek table is consulted first, then the general
 * command table. Anything not in the tables is left as-is so unrelated
 * backslash content (e.g. Windows paths) is not disturbed.
 *
 * Lookups are restricted to the tables' OWN properties. The tables are plain
 * objects, so indexing them with an arbitrary name also finds members
 * inherited from `Object.prototype`: `\constructor`, `\toString`, `\valueOf`,
 * `\hasOwnProperty`, … would resolve to functions and be replaced by their
 * source text (for example in a path like `C:\src\constructor`).
 *
 * @param text Text possibly containing `\name` sequences.
 * @returns The text with every known command replaced by its mapping.
 */
function convertNamedCommands(text: string): string {
  const ownEntry = (
    table: Readonly<Record<string, string>>,
    key: string,
  ): string | undefined =>
    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;

  // The negative lookahead is a belt-and-braces guard: `[A-Za-z]+` is greedy,
  // so the name always extends to the last consecutive letter and a known
  // command can never match as a prefix of a longer word (`\alphabet`).
  return text.replace(
    /\\([A-Za-z]+)(?![A-Za-z])/g,
    (match, name: string) =>
      ownEntry(GREEK_LETTERS, name) ?? ownEntry(LATEX_COMMANDS, name) ?? match,
  );
}
/**
 * Handles LaTeX's punctuation-named spacing commands, which the alphabetic
 * command pass cannot see because its pattern only accepts letters:
 *   - `\,`, `\;`, `\:` become a single space
 *   - `\!` (negative space) is removed
 */
function convertPunctuationCommands(text: string): string {
  // The pattern only ever captures one of `,`, `;`, `:`, `!`, so everything
  // that is not `!` is one of the three positive-space commands.
  return text.replace(/\\([,;:!])/g, (_command, symbol: string) =>
    symbol === '!' ? '' : ' ',
  );
}
/**
 * Replaces every LaTeX line-break command (two consecutive backslashes, as
 * used inside math environments and tables) with a newline character.
 * Pairs are consumed left to right, so an odd run of backslashes leaves one
 * trailing backslash behind.
 */
function convertLineBreaks(text: string): string {
  // Splitting on the two-character sequence and re-joining with `\n` performs
  // the same non-overlapping, left-to-right substitution as a global replace.
  return text.split('\\\\').join('\n');
}
/**
 * Rewrites `_` / `^` operands as Unicode subscripts / superscripts.
 *
 * Braced operands (`x_{12}`, `x^{ab}`) are converted only when every
 * character has a glyph in the relevant table; otherwise the operand is kept
 * verbatim, since a half-converted operand looks worse than none. Unbraced
 * operands (`x_0`, `x^2`) are a single character following a letter, digit,
 * `)` or `]`, and are converted only when that character has a glyph.
 *
 * Passes run in a fixed order: braced subscripts, braced superscripts, then
 * single-character subscripts and superscripts.
 */
function convertSubSuperScripts(text: string): string {
  // Translates a whole operand, or reports `undefined` as soon as one
  // character has no glyph. `for…of` walks the string by code point; the
  // tables only hold single ASCII characters as keys.
  const translateOperand = (
    operand: string,
    glyphs: Readonly<Record<string, string>>,
  ): string | undefined => {
    let translated = '';
    for (const ch of operand) {
      const glyph = glyphs[ch];
      if (glyph === undefined) {
        return undefined;
      }
      translated += glyph;
    }
    return translated;
  };

  const afterBraced = text
    .replace(
      /_\{([^{}]+)\}/g,
      (whole, operand: string) =>
        translateOperand(operand, SUBSCRIPT_MAP) ?? whole,
    )
    .replace(
      /\^\{([^{}]+)\}/g,
      (whole, operand: string) =>
        translateOperand(operand, SUPERSCRIPT_MAP) ?? whole,
    );

  // Single-character operands. The base character is captured only so it can
  // be put back in front of the glyph.
  return afterBraced
    .replace(
      /([A-Za-z0-9)\]])_([A-Za-z0-9+\-=()])/g,
      (whole, base: string, ch: string) => {
        const glyph = SUBSCRIPT_MAP[ch];
        return glyph ? base + glyph : whole;
      },
    )
    .replace(
      /([A-Za-z0-9)\]])\^([A-Za-z0-9+\-=()])/g,
      (whole, base: string, ch: string) => {
        const glyph = SUPERSCRIPT_MAP[ch];
        return glyph ? base + glyph : whole;
      },
    );
}
/**
 * Runs every conversion appropriate for the inside of a math region (text
 * that was wrapped in `$...$` or `$$...$$`). Compared with the prose pipeline
 * this adds line-break conversion and sub/superscripts, neither of which is
 * safe on arbitrary prose (they would mangle UNC paths and identifiers like
 * `file_name`).
 */
function applyMathModeConversions(text: string): string {
  // Order matters: each stage consumes the output of the one before it.
  const stages: ReadonlyArray<(s: string) => string> = [
    convertTextFormatting,
    convertFractionsAndRoots,
    convertEscapedSpecials,
    convertLineBreaks,
    convertNamedCommands,
    convertPunctuationCommands,
    convertSubSuperScripts,
  ];
  return stages.reduce((current, stage) => stage(current), text);
}
/**
 * Runs the conversions that are safe on arbitrary prose, i.e. those triggered
 * only by explicit LaTeX tokens such as `\alpha`, `\textbf{...}` or `\to`.
 * Bare `_` and `^` are never interpreted here, so identifiers and snake_case
 * names pass through unchanged.
 */
function applyProseConversions(text: string): string {
  // convertLineBreaks is intentionally absent from this list: outside math
  // delimiters a double backslash is far more likely to be a Windows UNC path
  // (`\\server\share`) or an escaped backslash in code-like prose than a
  // LaTeX line break. Real line breaks live inside `$...$` / `$$...$$` and
  // are handled by applyMathModeConversions. See PR #25802 review.
  const stages: ReadonlyArray<(s: string) => string> = [
    convertTextFormatting,
    convertFractionsAndRoots,
    convertEscapedSpecials,
    convertNamedCommands,
    convertPunctuationCommands,
  ];
  return stages.reduce((current, stage) => stage(current), text);
}
/**
 * Public entry point: converts LaTeX-style markup in `input` to
 * terminal-friendly Unicode in two phases.
 *
 *   1. Math regions (`$...$` / `$$...$$`) are detected on the ORIGINAL input,
 *      before any token has been rewritten, and their content receives the
 *      math-mode conversions (including sub/superscripts).
 *
 *   2. The prose-safe conversions then run over the whole result, picking up
 *      LaTeX tokens (`\alpha`, `\to`, `\textbf{...}`) that the model emitted
 *      outside math delimiters.
 *
 * Input with neither `\` nor `$` is returned immediately, keeping the hot
 * rendering path cheap for ordinary prose.
 */
export function convertLatexToUnicode(input: string): string {
  if (!input) return input;

  // Every conversion is keyed off a backslash or a dollar sign; without
  // either there is nothing to do.
  const mayContainLatex = input.includes('\\') || input.includes('$');
  if (!mayContainLatex) {
    return input;
  }

  const withoutMathDelimiters = stripMathDelimiters(input);
  return applyProseConversions(withoutMathDelimiters);
}
@@ -222,5 +222,52 @@ describe('parsingUtils', () => {
),
);
});
describe('LaTeX conversion (issue #25656)', () => {
  it('converts LaTeX in plain text (no markdown tokens)', () => {
    const rendered = parseMarkdownToANSI('No cycles $\\to$ no deadlock');
    expect(rendered).toBe(primary('No cycles → no deadlock'));
  });

  it('converts LaTeX in the set example from the issue', () => {
    const rendered = parseMarkdownToANSI('Processes $\\{P_0, \\dots, P_n\\}$');
    expect(rendered).toBe(primary('Processes {P₀, …, Pₙ}'));
  });

  it('preserves LaTeX inside inline code', () => {
    // Anything between backticks is rendered verbatim: the LaTeX pass must
    // skip code spans even when they contain a `$...$` pair it would
    // otherwise strip.
    const rendered = parseMarkdownToANSI('use `$\\to$` for an arrow');
    const expected = [
      primary('use '),
      accent('$\\to$'),
      primary(' for an arrow'),
    ].join('');
    expect(rendered).toBe(expected);
  });

  it('converts LaTeX in slices around markdown tokens', () => {
    const rendered = parseMarkdownToANSI(
      '$\\alpha$ is **bold** and $\\beta$ is plain',
    );
    const expected = [
      primary('α is '),
      chalk.bold(primary('bold')),
      primary(' and β is plain'),
    ].join('');
    expect(rendered).toBe(expected);
  });

  it('leaves Windows paths alone', () => {
    const rendered = parseMarkdownToANSI('Path: C:\\Users\\foo');
    expect(rendered).toBe(primary('Path: C:\\Users\\foo'));
  });

  it('leaves currency amounts alone', () => {
    const rendered = parseMarkdownToANSI('It costs $5.99 total');
    expect(rendered).toBe(primary('It costs $5.99 total'));
  });
});
});
});
@@ -12,6 +12,7 @@ import {
} from '../themes/color-utils.js';
import { theme } from '../semantic-colors.js';
import { debugLogger } from '@google/gemini-cli-core';
import { convertLatexToUnicode } from './latexToUnicode.js';
// Constants for Markdown parsing
// Number of characters in the "**" bold delimiter.
const BOLD_MARKER_LENGTH = 2; // For "**"
@@ -72,11 +73,49 @@ const ansiColorize = (str: string, color: string | undefined): string => {
* Converts markdown text into a string with ANSI escape codes.
* This mirrors the parsing logic in InlineMarkdownRenderer.tsx
*/
// Private-Use-Area codepoint (U+E000) used as a placeholder delimiter when
// masking inline code / URL spans from LaTeX conversion. A masked span is
// written as sentinel + decimal index + sentinel. Per the original author's
// note the sentinel is not touched by stripUnsafeCharacters and not matched
// by the markdown tokenizer.
// NOTE(review): neither of those is visible from here — confirm before
// changing the sentinel.
const MASK_SENTINEL = '\uE000';
// Matches one placeholder; capture group 1 is the decimal index of the
// preserved span. The pattern carries the `g` flag so a single replace call
// restores every placeholder.
const MASK_PATTERN = /\uE000(\d+)\uE000/g;
/**
 * Runs LaTeX conversion on `text` while keeping inline code spans and bare
 * URLs verbatim. Without masking, the LaTeX pass would happily rewrite
 * ``$\to$`` inside a backtick code span — violating the "code is verbatim"
 * contract — and could rewrite URL query strings containing `$`.
 *
 * Text containing neither `\` nor `$` is returned untouched without masking:
 * the converter only reacts to those two characters, so the mask/unmask
 * round trip would reproduce the input anyway. Skipping it keeps this cheap
 * for ordinary prose and also avoids mis-restoring a sentinel sequence that
 * happens to be present in such input.
 *
 * @param text Raw inline text as received from the caller.
 * @returns The text with LaTeX outside code spans and URLs converted.
 */
const convertLatexPreservingSpans = (text: string): string => {
  if (!text.includes('\\') && !text.includes('$')) {
    return text;
  }
  const preserved: string[] = [];
  // Match inline code spans (with matched backtick counts) and bare URLs.
  // Order matters: code spans first so they win over a URL inside a span.
  const masked = text.replace(/(`+)([^`\n]+?)\1|https?:\/\/\S+/g, (match) => {
    // `push` returns the new length, so the stored span's index is one less.
    const index = preserved.push(match) - 1;
    return `${MASK_SENTINEL}${index}${MASK_SENTINEL}`;
  });
  const converted = convertLatexToUnicode(masked);
  return converted.replace(
    MASK_PATTERN,
    // Fallback to the literal match if the index is out of range, which can
    // only happen when the PUA sentinel already appeared in user input.
    // Without the fallback, replace would emit "undefined".
    (match, i: string) => preserved[Number(i)] ?? match,
  );
};
export const parseMarkdownToANSI = (
text: string,
rawText: string,
defaultColor?: string,
): string => {
const baseColor = defaultColor ?? theme.text.primary;
// Convert LaTeX-style math/commands to Unicode BEFORE tokenizing markdown,
// so constructs like `$\{P_0, \dots, P_n\}$` are handled as a whole even
// when they contain underscores (which the tokenizer would otherwise treat
// as italic markers). Inline code and URLs are masked during the
// conversion so their contents are preserved verbatim. Unknown `\foo`
// sequences are left alone, so Windows paths and regex escapes survive.
// See issue #25656.
const text = convertLatexPreservingSpans(rawText);
// Early return for plain text without markdown or URLs
if (!/[*_~`<[https?:]/.test(text)) {
return ansiColorize(text, baseColor);