mirror of
https://github.com/google-gemini/gemini-cli.git
synced 2026-05-13 13:22:35 -07:00
fix(cli): render LaTeX-style output as Unicode in the TUI (#25802)
Co-authored-by: cynthialong0-0 <82900738+cynthialong0-0@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,304 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { convertLatexToUnicode } from './latexToUnicode.js';
|
||||
|
||||
describe('convertLatexToUnicode', () => {
  // One row per test: [test title, input, expected output].
  type ConversionCase = readonly [string, string, string];

  // Registers one `it` per row; each asserts a single conversion result.
  const checkAll = (rows: readonly ConversionCase[]): void => {
    for (const [title, input, expected] of rows) {
      it(title, () => {
        expect(convertLatexToUnicode(input)).toBe(expected);
      });
    }
  };

  // Builds a row for an input that must come back byte-for-byte unchanged.
  const untouched = (title: string, input: string): ConversionCase => [
    title,
    input,
    input,
  ];

  describe('fast path', () => {
    checkAll([
      untouched('returns empty string unchanged', ''),
      untouched(
        'returns text without backslash or dollar unchanged',
        'hello world 123',
      ),
      untouched(
        'short-circuits plain ASCII identically',
        'The quick brown fox jumps over the lazy dog.',
      ),
    ]);
  });

  describe('issue #25656 examples', () => {
    checkAll([
      [
        'converts the set-of-processes example',
        'A set of processes $\\{P_0, P_1, \\dots, P_n\\}$ exists',
        'A set of processes {P₀, P₁, …, Pₙ} exists',
      ],
      [
        'converts the deadlock arrow example',
        'If the graph contains no cycles $\\to$ No Deadlock.',
        'If the graph contains no cycles → No Deadlock.',
      ],
    ]);
  });

  describe('math delimiters', () => {
    checkAll([
      [
        'strips $...$ when the content contains LaTeX markers',
        'see $\\alpha$ here',
        'see α here',
      ],
      [
        'strips $...$ around single variables',
        'let $x$ be a value',
        'let x be a value',
      ],
      ['strips $$...$$ display math', '$$\\alpha + \\beta$$', 'α + β'],
      untouched('leaves currency $5.99 alone', 'It costs $5.99 total'),
      // The pair regex matches `$5 to $`, but that inner text is neither
      // mathy nor a single variable, so the whole string stays intact.
      untouched('leaves two dollar amounts alone', 'prices range $5 to $10'),
      untouched('leaves shell-style $ interpolation alone', 'echo $USER $HOME'),
      untouched('does not strip dollars across newlines', 'price $5\nfee $3'),
    ]);
  });

  describe('greek letters', () => {
    checkAll([
      ['converts lowercase greek', '\\alpha \\beta \\gamma', 'α β γ'],
      ['converts uppercase greek', '\\Omega \\Delta', 'Ω Δ'],
      // `\alphabet` is not a known command and must survive as-is.
      untouched('does not mangle a prefix match', '\\alphabet'),
    ]);
  });

  describe('named commands', () => {
    checkAll([
      ['converts arrows', '\\to \\rightarrow \\Rightarrow', '→ → ⇒'],
      ['converts relations', '\\leq \\geq \\neq \\approx', '≤ ≥ ≠ ≈'],
      ['converts set theory', '\\in \\notin \\cup \\cap', '∈ ∉ ∪ ∩'],
      ['converts logic', '\\forall x \\exists y', '∀ x ∃ y'],
      ['converts large operators', '\\sum \\prod \\int', '∑ ∏ ∫'],
      ['converts ellipses', 'a, b, \\dots, z', 'a, b, …, z'],
      ['converts infty', '\\infty', '∞'],
      untouched('leaves unknown commands untouched', '\\thisIsNotReal'),
    ]);
  });

  describe('escaped specials', () => {
    checkAll([
      ['unescapes braces and underscore', '\\{ \\} \\_', '{ } _'],
      [
        'unescapes percent, ampersand, hash, dollar, pipe',
        '\\% \\& \\# \\$ \\|',
        '% & # $ |',
      ],
      [
        'unescapes backslash-space as a regular space',
        'word\\ boundary',
        'word boundary',
      ],
      // `\\` is a LaTeX line break in math/tabular contexts. It is only
      // converted inside `$...$`; in prose it would mangle Windows UNC paths
      // (`\\server\share`) and escaped backslashes in code-like text.
      ['converts \\\\ to a newline inside math mode', '$a\\\\b$', 'a\nb'],
      untouched('leaves \\\\ alone outside math mode', 'line1\\\\line2'),
    ]);
  });

  describe('text formatting', () => {
    checkAll([
      ['wraps textbf in markdown bold', '\\textbf{hello}', '**hello**'],
      ['wraps textit in markdown italic', '\\textit{hello}', '*hello*'],
      ['strips \\text wrapper', '\\text{plain}', 'plain'],
      ['strips \\mathrm', '\\mathrm{foo}', 'foo'],
      ['handles \\emph as italic', '\\emph{emphasized}', '*emphasized*'],
    ]);
  });

  describe('fractions and roots', () => {
    checkAll([
      ['converts \\frac', '\\frac{a}{b}', '(a)/(b)'],
      ['converts \\sqrt', '\\sqrt{x}', '√(x)'],
      ['converts \\sqrt with index', '\\sqrt[3]{x}', '3√(x)'],
      [
        'converts \\frac combined with greek',
        '\\frac{\\alpha}{\\beta}',
        '(α)/(β)',
      ],
    ]);
  });

  describe('subscripts and superscripts', () => {
    // Sub/superscripts are applied only inside math delimiters so that
    // identifiers such as `file_name` and `foo_bar` in prose are not mangled.
    checkAll([
      [
        'converts digit subscripts inside math',
        '$x_0 + x_1 + x_2$',
        'x₀ + x₁ + x₂',
      ],
      ['converts digit superscripts inside math', '$E = mc^2$', 'E = mc²'],
      [
        'converts letter subscripts where available',
        '$P_n$ and $x_i$',
        'Pₙ and xᵢ',
      ],
      ['converts braced digit subscripts', '$x_{12}$', 'x₁₂'],
      // `q` has no subscript glyph in Unicode, so the whole operand is left
      // untouched rather than half-converted.
      [
        'leaves subscripts with no unicode mapping alone',
        '$x_{abq}$',
        'x_{abq}',
      ],
    ]);

    it('does not subscript identifiers in prose', () => {
      // Outside math delimiters `_` is never touched, so snake_case names and
      // file paths render correctly. This deliberately gives up on model
      // output that emits subscripts without `$...$`.
      for (const prose of ['the file_name variable', '_private']) {
        expect(convertLatexToUnicode(prose)).toBe(prose);
      }
    });

    checkAll([
      // `Q` has no superscript mapping, so `^Q` stays as-is even inside math.
      [
        'does not superscript when character is unmapped in sup',
        '$x^Q$',
        'x^Q',
      ],
      // Deliberate: `P_0` (subscript) and `my_0` (identifier) cannot be told
      // apart in arbitrary prose, so identifiers win.
      untouched('leaves bare x_0 alone outside math', 'x_0 is fine'),
    ]);
  });

  describe('protection of non-LaTeX content', () => {
    checkAll([
      untouched('leaves Windows paths alone', 'C:\\Users\\foo\\bar'),
      // `\\server\share\file` must NOT be rewritten to a newline; line-break
      // conversion is restricted to math mode. See PR #25802.
      untouched(
        'leaves Windows UNC paths alone (no line-break rewrite in prose)',
        '\\\\server\\share\\file',
      ),
      untouched('leaves regex backslash escapes alone', '\\d+\\w*'),
      untouched(
        'leaves $ in code-like prose alone',
        'run $(command)$ to see output',
      ),
    ]);
  });

  describe('combined scenarios', () => {
    checkAll([
      [
        'handles complex math in prose',
        'The complexity is $O(n \\log n)$ for sorting $n$ elements.',
        'The complexity is O(n log n) for sorting n elements.',
      ],
      [
        'handles multiple constructs in one line',
        'Let $\\alpha \\in \\mathbb{R}$ and $\\beta \\geq 0$.',
        'Let α ∈ R and β ≥ 0.',
      ],
      [
        'preserves surrounding text exactly',
        'Before $\\to$ after.',
        'Before → after.',
      ],
    ]);

    it('idempotency — running twice yields the same result', () => {
      const firstPass = convertLatexToUnicode('$\\{P_0, \\dots, P_n\\}$');
      expect(convertLatexToUnicode(firstPass)).toBe(firstPass);
    });
  });
});
|
||||
@@ -0,0 +1,599 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2025 Google LLC
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
/**
|
||||
* Converts common LaTeX-style syntax in model output into terminal-friendly
|
||||
* Unicode (and lightweight markdown where appropriate).
|
||||
*
|
||||
* Terminals cannot natively render LaTeX, but model responses — especially for
|
||||
* math, CS, and algorithms — frequently include constructs like `$\{P_0,
|
||||
* \dots, P_n\}$` or `$\to$`. Left as-is, the raw backslash commands show up
|
||||
* verbatim and make the output look broken.
|
||||
*
|
||||
* This function is a conservative, lossy post-processor that handles the
|
||||
* common cases and leaves anything it does not recognise untouched, so that
|
||||
* legitimate backslash content (e.g. Windows paths, regex examples) is not
|
||||
* mangled.
|
||||
*
|
||||
* See issue #25656.
|
||||
*/
|
||||
|
||||
// Lookup table from LaTeX Greek-letter command names (without the leading
// backslash) to the matching Unicode character. Holds the lowercase alphabet,
// the uppercase alphabet, and the `\var…` variants. Frozen so it cannot be
// mutated at runtime.
const GREEK_LETTERS: Readonly<Record<string, string>> = Object.freeze({
  // Lowercase
  alpha: 'α', beta: 'β', gamma: 'γ', delta: 'δ', epsilon: 'ε', zeta: 'ζ',
  eta: 'η', theta: 'θ', iota: 'ι', kappa: 'κ', lambda: 'λ', mu: 'μ',
  nu: 'ν', xi: 'ξ', omicron: 'ο', pi: 'π', rho: 'ρ', sigma: 'σ',
  tau: 'τ', upsilon: 'υ', phi: 'φ', chi: 'χ', psi: 'ψ', omega: 'ω',

  // Uppercase
  Alpha: 'Α', Beta: 'Β', Gamma: 'Γ', Delta: 'Δ', Epsilon: 'Ε', Zeta: 'Ζ',
  Eta: 'Η', Theta: 'Θ', Iota: 'Ι', Kappa: 'Κ', Lambda: 'Λ', Mu: 'Μ',
  Nu: 'Ν', Xi: 'Ξ', Omicron: 'Ο', Pi: 'Π', Rho: 'Ρ', Sigma: 'Σ',
  Tau: 'Τ', Upsilon: 'Υ', Phi: 'Φ', Chi: 'Χ', Psi: 'Ψ', Omega: 'Ω',

  // "var" variants
  varepsilon: 'ε', vartheta: 'ϑ', varphi: 'φ',
  varrho: 'ϱ', varsigma: 'ς', varpi: 'ϖ',
});
|
||||
|
||||
// Lookup table from LaTeX command names (without the leading backslash) to
// their terminal rendering. Grouped by topic: arrows, ellipses, relations,
// set theory, logic, large operators, calculus, constants, delimiters,
// geometry, operator names, and spacing. A command that is not a key here is
// intentionally not converted.
const LATEX_COMMANDS: Readonly<Record<string, string>> = Object.freeze({
  // Arrows
  to: '→', rightarrow: '→', Rightarrow: '⇒',
  leftarrow: '←', Leftarrow: '⇐',
  leftrightarrow: '↔', Leftrightarrow: '⇔',
  mapsto: '↦',
  longrightarrow: '⟶', longleftarrow: '⟵', longleftrightarrow: '⟷',
  uparrow: '↑', downarrow: '↓', Uparrow: '⇑', Downarrow: '⇓',
  hookrightarrow: '↪', hookleftarrow: '↩',

  // Ellipses
  dots: '…', ldots: '…', cdots: '⋯', vdots: '⋮', ddots: '⋱',

  // Arithmetic / comparison
  times: '×', cdot: '·', div: '÷', pm: '±', mp: '∓', ast: '∗',
  leq: '≤', le: '≤', geq: '≥', ge: '≥', neq: '≠', ne: '≠',
  ll: '≪', gg: '≫',
  approx: '≈', equiv: '≡', sim: '∼', simeq: '≃', cong: '≅', propto: '∝',

  // Set theory
  in: '∈', notin: '∉', ni: '∋',
  subset: '⊂', supset: '⊃', subseteq: '⊆', supseteq: '⊇',
  cup: '∪', cap: '∩', setminus: '∖', emptyset: '∅', varnothing: '∅',

  // Logic
  forall: '∀', exists: '∃', nexists: '∄',
  neg: '¬', lnot: '¬',
  land: '∧', wedge: '∧', lor: '∨', vee: '∨',
  oplus: '⊕', otimes: '⊗',
  implies: '⟹', iff: '⟺',

  // Large operators
  sum: '∑', prod: '∏', coprod: '∐',
  int: '∫', iint: '∬', iiint: '∭', oint: '∮',

  // Calculus
  partial: '∂', nabla: '∇', infty: '∞',

  // Misc letters / constants
  ell: 'ℓ', hbar: 'ℏ', Re: 'ℜ', Im: 'ℑ', aleph: 'ℵ', beth: 'ℶ',

  // Brackets / delimiters
  lbrace: '{', rbrace: '}', lbrack: '[', rbrack: ']',
  langle: '⟨', rangle: '⟩',
  lceil: '⌈', rceil: '⌉', lfloor: '⌊', rfloor: '⌋',

  // Geometry / misc
  perp: '⊥', parallel: '∥', angle: '∠', triangle: '△', square: '□',
  circ: '∘', bullet: '•', star: '⋆', prime: '′',
  dag: '†', ddag: '‡',
  therefore: '∴', because: '∵',
  top: '⊤', bot: '⊥',

  // Operator names (`\log`, `\sin`, …) are typeset upright in LaTeX; in a
  // terminal the plain lowercase word is the closest equivalent.
  log: 'log', ln: 'ln', lg: 'lg', exp: 'exp',
  sin: 'sin', cos: 'cos', tan: 'tan', cot: 'cot', sec: 'sec', csc: 'csc',
  arcsin: 'arcsin', arccos: 'arccos', arctan: 'arctan',
  sinh: 'sinh', cosh: 'cosh', tanh: 'tanh',
  max: 'max', min: 'min', sup: 'sup', inf: 'inf',
  lim: 'lim', limsup: 'lim sup', liminf: 'lim inf',
  arg: 'arg', det: 'det', dim: 'dim', ker: 'ker',
  gcd: 'gcd', deg: 'deg', hom: 'hom',
  mod: 'mod', bmod: 'mod', pmod: 'mod',

  // Whitespace commands — rendered as a visible space so layout is roughly
  // preserved.
  quad: ' ', qquad: ' ',

  // Thin-space style commands: a single space keeps neighbouring tokens from
  // jamming together; `\!` (negative space) maps to nothing. These keys are
  // punctuation, so the letters-only pattern in convertNamedCommands never
  // looks them up.
  ',': ' ', ';': ' ', ':': ' ', '!': '',
});
|
||||
|
||||
// Maps a single ASCII character to its Unicode subscript glyph. Covers the
// ten digits, the operators `+ - = ( )`, and the lowercase letters listed
// below. A character missing from this table is treated as having no
// subscript form.
const SUBSCRIPT_MAP: Readonly<Record<string, string>> = Object.freeze({
  // Digits
  '0': '₀', '1': '₁', '2': '₂', '3': '₃', '4': '₄',
  '5': '₅', '6': '₆', '7': '₇', '8': '₈', '9': '₉',

  // Operators and parentheses
  '+': '₊', '-': '₋', '=': '₌', '(': '₍', ')': '₎',

  // Letters
  a: 'ₐ', e: 'ₑ', h: 'ₕ', i: 'ᵢ', j: 'ⱼ', k: 'ₖ',
  l: 'ₗ', m: 'ₘ', n: 'ₙ', o: 'ₒ', p: 'ₚ', r: 'ᵣ',
  s: 'ₛ', t: 'ₜ', u: 'ᵤ', v: 'ᵥ', x: 'ₓ',
});
|
||||
|
||||
// Maps a single ASCII character to its Unicode superscript glyph. Covers the
// ten digits, the operators `+ - = ( )`, and every lowercase letter except
// `q` — a wider letter set than the subscript table. A character missing from
// this table is treated as having no superscript form.
const SUPERSCRIPT_MAP: Readonly<Record<string, string>> = Object.freeze({
  // Digits
  '0': '⁰', '1': '¹', '2': '²', '3': '³', '4': '⁴',
  '5': '⁵', '6': '⁶', '7': '⁷', '8': '⁸', '9': '⁹',

  // Operators and parentheses
  '+': '⁺', '-': '⁻', '=': '⁼', '(': '⁽', ')': '⁾',

  // Letters
  a: 'ᵃ', b: 'ᵇ', c: 'ᶜ', d: 'ᵈ', e: 'ᵉ', f: 'ᶠ', g: 'ᵍ',
  h: 'ʰ', i: 'ⁱ', j: 'ʲ', k: 'ᵏ', l: 'ˡ', m: 'ᵐ', n: 'ⁿ',
  o: 'ᵒ', p: 'ᵖ', r: 'ʳ', s: 'ˢ', t: 'ᵗ', u: 'ᵘ', v: 'ᵛ',
  w: 'ʷ', x: 'ˣ', y: 'ʸ', z: 'ᶻ',
});
|
||||
|
||||
/**
 * Removes `$$...$$` and `$...$` math wrappers and runs the math-mode
 * conversions (sub/superscripts included) on what was inside them, while
 * trying not to eat dollar signs that belong to ordinary prose such as
 * prices or shell examples.
 *
 * Display math (`$$...$$`) is always unwrapped; its body may span lines but
 * may not contain `$`.
 *
 * Inline math (`$...$`) must sit on one line and is unwrapped only when the
 * body either
 *   - contains a LaTeX marker (a backslash, `_`, or `^`), or
 *   - is exactly one ASCII letter, optionally padded with whitespace
 *     (`$x$`, `$ n $`).
 * Any other pair is returned untouched. So `$5.99` (no closing `$`) never
 * matches, `echo $USER $HOME` stays as-is because `USER ` is neither mathy nor
 * a single letter, and `From $5 to $10` pairs up as `$5 to $` but is rejected
 * by the same test.
 *
 * @param text Raw text that may contain dollar-delimited math.
 * @returns The text with recognised math regions unwrapped and converted.
 */
function stripMathDelimiters(text: string): string {
  // True when the text between a `$` pair should be treated as math.
  const looksLikeMath = (body: string): boolean => {
    if (/\\[A-Za-z]|[\\_^]/.test(body)) {
      return true;
    }
    return /^\s*[A-Za-z]\s*$/.test(body);
  };

  // Display math goes first so its `$$` fences are gone before the inline
  // pattern scans for single-dollar pairs.
  const withoutDisplayMath = text.replace(
    /\$\$([^$]+)\$\$/g,
    (_whole, body: string) => applyMathModeConversions(body),
  );

  // Inline math: lazy and newline-free so a pair never spans paragraphs.
  return withoutDisplayMath.replace(
    /\$([^$\n]+?)\$/g,
    (whole, body: string) =>
      looksLikeMath(body) ? applyMathModeConversions(body) : whole,
  );
}
|
||||
|
||||
/**
 * Rewrites LaTeX text-styling wrappers into markdown or plain text so the
 * downstream inline parser can render them:
 *
 *   - `\textbf{..}`, `\mathbf{..}`            → `**..**`
 *   - `\textit{..}`, `\emph{..}`, `\mathit{..}` → `*..*`
 *   - `\text`, `\mathrm`, `\mathsf`, `\mathtt`, `\mathbb`, `\mathcal`,
 *     `\mathfrak`, `\operatorname`            → the bare argument
 *
 * The argument may not contain braces, so only one nesting level is handled.
 * That keeps each pattern linear and immune to catastrophic backtracking.
 *
 * @param text Text possibly containing styling commands.
 * @returns The text with recognised wrappers replaced.
 */
function convertTextFormatting(text: string): string {
  const boldWrapper = /\\(?:textbf|mathbf)\{([^{}]*)\}/g;
  const italicWrapper = /\\(?:textit|emph|mathit)\{([^{}]*)\}/g;
  const plainWrapper =
    /\\(?:text|mathrm|mathsf|mathtt|mathbb|mathcal|mathfrak|operatorname)\{([^{}]*)\}/g;

  // Order matters only in that each pass sees the previous pass's output.
  return text
    .replace(boldWrapper, '**$1**')
    .replace(italicWrapper, '*$1*')
    .replace(plainWrapper, '$1');
}
|
||||
|
||||
/**
 * Flattens fractions and roots into linear notation:
 *
 *   - `\frac{a}{b}`  → `(a)/(b)`
 *   - `\sqrt[n]{x}`  → `n√(x)`
 *   - `\sqrt{x}`     → `√(x)`
 *
 * Brace arguments may not themselves contain braces, so nested constructs
 * such as `\frac{a}{\sqrt{b}}` are only converted at the innermost level.
 *
 * @param text Text possibly containing `\frac` or `\sqrt`.
 * @returns The text with recognised fractions and roots rewritten.
 */
function convertFractionsAndRoots(text: string): string {
  const fraction = /\\frac\{([^{}]*)\}\{([^{}]*)\}/g;
  const indexedRoot = /\\sqrt\[([^\]]*)\]\{([^{}]*)\}/g;
  const plainRoot = /\\sqrt\{([^{}]*)\}/g;

  // The indexed root is handled before the plain one, mirroring the order in
  // which the three rewrites have always been applied.
  return text
    .replace(fraction, '($1)/($2)')
    .replace(indexedRoot, '$1√($2)')
    .replace(plainRoot, '√($1)');
}
|
||||
|
||||
/**
 * Unescapes single-character LaTeX specials: `\{ \} \[ \] \_ \% \& \# \$ \|`
 * become the bare character, and `\ ` (backslash + space) becomes a plain
 * space so adjacent words do not collide.
 *
 * A double backslash (`\\`, the LaTeX line-break command) is matched first
 * and returned unchanged. Without that guard the SECOND backslash of
 * `a \\ b` or `\\{` would be read as the start of an escape, leaving a stray
 * single backslash behind and hiding the line break from any later pass that
 * looks for `\\`.
 *
 * Intended to run before named-command lookup so `\{` is not taken for a
 * command named `{`.
 *
 * @param text Text possibly containing escaped specials.
 * @returns The text with escapes removed and `\\` pairs left intact.
 */
function convertEscapedSpecials(text: string): string {
  // Alternation order matters: `\\` must win over `\<special>` so backslash
  // pairs are consumed as a unit. `ch` is undefined for the `\\` alternative,
  // in which case the original two characters are emitted untouched.
  return text.replace(
    /\\\\|\\([{}[\]_%&#$| ])/g,
    (match, ch: string | undefined) => ch ?? match,
  );
}
|
||||
|
||||
/**
 * Replaces alphabetic control sequences (`\alpha`, `\to`, `\leq`, …) with
 * their Unicode rendering. GREEK_LETTERS is consulted first, then
 * LATEX_COMMANDS; a name found in neither table is emitted unchanged so
 * unrelated backslash content (Windows paths, regex escapes) survives.
 *
 * Lookups are restricted to the tables' OWN properties. A plain `table[name]`
 * also sees members inherited from `Object.prototype`, so input such as
 * `\constructor` or `\toString` would otherwise be replaced by the
 * stringified native function.
 *
 * @param text Text possibly containing `\name` commands.
 * @returns The text with known commands substituted.
 */
function convertNamedCommands(text: string): string {
  // Own-property lookup; returns undefined for missing or inherited names.
  const lookup = (
    table: Readonly<Record<string, string>>,
    name: string,
  ): string | undefined =>
    Object.prototype.hasOwnProperty.call(table, name) ? table[name] : undefined;

  // The pattern takes the full run of letters after the backslash, so
  // `\alphabet` is looked up as "alphabet" and never as "alpha" + "bet".
  return text.replace(
    /\\([A-Za-z]+)(?![A-Za-z])/g,
    (match, name: string) =>
      lookup(GREEK_LETTERS, name) ?? lookup(LATEX_COMMANDS, name) ?? match,
  );
}
|
||||
|
||||
/**
 * Handles LaTeX's punctuation-named spacing commands, which the
 * letters-only command pattern cannot match:
 *
 *   - `\,`, `\;`, `\:` → a single space
 *   - `\!` (negative space) → removed
 *
 * @param text Text possibly containing spacing commands.
 * @returns The text with those commands replaced.
 */
function convertPunctuationCommands(text: string): string {
  // The character class admits only `, ; : !`, so anything that is not `!`
  // is one of the three space-producing commands.
  return text.replace(/\\([,;:!])/g, (_whole, mark: string) =>
    mark === '!' ? '' : ' ',
  );
}
|
||||
|
||||
/**
 * Turns every `\\` (the LaTeX line-break command used in math environments
 * and tables) into a literal newline. Pairs are consumed left to right
 * without overlap, so three backslashes yield a newline followed by one
 * backslash.
 *
 * @param text Text possibly containing `\\` line breaks.
 * @returns The text with each backslash pair replaced by `\n`.
 */
function convertLineBreaks(text: string): string {
  const pieces = text.split('\\\\');
  return pieces.join('\n');
}
|
||||
|
||||
/**
 * Converts `_x` / `^x` and `_{...}` / `^{...}` operands to Unicode subscript
 * and superscript glyphs.
 *
 * An operand is converted only when EVERY character in it has an entry in
 * SUBSCRIPT_MAP / SUPERSCRIPT_MAP; otherwise the whole operand is left as
 * written, because half-converted output looks worse than none.
 *
 * The single-character form needs a base character immediately before the
 * `_` / `^`, so a leading `_private` or a free-standing ` ^2` is not touched.
 * Accepted bases are any Unicode letter or number, `)` and `]`, and the
 * sub/superscript sign glyphs. Accepting non-ASCII bases is what lets
 * chained and post-substitution forms work: `x_1^2` → `x₁²` (the base of `^`
 * is the freshly produced `₁`) and `α^2` → `α²` (the base is a Greek letter
 * that an earlier pass substituted for `\alpha`).
 *
 * @param text Math-mode text.
 * @returns The text with mappable sub/superscripts rendered as Unicode.
 */
function convertSubSuperScripts(text: string): string {
  // Translates a whole operand through `table`, or returns undefined when any
  // character is unmapped. Iterates by code point.
  const translate = (
    operand: string,
    table: Readonly<Record<string, string>>,
  ): string | undefined => {
    let glyphs = '';
    for (const ch of operand) {
      const glyph = table[ch];
      if (glyph === undefined) {
        return undefined;
      }
      glyphs += glyph;
    }
    return glyphs;
  };

  // Braced form first: x_{...}, x^{...}.
  let out = text.replace(
    /_\{([^{}]+)\}/g,
    (match, inner: string) => translate(inner, SUBSCRIPT_MAP) ?? match,
  );
  out = out.replace(
    /\^\{([^{}]+)\}/g,
    (match, inner: string) => translate(inner, SUPERSCRIPT_MAP) ?? match,
  );

  // Single-character form: x_0, x^2. Subscripts run before superscripts, so
  // the superscript pass may see a subscript glyph as its base.
  out = out.replace(
    /([\p{L}\p{N})\]\u207A-\u207E\u208A-\u208E])_([A-Za-z0-9+\-=()])/gu,
    (match, base: string, c: string) => {
      const sub = SUBSCRIPT_MAP[c];
      return sub ? `${base}${sub}` : match;
    },
  );
  out = out.replace(
    /([\p{L}\p{N})\]\u207A-\u207E\u208A-\u208E])\^([A-Za-z0-9+\-=()])/gu,
    (match, base: string, c: string) => {
      const sup = SUPERSCRIPT_MAP[c];
      return sup ? `${base}${sup}` : match;
    },
  );

  return out;
}
|
||||
|
||||
/**
 * Runs every conversion that is appropriate for text taken from inside a
 * `$...$` / `$$...$$` region. Compared with the prose pipeline this also
 * converts `\\` line breaks and sub/superscripts, both of which would damage
 * ordinary prose (UNC paths, identifiers like `file_name`).
 *
 * @param text The inner text of a math region, delimiters already removed.
 * @returns The converted text.
 */
function applyMathModeConversions(text: string): string {
  // Applied strictly in this order; each pass receives the previous output.
  const passes: ReadonlyArray<(s: string) => string> = [
    convertTextFormatting,
    convertFractionsAndRoots,
    convertEscapedSpecials,
    convertLineBreaks,
    convertNamedCommands,
    convertPunctuationCommands,
    convertSubSuperScripts,
  ];
  return passes.reduce((current, pass) => pass(current), text);
}
|
||||
|
||||
/**
 * Runs the conversions that are safe on arbitrary prose, i.e. the ones keyed
 * off explicit LaTeX tokens such as `\alpha`, `\textbf{...}` or `\to`.
 * Stand-alone `_` and `^` are never touched, so identifiers and snake_case
 * names survive.
 *
 * `convertLineBreaks` is deliberately absent: outside math delimiters a `\\`
 * is far more likely to be a Windows UNC path (`\\server\share`) or an
 * escaped backslash in code-like text than a LaTeX line break. Genuine line
 * breaks live inside `$...$` / `$$...$$` and are handled by
 * applyMathModeConversions. See PR #25802 review.
 *
 * @param text Text outside (or after removal of) math delimiters.
 * @returns The converted text.
 */
function applyProseConversions(text: string): string {
  // Applied strictly in this order; each pass receives the previous output.
  const passes: ReadonlyArray<(s: string) => string> = [
    convertTextFormatting,
    convertFractionsAndRoots,
    convertEscapedSpecials,
    convertNamedCommands,
    convertPunctuationCommands,
  ];
  return passes.reduce((current, pass) => pass(current), text);
}
|
||||
|
||||
/**
 * Public entry point: converts LaTeX-style markup in `input` to
 * terminal-friendly Unicode in two phases.
 *
 *   1. Math regions (`$...$`, `$$...$$`) are unwrapped and their contents run
 *      through the math-mode conversions, sub/superscripts included. The
 *      "is this dollar pair math?" heuristic therefore sees the ORIGINAL
 *      text, before any token has been rewritten.
 *
 *   2. The prose-safe conversions then run over the whole result, picking up
 *      LaTeX tokens (`\alpha`, `\to`, `\textbf{...}`) that were emitted
 *      outside math delimiters.
 *
 * Input that is empty, or that contains neither a backslash nor a dollar
 * sign, is returned as-is without running any regex, keeping the hot
 * rendering path cheap for ordinary text.
 *
 * @param input Raw model output.
 * @returns The input with recognised LaTeX constructs converted.
 */
export function convertLatexToUnicode(input: string): string {
  if (!input) {
    return input;
  }

  // Fast path: without `\` or `$` there is nothing any pass could match.
  const mayContainLatex = input.includes('\\') || input.includes('$');
  if (!mayContainLatex) {
    return input;
  }

  return applyProseConversions(stripMathDelimiters(input));
}
|
||||
@@ -222,5 +222,52 @@ describe('parsingUtils', () => {
|
||||
),
|
||||
);
|
||||
});
|
||||
|
||||
describe('LaTeX conversion (issue #25656)', () => {
  it('converts LaTeX in plain text (no markdown tokens)', () => {
    expect(parseMarkdownToANSI('No cycles $\\to$ no deadlock')).toBe(
      primary('No cycles → no deadlock'),
    );
  });

  it('converts LaTeX in the set example from the issue', () => {
    expect(parseMarkdownToANSI('Processes $\\{P_0, \\dots, P_n\\}$')).toBe(
      primary('Processes {P₀, …, Pₙ}'),
    );
  });

  it('preserves LaTeX inside inline code', () => {
    // Text between backticks is rendered verbatim: the conversion must not
    // reach into code spans, even when they hold a `$...$` pair that would
    // otherwise be unwrapped.
    const expected = [
      primary('use '),
      accent('$\\to$'),
      primary(' for an arrow'),
    ].join('');
    expect(parseMarkdownToANSI('use `$\\to$` for an arrow')).toBe(expected);
  });

  it('converts LaTeX in slices around markdown tokens', () => {
    const expected = [
      primary('α is '),
      chalk.bold(primary('bold')),
      primary(' and β is plain'),
    ].join('');
    expect(
      parseMarkdownToANSI('$\\alpha$ is **bold** and $\\beta$ is plain'),
    ).toBe(expected);
  });

  it('leaves Windows paths alone', () => {
    expect(parseMarkdownToANSI('Path: C:\\Users\\foo')).toBe(
      primary('Path: C:\\Users\\foo'),
    );
  });

  it('leaves currency amounts alone', () => {
    expect(parseMarkdownToANSI('It costs $5.99 total')).toBe(
      primary('It costs $5.99 total'),
    );
  });
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
} from '../themes/color-utils.js';
|
||||
import { theme } from '../semantic-colors.js';
|
||||
import { debugLogger } from '@google/gemini-cli-core';
|
||||
import { convertLatexToUnicode } from './latexToUnicode.js';
|
||||
|
||||
// Constants for Markdown parsing
|
||||
const BOLD_MARKER_LENGTH = 2; // For "**"
|
||||
@@ -72,11 +73,49 @@ const ansiColorize = (str: string, color: string | undefined): string => {
|
||||
* Converts markdown text into a string with ANSI escape codes.
|
||||
* This mirrors the parsing logic in InlineMarkdownRenderer.tsx
|
||||
*/
|
||||
// Private-Use-Area code point used to fence placeholder indices while inline
// code / URL spans are masked from LaTeX conversion. Per the original note it
// is not touched by stripUnsafeCharacters and not matched by the markdown
// tokenizer (neither is visible here — verify if those change).
const MASK_SENTINEL = '\uE000';
// A placeholder is `<sentinel><decimal index><sentinel>`.
const MASK_PATTERN = /\uE000(\d+)\uE000/g;

/**
 * Runs LaTeX conversion on `text` while keeping inline code spans and bare
 * URLs verbatim. Without masking, the LaTeX pass would happily rewrite
 * ``$\to$`` inside a backtick code span — violating the "code is verbatim"
 * contract — and could rewrite URL query strings containing `$`.
 *
 * Each protected span is swapped for a placeholder before conversion and
 * swapped back afterwards. A sentinel character that is ALREADY present in
 * the input is masked as well; otherwise user text of the form
 * `\uE000<n>\uE000` would be mistaken for a placeholder on the way back and
 * replaced by an unrelated preserved span.
 *
 * @param text Raw markdown text.
 * @returns The text with LaTeX converted everywhere except inside inline
 *   code spans and URLs.
 */
const convertLatexPreservingSpans = (text: string): string => {
  const preserved: string[] = [];
  // Match inline code spans (with matched backtick counts), bare URLs, and
  // stray sentinel characters. Order matters: code spans first so they win
  // over a URL inside a span; the lone sentinel comes last so one that sits
  // inside a span or URL is preserved as part of it.
  const masked = text.replace(
    /(`+)([^`\n]+?)\1|https?:\/\/\S+|\uE000/g,
    (match) => {
      const index = preserved.push(match) - 1;
      return `${MASK_SENTINEL}${index}${MASK_SENTINEL}`;
    },
  );
  const converted = convertLatexToUnicode(masked);
  return converted.replace(
    MASK_PATTERN,
    // Fall back to the literal match if the index is out of range, so that
    // replace never emits "undefined".
    (match, i: string) => preserved[Number(i)] ?? match,
  );
};
|
||||
|
||||
export const parseMarkdownToANSI = (
|
||||
text: string,
|
||||
rawText: string,
|
||||
defaultColor?: string,
|
||||
): string => {
|
||||
const baseColor = defaultColor ?? theme.text.primary;
|
||||
// Convert LaTeX-style math/commands to Unicode BEFORE tokenizing markdown,
|
||||
// so constructs like `$\{P_0, \dots, P_n\}$` are handled as a whole even
|
||||
// when they contain underscores (which the tokenizer would otherwise treat
|
||||
// as italic markers). Inline code and URLs are masked during the
|
||||
// conversion so their contents are preserved verbatim. Unknown `\foo`
|
||||
// sequences are left alone, so Windows paths and regex escapes survive.
|
||||
// See issue #25656.
|
||||
const text = convertLatexPreservingSpans(rawText);
|
||||
// Early return for plain text without markdown or URLs
|
||||
if (!/[*_~`<[https?:]/.test(text)) {
|
||||
return ansiColorize(text, baseColor);
|
||||
|
||||
Reference in New Issue
Block a user