From 77f4be1f3d77217f46876d6c8243e920fadb9326 Mon Sep 17 00:00:00 2001 From: Aryan Singh <146713101+dimssu@users.noreply.github.com> Date: Mon, 4 May 2026 23:35:06 +0530 Subject: [PATCH] fix(cli): render LaTeX-style output as Unicode in the TUI (#25802) Co-authored-by: cynthialong0-0 <82900738+cynthialong0-0@users.noreply.github.com> --- .../cli/src/ui/utils/latexToUnicode.test.ts | 304 +++++++++ packages/cli/src/ui/utils/latexToUnicode.ts | 599 ++++++++++++++++++ .../src/ui/utils/markdownParsingUtils.test.ts | 47 ++ .../cli/src/ui/utils/markdownParsingUtils.ts | 41 +- 4 files changed, 990 insertions(+), 1 deletion(-) create mode 100644 packages/cli/src/ui/utils/latexToUnicode.test.ts create mode 100644 packages/cli/src/ui/utils/latexToUnicode.ts diff --git a/packages/cli/src/ui/utils/latexToUnicode.test.ts b/packages/cli/src/ui/utils/latexToUnicode.test.ts new file mode 100644 index 0000000000..8aab911ce8 --- /dev/null +++ b/packages/cli/src/ui/utils/latexToUnicode.test.ts @@ -0,0 +1,304 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect } from 'vitest'; +import { convertLatexToUnicode } from './latexToUnicode.js'; + +describe('convertLatexToUnicode', () => { + describe('fast path', () => { + it('returns empty string unchanged', () => { + expect(convertLatexToUnicode('')).toBe(''); + }); + + it('returns text without backslash or dollar unchanged', () => { + const input = 'hello world 123'; + expect(convertLatexToUnicode(input)).toBe(input); + }); + + it('short-circuits plain ASCII identically', () => { + const input = 'The quick brown fox jumps over the lazy dog.'; + expect(convertLatexToUnicode(input)).toBe(input); + }); + }); + + describe('issue #25656 examples', () => { + it('converts the set-of-processes example', () => { + const input = 'A set of processes $\\{P_0, P_1, \\dots, P_n\\}$ exists'; + expect(convertLatexToUnicode(input)).toBe( + 'A set of processes {P₀, P₁, …, Pₙ} 
exists', + ); + }); + + it('converts the deadlock arrow example', () => { + const input = 'If the graph contains no cycles $\\to$ No Deadlock.'; + expect(convertLatexToUnicode(input)).toBe( + 'If the graph contains no cycles → No Deadlock.', + ); + }); + }); + + describe('math delimiters', () => { + it('strips $...$ when the content contains LaTeX markers', () => { + expect(convertLatexToUnicode('see $\\alpha$ here')).toBe('see α here'); + }); + + it('strips $...$ around single variables', () => { + expect(convertLatexToUnicode('let $x$ be a value')).toBe( + 'let x be a value', + ); + }); + + it('strips $$...$$ display math', () => { + expect(convertLatexToUnicode('$$\\alpha + \\beta$$')).toBe('α + β'); + }); + + it('leaves currency $5.99 alone', () => { + expect(convertLatexToUnicode('It costs $5.99 total')).toBe( + 'It costs $5.99 total', + ); + }); + + it('leaves two dollar amounts alone', () => { + // The regex matches `$5 to $` as a pair, but the inner content is + // neither mathy nor purely variables, so it is left intact. + expect(convertLatexToUnicode('prices range $5 to $10')).toBe( + 'prices range $5 to $10', + ); + }); + + it('leaves shell-style $ interpolation alone', () => { + expect(convertLatexToUnicode('echo $USER $HOME')).toBe( + 'echo $USER $HOME', + ); + }); + + it('does not strip dollars across newlines', () => { + expect(convertLatexToUnicode('price $5\nfee $3')).toBe( + 'price $5\nfee $3', + ); + }); + }); + + describe('greek letters', () => { + it('converts lowercase greek', () => { + expect(convertLatexToUnicode('\\alpha \\beta \\gamma')).toBe('α β γ'); + }); + + it('converts uppercase greek', () => { + expect(convertLatexToUnicode('\\Omega \\Delta')).toBe('Ω Δ'); + }); + + it('does not mangle a prefix match', () => { + // `\alphabet` is not a known command — must stay intact. 
+ expect(convertLatexToUnicode('\\alphabet')).toBe('\\alphabet'); + }); + }); + + describe('named commands', () => { + it('converts arrows', () => { + expect(convertLatexToUnicode('\\to \\rightarrow \\Rightarrow')).toBe( + '→ → ⇒', + ); + }); + + it('converts relations', () => { + expect(convertLatexToUnicode('\\leq \\geq \\neq \\approx')).toBe( + '≤ ≥ ≠ ≈', + ); + }); + + it('converts set theory', () => { + expect(convertLatexToUnicode('\\in \\notin \\cup \\cap')).toBe('∈ ∉ ∪ ∩'); + }); + + it('converts logic', () => { + expect(convertLatexToUnicode('\\forall x \\exists y')).toBe('∀ x ∃ y'); + }); + + it('converts large operators', () => { + expect(convertLatexToUnicode('\\sum \\prod \\int')).toBe('∑ ∏ ∫'); + }); + + it('converts ellipses', () => { + expect(convertLatexToUnicode('a, b, \\dots, z')).toBe('a, b, …, z'); + }); + + it('converts infty', () => { + expect(convertLatexToUnicode('\\infty')).toBe('∞'); + }); + + it('leaves unknown commands untouched', () => { + expect(convertLatexToUnicode('\\thisIsNotReal')).toBe('\\thisIsNotReal'); + }); + }); + + describe('escaped specials', () => { + it('unescapes braces and underscore', () => { + expect(convertLatexToUnicode('\\{ \\} \\_')).toBe('{ } _'); + }); + + it('unescapes percent, ampersand, hash, dollar, pipe', () => { + expect(convertLatexToUnicode('\\% \\& \\# \\$ \\|')).toBe('% & # $ |'); + }); + + it('unescapes backslash-space as a regular space', () => { + expect(convertLatexToUnicode('word\\ boundary')).toBe('word boundary'); + }); + + it('converts \\\\ to a newline inside math mode', () => { + // `\\` is a LaTeX line break in math/tabular contexts. Only convert + // inside `$...$` — outside math this would mangle Windows UNC paths + // (`\\server\share`) and escaped backslashes in code-like prose. 
+ expect(convertLatexToUnicode('$a\\\\b$')).toBe('a\nb'); + }); + + it('leaves \\\\ alone outside math mode', () => { + expect(convertLatexToUnicode('line1\\\\line2')).toBe('line1\\\\line2'); + }); + }); + + describe('text formatting', () => { + it('wraps textbf in markdown bold', () => { + expect(convertLatexToUnicode('\\textbf{hello}')).toBe('**hello**'); + }); + + it('wraps textit in markdown italic', () => { + expect(convertLatexToUnicode('\\textit{hello}')).toBe('*hello*'); + }); + + it('strips \\text wrapper', () => { + expect(convertLatexToUnicode('\\text{plain}')).toBe('plain'); + }); + + it('strips \\mathrm', () => { + expect(convertLatexToUnicode('\\mathrm{foo}')).toBe('foo'); + }); + + it('handles \\emph as italic', () => { + expect(convertLatexToUnicode('\\emph{emphasized}')).toBe('*emphasized*'); + }); + }); + + describe('fractions and roots', () => { + it('converts \\frac', () => { + expect(convertLatexToUnicode('\\frac{a}{b}')).toBe('(a)/(b)'); + }); + + it('converts \\sqrt', () => { + expect(convertLatexToUnicode('\\sqrt{x}')).toBe('√(x)'); + }); + + it('converts \\sqrt with index', () => { + expect(convertLatexToUnicode('\\sqrt[3]{x}')).toBe('3√(x)'); + }); + + it('converts \\frac combined with greek', () => { + expect(convertLatexToUnicode('\\frac{\\alpha}{\\beta}')).toBe('(α)/(β)'); + }); + }); + + describe('subscripts and superscripts', () => { + // Sub/superscripts are only applied inside math delimiters to avoid + // mangling identifiers like `file_name` and `foo_bar` in regular prose. 
+ it('converts digit subscripts inside math', () => { + expect(convertLatexToUnicode('$x_0 + x_1 + x_2$')).toBe('x₀ + x₁ + x₂'); + }); + + it('converts digit superscripts inside math', () => { + expect(convertLatexToUnicode('$E = mc^2$')).toBe('E = mc²'); + }); + + it('converts letter subscripts where available', () => { + expect(convertLatexToUnicode('$P_n$ and $x_i$')).toBe('Pₙ and xᵢ'); + }); + + it('converts braced digit subscripts', () => { + expect(convertLatexToUnicode('$x_{12}$')).toBe('x₁₂'); + }); + + it('leaves subscripts with no unicode mapping alone', () => { + // `q` has no subscript glyph in Unicode — leave the whole operand + // untouched to avoid inconsistent-looking output. + expect(convertLatexToUnicode('$x_{abq}$')).toBe('x_{abq}'); + }); + + it('does not subscript identifiers in prose', () => { + // Outside math delimiters, `_` is left alone entirely so that + // snake_case identifiers and file paths render correctly. This is a + // deliberate trade-off against model output that emits subscripts + // unwrapped. + expect(convertLatexToUnicode('the file_name variable')).toBe( + 'the file_name variable', + ); + expect(convertLatexToUnicode('_private')).toBe('_private'); + }); + + it('does not superscript when character is unmapped in sup', () => { + // `^Q` — Q has no superscript. The regex only matches when the char is + // in the map; leave as-is even inside math. + expect(convertLatexToUnicode('$x^Q$')).toBe('x^Q'); + }); + + it('leaves bare x_0 alone outside math', () => { + // Deliberate: we cannot tell `P_0` (subscript) from `my_0` (identifier) + // in arbitrary prose, so prefer to preserve identifiers. 
+ expect(convertLatexToUnicode('x_0 is fine')).toBe('x_0 is fine'); + }); + }); + + describe('protection of non-LaTeX content', () => { + it('leaves Windows paths alone', () => { + expect(convertLatexToUnicode('C:\\Users\\foo\\bar')).toBe( + 'C:\\Users\\foo\\bar', + ); + }); + + it('leaves Windows UNC paths alone (no line-break rewrite in prose)', () => { + // `\\server\share\file` must NOT be rewritten to a newline. Line-break + // conversion is restricted to math mode. See PR #25802. + expect(convertLatexToUnicode('\\\\server\\share\\file')).toBe( + '\\\\server\\share\\file', + ); + }); + + it('leaves regex backslash escapes alone', () => { + expect(convertLatexToUnicode('\\d+\\w*')).toBe('\\d+\\w*'); + }); + + it('leaves $ in code-like prose alone', () => { + expect(convertLatexToUnicode('run $(command)$ to see output')).toBe( + 'run $(command)$ to see output', + ); + }); + }); + + describe('combined scenarios', () => { + it('handles complex math in prose', () => { + const input = + 'The complexity is $O(n \\log n)$ for sorting $n$ elements.'; + expect(convertLatexToUnicode(input)).toBe( + 'The complexity is O(n log n) for sorting n elements.', + ); + }); + + it('handles multiple constructs in one line', () => { + const input = 'Let $\\alpha \\in \\mathbb{R}$ and $\\beta \\geq 0$.'; + expect(convertLatexToUnicode(input)).toBe('Let α ∈ R and β ≥ 0.'); + }); + + it('preserves surrounding text exactly', () => { + const input = 'Before $\\to$ after.'; + expect(convertLatexToUnicode(input)).toBe('Before → after.'); + }); + + it('idempotency — running twice yields the same result', () => { + const input = '$\\{P_0, \\dots, P_n\\}$'; + const once = convertLatexToUnicode(input); + const twice = convertLatexToUnicode(once); + expect(twice).toBe(once); + }); + }); +}); diff --git a/packages/cli/src/ui/utils/latexToUnicode.ts b/packages/cli/src/ui/utils/latexToUnicode.ts new file mode 100644 index 0000000000..f021d70f0d --- /dev/null +++ 
// b/packages/cli/src/ui/utils/latexToUnicode.ts @@ -0,0 +1,599 @@
/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * Converts common LaTeX-style syntax in model output into terminal-friendly
 * Unicode (and lightweight markdown where appropriate).
 *
 * Terminals cannot natively render LaTeX, but model responses — especially for
 * math, CS, and algorithms — frequently include constructs like `$\{P_0,
 * \dots, P_n\}$` or `$\to$`. Left as-is, the raw backslash commands show up
 * verbatim and make the output look broken.
 *
 * This function is a conservative, lossy post-processor that handles the
 * common cases and leaves anything it does not recognise untouched, so that
 * legitimate backslash content (e.g. Windows paths, regex examples) is not
 * mangled.
 *
 * See issue #25656.
 */

// Greek letters, lower and upper case, plus the common "var" variants.
// Keys are the command name without its leading backslash (`alpha` for
// `\alpha`); values are the single Unicode character to emit. The table is
// frozen so it cannot be mutated at runtime.
const GREEK_LETTERS: Readonly<Record<string, string>> = Object.freeze({
  alpha: 'α',
  beta: 'β',
  gamma: 'γ',
  delta: 'δ',
  epsilon: 'ε',
  zeta: 'ζ',
  eta: 'η',
  theta: 'θ',
  iota: 'ι',
  kappa: 'κ',
  lambda: 'λ',
  mu: 'μ',
  nu: 'ν',
  xi: 'ξ',
  omicron: 'ο',
  pi: 'π',
  rho: 'ρ',
  sigma: 'σ',
  tau: 'τ',
  upsilon: 'υ',
  phi: 'φ',
  chi: 'χ',
  psi: 'ψ',
  omega: 'ω',
  Alpha: 'Α',
  Beta: 'Β',
  Gamma: 'Γ',
  Delta: 'Δ',
  Epsilon: 'Ε',
  Zeta: 'Ζ',
  Eta: 'Η',
  Theta: 'Θ',
  Iota: 'Ι',
  Kappa: 'Κ',
  Lambda: 'Λ',
  Mu: 'Μ',
  Nu: 'Ν',
  Xi: 'Ξ',
  Omicron: 'Ο',
  Pi: 'Π',
  Rho: 'Ρ',
  Sigma: 'Σ',
  Tau: 'Τ',
  Upsilon: 'Υ',
  Phi: 'Φ',
  Chi: 'Χ',
  Psi: 'Ψ',
  Omega: 'Ω',
  varepsilon: 'ε',
  vartheta: 'ϑ',
  varphi: 'φ',
  varrho: 'ϱ',
  varsigma: 'ς',
  varpi: 'ϖ',
});

// Named LaTeX commands → Unicode. Covers arrows, relations, set theory,
// logic, large operators, and a handful of common decorations. Anything not
// listed here is deliberately left untouched.
// Keys are command names without the leading backslash (`to` for `\to`);
// values are the replacement text. The table is frozen so it cannot be
// mutated at runtime.
const LATEX_COMMANDS: Readonly<Record<string, string>> = Object.freeze({
  // Arrows
  to: '→',
  rightarrow: '→',
  Rightarrow: '⇒',
  leftarrow: '←',
  Leftarrow: '⇐',
  leftrightarrow: '↔',
  Leftrightarrow: '⇔',
  mapsto: '↦',
  longrightarrow: '⟶',
  longleftarrow: '⟵',
  longleftrightarrow: '⟷',
  uparrow: '↑',
  downarrow: '↓',
  Uparrow: '⇑',
  Downarrow: '⇓',
  hookrightarrow: '↪',
  hookleftarrow: '↩',

  // Ellipses
  dots: '…',
  ldots: '…',
  cdots: '⋯',
  vdots: '⋮',
  ddots: '⋱',

  // Arithmetic / comparison
  times: '×',
  cdot: '·',
  div: '÷',
  pm: '±',
  mp: '∓',
  ast: '∗',
  leq: '≤',
  le: '≤',
  geq: '≥',
  ge: '≥',
  neq: '≠',
  ne: '≠',
  ll: '≪',
  gg: '≫',
  approx: '≈',
  equiv: '≡',
  sim: '∼',
  simeq: '≃',
  cong: '≅',
  propto: '∝',

  // Set theory
  in: '∈',
  notin: '∉',
  ni: '∋',
  subset: '⊂',
  supset: '⊃',
  subseteq: '⊆',
  supseteq: '⊇',
  cup: '∪',
  cap: '∩',
  setminus: '∖',
  emptyset: '∅',
  varnothing: '∅',

  // Logic
  forall: '∀',
  exists: '∃',
  nexists: '∄',
  neg: '¬',
  lnot: '¬',
  land: '∧',
  wedge: '∧',
  lor: '∨',
  vee: '∨',
  oplus: '⊕',
  otimes: '⊗',
  implies: '⟹',
  iff: '⟺',

  // Large operators
  sum: '∑',
  prod: '∏',
  coprod: '∐',
  int: '∫',
  iint: '∬',
  iiint: '∭',
  oint: '∮',

  // Calculus
  partial: '∂',
  nabla: '∇',
  infty: '∞',

  // Misc letters / constants
  ell: 'ℓ',
  hbar: 'ℏ',
  Re: 'ℜ',
  Im: 'ℑ',
  aleph: 'ℵ',
  beth: 'ℶ',

  // Brackets / delimiters
  lbrace: '{',
  rbrace: '}',
  lbrack: '[',
  rbrack: ']',
  langle: '⟨',
  rangle: '⟩',
  lceil: '⌈',
  rceil: '⌉',
  lfloor: '⌊',
  rfloor: '⌋',

  // Geometry / misc
  perp: '⊥',
  parallel: '∥',
  angle: '∠',
  triangle: '△',
  square: '□',
  circ: '∘',
  bullet: '•',
  star: '⋆',
  prime: '′',
  dag: '†',
  ddag: '‡',
  therefore: '∴',
  because: '∵',
  top: '⊤',
  bot: '⊥',

  // Operator names (`\log`, `\sin`, …) render in LaTeX as upright text. In a
  // terminal the closest equivalent is the lowercase word itself.
  log: 'log',
  ln: 'ln',
  lg: 'lg',
  exp: 'exp',
  sin: 'sin',
  cos: 'cos',
  tan: 'tan',
  cot: 'cot',
  sec: 'sec',
  csc: 'csc',
  arcsin: 'arcsin',
  arccos: 'arccos',
  arctan: 'arctan',
  sinh: 'sinh',
  cosh: 'cosh',
  tanh: 'tanh',
  max: 'max',
  min: 'min',
  sup: 'sup',
  inf: 'inf',
  lim: 'lim',
  limsup: 'lim sup',
  liminf: 'lim inf',
  arg: 'arg',
  det: 'det',
  dim: 'dim',
  ker: 'ker',
  gcd: 'gcd',
  deg: 'deg',
  hom: 'hom',
  mod: 'mod',
  bmod: 'mod',
  pmod: 'mod',

  // Whitespace commands — render as visible space so layout is roughly right.
  // NOTE(review): whitespace runs appear to have been collapsed in the copy
  // of this file under review; confirm the intended widths of `quad` and
  // `qquad` against the upstream source.
  quad: ' ',
  qquad: ' ',
  // These are all "thin-space" style commands in LaTeX; render as a single
  // space so the surrounding tokens don't jam together.
  // NOTE(review): convertNamedCommands only captures `[A-Za-z]+` names, so
  // these punctuation keys are never looked up through this table; the same
  // mapping is applied by convertPunctuationCommands.
  ',': ' ',
  ';': ' ',
  ':': ' ',
  '!': '',
});

// Unicode subscript mappings (digits, operators, and the common letters that
// have full-height subscript glyphs in Unicode). Keys are single ASCII
// characters; a missing key means "no subscript glyph available".
const SUBSCRIPT_MAP: Readonly<Record<string, string>> = Object.freeze({
  '0': '₀',
  '1': '₁',
  '2': '₂',
  '3': '₃',
  '4': '₄',
  '5': '₅',
  '6': '₆',
  '7': '₇',
  '8': '₈',
  '9': '₉',
  '+': '₊',
  '-': '₋',
  '=': '₌',
  '(': '₍',
  ')': '₎',
  a: 'ₐ',
  e: 'ₑ',
  h: 'ₕ',
  i: 'ᵢ',
  j: 'ⱼ',
  k: 'ₖ',
  l: 'ₗ',
  m: 'ₘ',
  n: 'ₙ',
  o: 'ₒ',
  p: 'ₚ',
  r: 'ᵣ',
  s: 'ₛ',
  t: 'ₜ',
  u: 'ᵤ',
  v: 'ᵥ',
  x: 'ₓ',
});

// Unicode superscript mappings. A superset of subscripts — most letters have
// superscript glyphs.
const SUPERSCRIPT_MAP: Readonly<Record<string, string>> = Object.freeze({
  '0': '⁰',
  '1': '¹',
  '2': '²',
  '3': '³',
  '4': '⁴',
  '5': '⁵',
  '6': '⁶',
  '7': '⁷',
  '8': '⁸',
  '9': '⁹',
  '+': '⁺',
  '-': '⁻',
  '=': '⁼',
  '(': '⁽',
  ')': '⁾',
  a: 'ᵃ',
  b: 'ᵇ',
  c: 'ᶜ',
  d: 'ᵈ',
  e: 'ᵉ',
  f: 'ᶠ',
  g: 'ᵍ',
  h: 'ʰ',
  i: 'ⁱ',
  j: 'ʲ',
  k: 'ᵏ',
  l: 'ˡ',
  m: 'ᵐ',
  n: 'ⁿ',
  o: 'ᵒ',
  p: 'ᵖ',
  r: 'ʳ',
  s: 'ˢ',
  t: 'ᵗ',
  u: 'ᵘ',
  v: 'ᵛ',
  w: 'ʷ',
  x: 'ˣ',
  y: 'ʸ',
  z: 'ᶻ',
});

/**
 * Returns `table[key]` only when `key` is an OWN property of `table`.
 *
 * The lookup tables are plain object literals, so a bare `table[name]` also
 * resolves names inherited from `Object.prototype` (`constructor`,
 * `toString`, `valueOf`, …) and yields a function instead of a string.
 */
function lookupOwn(
  table: Readonly<Record<string, string>>,
  key: string,
): string | undefined {
  return Object.prototype.hasOwnProperty.call(table, key)
    ? table[key]
    : undefined;
}

/**
 * Strips `$...$` and `$$...$$` math delimiters when the inner content looks
 * like math, applying the full set of math-mode conversions (including
 * sub/superscripts) to the inner text. The goal is to handle model output
 * without eating dollar signs that appear in ordinary prose (prices,
 * shell examples, etc.).
 *
 * `$$...$$` display math is always converted. A pair of `$...$` is treated as
 * math when the inner text either:
 *   - contains a backslash, `_`, or `^`, or
 *   - is a single letter, possibly with whitespace padding (e.g. `$x$`,
 *     `$ n $`). A pair such as `$USER $` (from `echo $USER $HOME`) fails both
 *     checks and is left intact.
 *
 * Only UNESCAPED dollars act as delimiters: a `$` preceded by an odd number
 * of backslashes is a literal dollar (`\$`), and inside a math region every
 * backslash is consumed together with the character after it, so `\$` can
 * appear inside math without closing it. Without this rule `\$5 and \$10`
 * was paired up and rendered as `\5 and \10`.
 *
 * A currency expression like `$5.99` (single `$`) never matches the pair
 * regex. `From $5 to $10` matches `$5 to $` as a pair but the inner text is
 * neither mathy nor a single variable, so it is left intact.
 *
 * NOTE(review): a pair such as `$MY_VAR and $HOME` still qualifies as math
 * because the inner text contains `_`, so both dollar signs are dropped.
 * Consider tightening the heuristic for shell-style variables.
 *
 * @param text Raw text that may contain dollar-delimited math.
 * @returns The text with recognised math regions replaced by their converted
 *   content; everything else is returned unchanged.
 */
function stripMathDelimiters(text: string): string {
  // Display math first. The leading lookbehind rejects an opening `$` that is
  // escaped (preceded by an odd-length run of backslashes). The body is
  // either "backslash + any char" or any char other than `$` / backslash, so
  // the two alternatives never overlap and the match cannot backtrack badly.
  let out = text.replace(
    /(?<!(?<!\\)(?:\\\\)*\\)\$\$((?:\\[\s\S]|[^$\\])+)\$\$/g,
    (_, inner: string) => applyMathModeConversions(inner),
  );

  // Inline math: lazy, single-line to avoid eating across paragraphs.
  out = out.replace(
    /(?<!(?<!\\)(?:\\\\)*\\)\$((?:\\[^\n]|[^$\\\n])+?)\$/g,
    (match, inner: string) => {
      const hasLatexMarkers = /[\\_^]/.test(inner);
      const isSingleVariable = /^\s*[A-Za-z]\s*$/.test(inner);
      if (hasLatexMarkers || isSingleVariable) {
        return applyMathModeConversions(inner);
      }
      return match;
    },
  );

  return out;
}

/**
 * Converts `\textbf{..}`, `\textit{..}`, `\emph{..}`, `\text{..}`,
 * `\mathrm{..}`, `\mathbf{..}`, `\mathit{..}`, `\mathsf{..}`, `\mathtt{..}`,
 * `\mathbb{..}`, `\mathcal{..}`, `\mathfrak{..}` and `\operatorname{..}` into
 * markdown-equivalent wrappers or plain text so the regular inline parser
 * picks them up downstream.
 *
 * Only handles a single level of nesting (no inner braces) — this keeps the
 * regex bounded and avoids catastrophic backtracking on adversarial input.
 *
 * @param text Text that may contain the commands above.
 * @returns The text with bold commands wrapped in `**`, italic commands
 *   wrapped in `*`, and the remaining wrappers removed.
 */
function convertTextFormatting(text: string): string {
  let out = text;
  out = out.replace(
    /\\(?:textbf|mathbf)\{([^{}]*)\}/g,
    (_, inner: string) => `**${inner}**`,
  );
  out = out.replace(
    /\\(?:textit|emph|mathit)\{([^{}]*)\}/g,
    (_, inner: string) => `*${inner}*`,
  );
  out = out.replace(
    /\\(?:text|mathrm|mathsf|mathtt|mathbb|mathcal|mathfrak|operatorname)\{([^{}]*)\}/g,
    (_, inner: string) => inner,
  );
  return out;
}

/**
 * Handles `\frac{a}{b}` → `(a)/(b)`, `\sqrt[n]{x}` → `n√(x)` and
 * `\sqrt{x}` → `√(x)`. Only a single level of braces is supported.
 *
 * @param text Text that may contain `\frac` / `\sqrt`.
 * @returns The text with the recognised forms rewritten.
 */
function convertFractionsAndRoots(text: string): string {
  let out = text;
  out = out.replace(
    /\\frac\{([^{}]*)\}\{([^{}]*)\}/g,
    (_, num: string, den: string) => `(${num})/(${den})`,
  );
  // The indexed form must run before the plain form, which would not match
  // `\sqrt[3]{x}` anyway but keeps the two cases visibly separate.
  out = out.replace(
    /\\sqrt\[([^\]]*)\]\{([^{}]*)\}/g,
    (_, index: string, radicand: string) => `${index}√(${radicand})`,
  );
  out = out.replace(
    /\\sqrt\{([^{}]*)\}/g,
    (_, radicand: string) => `√(${radicand})`,
  );
  return out;
}

/**
 * Converts escaped single-character specials (`\{` → `{`, `\_` → `_`, etc.)
 * and `\ ` (backslash + space) → a regular space. Runs before command lookup
 * so `\{` is not misread as a command named `{`.
 *
 * A double backslash `\\` is consumed as one unit and returned unchanged, so
 * its second backslash can never be paired with the following character.
 * Previously `a \\ b` lost its line break (the `\ ` rule ate half of it) and
 * `\\[`, `\\{`, … were similarly split. `\\` itself is handled separately by
 * convertLineBreaks (math mode only).
 *
 * @param text Text that may contain backslash-escaped specials.
 * @returns The text with the escapes removed and every `\\` left intact.
 */
function convertEscapedSpecials(text: string): string {
  // The set is intentionally narrow: only characters that have meaning in
  // LaTeX and also appear unescaped in plain text, plus the space.
  return text.replace(
    /\\\\|\\([{}[\]_%&#$| ])/g,
    // `ch` is undefined when the `\\` alternative matched — keep it verbatim.
    (match, ch: string | undefined) => ch ?? match,
  );
}

/**
 * Converts named commands (alphabetic control sequences) to Unicode. Anything
 * not in the tables is left as-is so unrelated backslash content
 * (e.g. Windows paths) is not disturbed.
 *
 * Lookups are restricted to own properties of the tables: `\constructor` or
 * `\toString` must stay untouched rather than resolving to an inherited
 * `Object.prototype` member.
 *
 * @param text Text that may contain `\name` commands.
 * @returns The text with every known command replaced.
 */
function convertNamedCommands(text: string): string {
  return text.replace(
    /\\([A-Za-z]+)(?![A-Za-z])/g,
    (match, name: string) =>
      lookupOwn(GREEK_LETTERS, name) ??
      lookupOwn(LATEX_COMMANDS, name) ??
      match,
  );
}

/**
 * Converts the short-form punctuation commands `\,`, `\;`, `\:`, `\!` used
 * for spacing in LaTeX. These are handled separately from alphabetic commands
 * because the regex for the latter only matches letters.
 *
 * @param text Text that may contain the spacing commands.
 * @returns The text with `\,` `\;` `\:` replaced by one space and `\!`
 *   removed.
 */
function convertPunctuationCommands(text: string): string {
  // `\,`, `\;`, `\:` all render as a single space; `\!` is a negative space
  // and is stripped.
  return text.replace(/\\([,;:!])/g, (_, ch: string) => {
    switch (ch) {
      case ',':
      case ';':
      case ':':
        return ' ';
      case '!':
        return '';
      default:
        return ch;
    }
  });
}

/**
 * Converts the `\\` line-break command (used inside math environments and
 * tables) to a literal newline. Runs after convertEscapedSpecials, which
 * leaves every `\\` pair intact, and before the command lookups so they never
 * see the second backslash as the start of a command.
 *
 * @param text Math-mode text.
 * @returns The text with each `\\` replaced by `\n`.
 */
function convertLineBreaks(text: string): string {
  return text.replace(/\\\\/g, '\n');
}

/**
 * Converts subscripts and superscripts to Unicode where every character in
 * the operand maps. If any character has no mapping the whole operand is
 * left alone, to avoid "half-converted" output that looks worse than no
 * conversion.
 *
 * @param text Math-mode text containing `_x`, `_{...}`, `^x` or `^{...}`.
 * @returns The text with fully mappable operands replaced by their Unicode
 *   sub/superscript glyphs.
 */
function convertSubSuperScripts(text: string): string {
  // Braced form first: x_{...}, x^{...}. `Array.from` splits the operand by
  // code point; the mapping tables only contain single ASCII keys, so any
  // other character simply fails the lookup and leaves the operand as-is.
  const charsOf = (s: string): string[] => Array.from(s);

  let out = text.replace(/_\{([^{}]+)\}/g, (match, inner: string) => {
    const chars = charsOf(inner);
    if (chars.every((c) => SUBSCRIPT_MAP[c] !== undefined)) {
      return chars.map((c) => SUBSCRIPT_MAP[c]).join('');
    }
    return match;
  });
  out = out.replace(/\^\{([^{}]+)\}/g, (match, inner: string) => {
    const chars = charsOf(inner);
    if (chars.every((c) => SUPERSCRIPT_MAP[c] !== undefined)) {
      return chars.map((c) => SUPERSCRIPT_MAP[c]).join('');
    }
    return match;
  });

  // Single-character form: x_0, x^2. Only convert when the character actually
  // has a mapping; the base character (letter, digit, `)` or `]`) is kept.
  out = out.replace(
    /([A-Za-z0-9)\]])_([A-Za-z0-9+\-=()])/g,
    (match, base: string, c: string) => {
      const sub = SUBSCRIPT_MAP[c];
      return sub ? `${base}${sub}` : match;
    },
  );
  out = out.replace(
    /([A-Za-z0-9)\]])\^([A-Za-z0-9+\-=()])/g,
    (match, base: string, c: string) => {
      const sup = SUPERSCRIPT_MAP[c];
      return sup ? `${base}${sup}` : match;
    },
  );

  return out;
}

/**
 * Applies the full set of conversions that make sense inside a LaTeX math
 * region (i.e. text that was originally wrapped in `$...$`). This includes
 * sub/superscripts, which are NOT safe to apply to arbitrary prose because
 * they would mangle identifiers like `file_name`.
 *
 * @param text Inner text of a math region, without its dollar delimiters.
 * @returns The converted text.
 */
function applyMathModeConversions(text: string): string {
  let out = text;
  out = convertTextFormatting(out);
  out = convertFractionsAndRoots(out);
  out = convertEscapedSpecials(out);
  out = convertLineBreaks(out);
  out = convertNamedCommands(out);
  out = convertPunctuationCommands(out);
  out = convertSubSuperScripts(out);
  return out;
}

/**
 * Applies conversions that are safe to run on arbitrary prose — anything
 * keyed off explicit LaTeX tokens like `\alpha`, `\textbf{...}`, `\to`. Does
 * NOT touch standalone `_` or `^` so identifiers and snake_case names are
 * preserved.
 *
 * @param text Text outside math delimiters (or already-converted math).
 * @returns The converted text.
 */
function applyProseConversions(text: string): string {
  let out = text;
  out = convertTextFormatting(out);
  out = convertFractionsAndRoots(out);
  out = convertEscapedSpecials(out);
  // Deliberately NOT running convertLineBreaks here: outside math delimiters
  // `\\` is far more likely to be a Windows UNC path (`\\server\share`) or an
  // escaped backslash in code-like prose than a LaTeX line break. Legitimate
  // LaTeX line breaks belong inside `$...$` or `$$...$$` and are handled by
  // applyMathModeConversions. See PR #25802 review.
  out = convertNamedCommands(out);
  out = convertPunctuationCommands(out);
  return out;
}

/**
 * Top-level entry point. Two-phase conversion:
 *
 *   1. Strip `$...$` / `$$...$$` math regions, applying math-mode conversions
 *      (including sub/superscripts) to the inner text. The heuristic for
 *      "this dollar pair is math" runs against the ORIGINAL input so that
 *      model-authored LaTeX is recognised before any tokens are rewritten.
 *
 *   2. Run prose-safe conversions over the remaining text, catching
 *      unwrapped LaTeX tokens (`\alpha`, `\to`, `\textbf{...}`) that the
 *      model emitted outside math delimiters.
 *
 * Short-circuits on input that has no LaTeX markers at all (`\` or `$`) so
 * the hot rendering path stays cheap for ordinary prose.
+ */ +export function convertLatexToUnicode(input: string): string { + if (!input) return input; + // Fast path: if there's no backslash and no dollar sign, there's nothing to + // convert. This keeps the hot rendering path inexpensive for ordinary text. + if (input.indexOf('\\') === -1 && input.indexOf('$') === -1) { + return input; + } + + let text = input; + text = stripMathDelimiters(text); + text = applyProseConversions(text); + return text; +} diff --git a/packages/cli/src/ui/utils/markdownParsingUtils.test.ts b/packages/cli/src/ui/utils/markdownParsingUtils.test.ts index c32bda58fa..5728f886dc 100644 --- a/packages/cli/src/ui/utils/markdownParsingUtils.test.ts +++ b/packages/cli/src/ui/utils/markdownParsingUtils.test.ts @@ -222,5 +222,52 @@ describe('parsingUtils', () => { ), ); }); + + describe('LaTeX conversion (issue #25656)', () => { + it('converts LaTeX in plain text (no markdown tokens)', () => { + const input = 'No cycles $\\to$ no deadlock'; + const output = parseMarkdownToANSI(input); + expect(output).toBe(primary('No cycles → no deadlock')); + }); + + it('converts LaTeX in the set example from the issue', () => { + const input = 'Processes $\\{P_0, \\dots, P_n\\}$'; + const output = parseMarkdownToANSI(input); + expect(output).toBe(primary('Processes {P₀, …, Pₙ}')); + }); + + it('preserves LaTeX inside inline code', () => { + // Content between backticks must be rendered verbatim — conversion + // must NOT be applied inside code spans, even when the code contains + // `$...$` that would otherwise be stripped. 
+ const input = 'use `$\\to$` for an arrow'; + const output = parseMarkdownToANSI(input); + expect(output).toBe( + `${primary('use ')}${accent('$\\to$')}${primary(' for an arrow')}`, + ); + }); + + it('converts LaTeX in slices around markdown tokens', () => { + const input = '$\\alpha$ is **bold** and $\\beta$ is plain'; + const output = parseMarkdownToANSI(input); + expect(output).toBe( + `${primary('α is ')}${chalk.bold(primary('bold'))}${primary( + ' and β is plain', + )}`, + ); + }); + + it('leaves Windows paths alone', () => { + const input = 'Path: C:\\Users\\foo'; + const output = parseMarkdownToANSI(input); + expect(output).toBe(primary('Path: C:\\Users\\foo')); + }); + + it('leaves currency amounts alone', () => { + const input = 'It costs $5.99 total'; + const output = parseMarkdownToANSI(input); + expect(output).toBe(primary('It costs $5.99 total')); + }); + }); }); }); diff --git a/packages/cli/src/ui/utils/markdownParsingUtils.ts b/packages/cli/src/ui/utils/markdownParsingUtils.ts index 10f7cb7a40..841809f08c 100644 --- a/packages/cli/src/ui/utils/markdownParsingUtils.ts +++ b/packages/cli/src/ui/utils/markdownParsingUtils.ts @@ -12,6 +12,7 @@ import { } from '../themes/color-utils.js'; import { theme } from '../semantic-colors.js'; import { debugLogger } from '@google/gemini-cli-core'; +import { convertLatexToUnicode } from './latexToUnicode.js'; // Constants for Markdown parsing const BOLD_MARKER_LENGTH = 2; // For "**" @@ -72,11 +73,49 @@ const ansiColorize = (str: string, color: string | undefined): string => { * Converts markdown text into a string with ANSI escape codes. * This mirrors the parsing logic in InlineMarkdownRenderer.tsx */ +// Private-Use-Area codepoint used as a placeholder sentinel when masking +// inline code / URL spans from LaTeX conversion. Not touched by +// stripUnsafeCharacters and not matched by the markdown tokenizer. 
+const MASK_SENTINEL = '\uE000'; +const MASK_PATTERN = /\uE000(\d+)\uE000/g; + +/** + * Runs LaTeX conversion on `text` while keeping inline code spans and bare + * URLs verbatim. Without masking, the LaTeX pass would happily rewrite + * ``$\to$`` inside a backtick code span — violating the "code is verbatim" + * contract — and could rewrite URL query strings containing `$`. + */ +const convertLatexPreservingSpans = (text: string): string => { + const preserved: string[] = []; + // Match inline code spans (with matched backtick counts) and bare URLs. + // Order matters: code spans first so they win over a URL inside a span. + const masked = text.replace(/(`+)([^`\n]+?)\1|https?:\/\/\S+/g, (match) => { + const index = preserved.push(match) - 1; + return `${MASK_SENTINEL}${index}${MASK_SENTINEL}`; + }); + const converted = convertLatexToUnicode(masked); + return converted.replace( + MASK_PATTERN, + // Fallback to the literal match if the index is somehow out of range — + // defensive against the unlikely case where the PUA sentinel appears in + // user input. Without the fallback, replace would emit "undefined". + (match, i: string) => preserved[Number(i)] ?? match, + ); +}; + export const parseMarkdownToANSI = ( - text: string, + rawText: string, defaultColor?: string, ): string => { const baseColor = defaultColor ?? theme.text.primary; + // Convert LaTeX-style math/commands to Unicode BEFORE tokenizing markdown, + // so constructs like `$\{P_0, \dots, P_n\}$` are handled as a whole even + // when they contain underscores (which the tokenizer would otherwise treat + // as italic markers). Inline code and URLs are masked during the + // conversion so their contents are preserved verbatim. Unknown `\foo` + // sequences are left alone, so Windows paths and regex escapes survive. + // See issue #25656. 
+ const text = convertLatexPreservingSpans(rawText); // Early return for plain text without markdown or URLs if (!/[*_~`<[https?:]/.test(text)) { return ansiColorize(text, baseColor);