From 9ceec55257188fd5f1e368aa5e7c4680ec25f021 Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 20 Apr 2026 17:54:13 -0300 Subject: [PATCH 1/3] fix(math): split multi-char m:r text per character to match Word (SD-2632) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Word's OMML2MML.XSL classifies each character in an m:r run individually — digits group into a single (with one optional decimal point between digits), operator characters each become their own , and everything else becomes its own . SuperDoc was emitting the entire run text as one element, so runs like "→∞" or "x+1" rendered as a single , losing operator spacing and semantics. tokenizeMathText implements the per-character classification. convertMathRun returns a single Element for one-atom runs and a DocumentFragment when multiple atoms are emitted, so siblings flow directly into the parent's without an extra wrapper. m:fName is the documented exception — Word keeps multi-letter function names like "sin" or "lim" as one inside the function-name slot. convertFunction routes m:r children through convertMathRunWhole (no splitting), and a new collapseFunctionNameBases pass re-merges the base slot of any structural MathML element (munder/mover/msub/…) that Word nested inside m:fName — without this, "lim" inside m:limLow would incorrectly split to three . Also drops U+221E (∞) from OPERATOR_CHARS — it's a mathematical constant per Word's XSL, not an operator. MathObjectConverter's return type widens from Element|null to Node|null so convertMathRun can return a DocumentFragment. All other converters already return Element, which is assignable to Node — no other changes. Verified against real Word-native fixtures: `→∞` in the limit-tests fixture case 1 now renders as n (matches Word OMML2MML.XSL byte-for-byte), and nested limits keep their function names intact. Ref ECMA-376 §22.1.2.116, Annex L.6.1.13, §22.1.2.58. --- .../src/features/math/converters/function.ts | 57 +++++- .../src/features/math/converters/math-run.ts | 168 ++++++++++++++---- .../src/features/math/omml-to-mathml.test.ts | 110 ++++++++++++ .../painters/dom/src/features/math/types.ts | 2 +- .../tests/importing/math-equations.spec.ts | 31 +++- 5 files changed, 326 insertions(+), 42 deletions(-) diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/function.ts b/packages/layout-engine/painters/dom/src/features/math/converters/function.ts index 145c183af9..b8f8697f01 100644 --- a/packages/layout-engine/painters/dom/src/features/math/converters/function.ts +++ b/packages/layout-engine/painters/dom/src/features/math/converters/function.ts @@ -1,4 +1,5 @@ import type { MathObjectConverter } from '../types.js'; +import { convertMathRunWhole } from './math-run.js'; const MATHML_NS = 'http://www.w3.org/1998/Math/MathML'; const FUNCTION_APPLY_OPERATOR = '\u2061'; @@ -37,6 +38,48 @@ function forceNormalMathVariant(root: ParentNode): void { } } +/** + * Structural MathML elements whose FIRST child is the "function-name base" + * when nested inside m:fName (e.g. m:limLow → , m:limUpp → , + * m:sSub → , etc.). Word's OMML2MML.XSL keeps the base text whole + * (e.g. "lim" as one ) even though it splits regular runs per-character. + */ +const BASE_BEARING_ELEMENTS = new Set(['munder', 'mover', 'munderover', 'msub', 'msup', 'msubsup']); + +/** + * After per-character splitting in convertMathRun, the base of a nested + * limit/script inside m:fName comes out as multiple single-char siblings + * wrapped in an . Word's XSL keeps that base whole — merge the siblings + * back into a single if they all share the same (or no) mathvariant. + */ +function collapseFunctionNameBases(root: ParentNode): void { + for (const child of Array.from(root.children)) { + if (BASE_BEARING_ELEMENTS.has(child.localName)) { + const base = child.children[0]; + if (base) { + collapseMrowToSingleMi(base); + collapseFunctionNameBases(base); + } + } else { + collapseFunctionNameBases(child); + } + } +} + +function collapseMrowToSingleMi(container: Element): void { + const children = Array.from(container.children); + if (children.length < 2) return; + if (!children.every((c) => c.localName === 'mi')) return; + const variant = children[0]!.getAttribute('mathvariant'); + if (!children.every((c) => c.getAttribute('mathvariant') === variant)) return; + + const merged = container.ownerDocument!.createElementNS(MATHML_NS, 'mi'); + merged.textContent = children.map((c) => c.textContent ?? '').join(''); + if (variant) merged.setAttribute('mathvariant', variant); + container.insertBefore(merged, children[0]!); + for (const c of children) c.remove(); +} + /** * Convert m:func (function apply) to MathML. * @@ -59,7 +102,19 @@ export const convertFunction: MathObjectConverter = (node, doc, convertChildren) const wrapper = doc.createElementNS(MATHML_NS, 'mrow'); const functionNameRow = doc.createElementNS(MATHML_NS, 'mrow'); - functionNameRow.appendChild(convertChildren(functionName?.elements ?? [])); + // m:r children of m:fName stay whole (Word's OMML2MML.XSL keeps multi-letter + // function names like "sin" or "lim" as a single ). Non-m:r children — + // like a nested m:limLow — go through the normal recursive path. + for (const child of functionName?.elements ?? []) { + if (child.name === 'm:r') { + const whole = convertMathRunWhole(child, doc); + if (whole) functionNameRow.appendChild(whole); + } else { + const converted = convertChildren([child]); + if (converted.childNodes.length > 0) functionNameRow.appendChild(converted); + } + } + collapseFunctionNameBases(functionNameRow); forceNormalMathVariant(functionNameRow); if (functionNameRow.childNodes.length > 0) { diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts index 79fddd2375..49884e3ca9 100644 --- a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts +++ b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts @@ -46,8 +46,7 @@ const OPERATOR_CHARS = new Set([ '\u220C', // ∈, ∉, ∋, ∌ '\u2211', '\u220F', // ∑, ∏ - '\u221A', - '\u221E', // √, ∞ + '\u221A', // √ (radical sign — prefix operator) '\u2227', '\u2228', '\u2229', @@ -65,16 +64,55 @@ const OPERATOR_CHARS = new Set([ '\u2287', // ⊂, ⊃, ⊆, ⊇ ]); +type MathAtomTag = 'mi' | 'mo' | 'mn'; + +function isDigit(ch: string): boolean { + return ch >= '0' && ch <= '9'; +} + /** - * Classify a text string into MathML element type. - * - All-digit strings → (number) - * - Known operators → (operator) - * - Everything else → (identifier) + * Split a math run's text into MathML atoms, matching Word's OMML2MML.XSL. + * + * Rules (ECMA-376 §22.1.2.116 example + Annex L.6.1.13): + * - Consecutive digits — optionally containing one decimal point between digits — + * group into a single ``. + * - Each recognized operator character becomes its own ``. + * - Every other character becomes its own ``. + * + * Example: `"n+1"` → `[n, +, 1]`. */ -function classifyMathText(text: string): 'mn' | 'mo' | 'mi' { - if (/^\d*\.?\d+$/.test(text)) return 'mn'; - if (text.length === 1 && OPERATOR_CHARS.has(text)) return 'mo'; - return 'mi'; +export function tokenizeMathText(text: string): Array<{ tag: MathAtomTag; content: string }> { + const atoms: Array<{ tag: MathAtomTag; content: string }> = []; + let i = 0; + while (i < text.length) { + const ch = text[i]!; + if (isDigit(ch)) { + let end = i + 1; + let sawDot = false; + while (end < text.length) { + const c = text[end]!; + if (isDigit(c)) { + end++; + continue; + } + if (c === '.' && !sawDot && end + 1 < text.length && isDigit(text[end + 1]!)) { + sawDot = true; + end++; + continue; + } + break; + } + atoms.push({ tag: 'mn', content: text.slice(i, end) }); + i = end; + } else if (OPERATOR_CHARS.has(ch)) { + atoms.push({ tag: 'mo', content: ch }); + i++; + } else { + atoms.push({ tag: 'mi', content: ch }); + i++; + } + } + return atoms; } /** ECMA-376 m:sty → MathML mathvariant (§22.1.2 math run properties). */ @@ -115,47 +153,103 @@ function resolveMathVariant(rPr: OmmlJsonNode | undefined): string | null { return null; } +function extractText(node: OmmlJsonNode): string { + let text = ''; + for (const child of node.elements ?? []) { + if (child.name === 'm:t') { + for (const tc of child.elements ?? []) { + if (tc.type === 'text' && typeof tc.text === 'string') text += tc.text; + } + } + } + return text; +} + /** - * Convert an m:r (math run) element to MathML. + * Convert an m:r (math run) element to MathML atoms. * * m:r contains: * - m:rPr (math run properties: script, style, normal text flag) * - m:t (text content) * - Optionally w:rPr (WordprocessingML run properties for formatting) * - * The text is classified as , , or based on content. + * The run's text is split per-character into `` / `` / `` atoms + * per Word's OMML2MML.XSL. For a single-atom run (common case — a one-letter + * variable, single operator, or an all-digit number) the converter returns a + * single Element. For a multi-atom run (e.g. "→∞", "x+1") it returns a + * DocumentFragment whose children become siblings of the parent mrow. + * + * @spec ECMA-376 §22.1.2.116 (t) — example shows multi-char mixed runs as the + * normal authored shape; §22.1.2.58 (lit) implies operators are classified + * per-character by default. */ export const convertMathRun: MathObjectConverter = (node, doc) => { - const elements = node.elements ?? []; + const text = extractText(node); + if (!text) return null; - // Extract text from m:t children - let text = ''; - for (const child of elements) { - if (child.name === 'm:t') { - const textChildren = child.elements ?? []; - for (const tc of textChildren) { - if (tc.type === 'text' && typeof tc.text === 'string') { - text += tc.text; - } - } - } - } + const rPr = (node.elements ?? []).find((el) => el.name === 'm:rPr'); + const variant = resolveMathVariant(rPr); + const atoms = tokenizeMathText(text); + + const createAtom = (atom: { tag: MathAtomTag; content: string }): Element => { + const el = doc.createElementNS(MATHML_NS, atom.tag); + el.textContent = atom.content; + // Apply m:rPr-derived variant to every atom in the run. Omitted attribute + // means "use the MathML default" (italic for single-char , normal + // for multi-char //). + if (variant) el.setAttribute('mathvariant', variant); + return el; + }; + + if (atoms.length === 1) return createAtom(atoms[0]!); + + const fragment = doc.createDocumentFragment(); + for (const atom of atoms) fragment.appendChild(createAtom(atom)); + return fragment; +}; +/** + * Convert an m:r to a single `` element, preserving the entire run text. + * + * Used by m:func when processing m:fName children — Word's OMML2MML.XSL treats + * multi-letter function names (e.g. "sin", "lim", "max") as one identifier + * rather than splitting per character. See `convertFunction` for the calling + * context. + * + * Returns null if the run has no text. If the run contains non-letter + * characters (digits or operators), falls back to the splitting path so + * composite function names still render correctly. + */ +export function convertMathRunWhole(node: OmmlJsonNode, doc: Document): Node | null { + const text = extractText(node); if (!text) return null; - const rPr = elements.find((el) => el.name === 'm:rPr'); + const rPr = (node.elements ?? []).find((el) => el.name === 'm:rPr'); const variant = resolveMathVariant(rPr); - const tag = classifyMathText(text); - - const el = doc.createElementNS(MATHML_NS, tag); - el.textContent = text; + const atoms = tokenizeMathText(text); - // Apply mathvariant when the spec properties resolve to one. The default - // for single-char is italic and for multi-char // is - // normal — we only set an attribute when m:rPr explicitly specifies it. - if (variant) { - el.setAttribute('mathvariant', variant); + if (atoms.every((a) => a.tag === 'mi')) { + const el = doc.createElementNS(MATHML_NS, 'mi'); + el.textContent = text; + if (variant) el.setAttribute('mathvariant', variant); + return el; } - return el; -}; + // Mixed content inside m:fName is rare (e.g. "log_2"). Fall through to the + // per-atom path so operators and numbers render with correct semantics. + if (atoms.length === 1) { + const atom = atoms[0]!; + const el = doc.createElementNS(MATHML_NS, atom.tag); + el.textContent = atom.content; + if (variant) el.setAttribute('mathvariant', variant); + return el; + } + const fragment = doc.createDocumentFragment(); + for (const atom of atoms) { + const el = doc.createElementNS(MATHML_NS, atom.tag); + el.textContent = atom.content; + if (variant) el.setAttribute('mathvariant', variant); + fragment.appendChild(el); + } + return fragment; +} diff --git a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts index 010d6b9218..138824639e 100644 --- a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts +++ b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts @@ -343,6 +343,116 @@ describe('convertOmmlToMathml', () => { expect(children.some((c) => c.localName === 'mo')).toBe(true); // + expect(children.some((c) => c.localName === 'mn')).toBe(true); // 1 }); + + // ─── SD-2632: per-character split of multi-char m:r text ────────────────── + + it('splits a single m:r containing operator + identifier into + (SD-2632)', () => { + // Fixture case 1 of math-limit-tests.docx has m:r "→∞" as one run inside + // m:limLow's m:lim. Word's OMML2MML.XSL splits it to . + const omml = { + name: 'm:oMath', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: '\u2192\u221E' }] }] }], + }; + const result = convertOmmlToMathml(omml, doc); + const children = Array.from(result!.children); + expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mo:\u2192', 'mi:\u221E']); + }); + + it('splits "x+1=2" per character with digits grouped (SD-2632)', () => { + const omml = { + name: 'm:oMath', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x+1=2' }] }] }], + }; + const result = convertOmmlToMathml(omml, doc); + const children = Array.from(result!.children); + expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mi:x', 'mo:+', 'mn:1', 'mo:=', 'mn:2']); + }); + + it('groups consecutive digits with an interior decimal point into one (SD-2632)', () => { + const omml = { + name: 'm:oMath', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: '123.45+67' }] }] }], + }; + const result = convertOmmlToMathml(omml, doc); + const children = Array.from(result!.children); + expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mn:123.45', 'mo:+', 'mn:67']); + }); + + it('splits m:r content inside m:sub of an m:sSub (SD-2632 F3)', () => { + // Word's built-up "b_(n+1)" has "n+1" as a single m:r inside m:sub. + // The subscript should contain separate n+1. + const omml = { + name: 'm:oMath', + elements: [ + { + name: 'm:sSub', + elements: [ + { + name: 'm:e', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'b' }] }] }], + }, + { + name: 'm:sub', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'n+1' }] }] }], + }, + ], + }, + ], + }; + const result = convertOmmlToMathml(omml, doc); + const subMrow = result!.querySelector('msub > mrow:nth-child(2)'); + expect(subMrow).not.toBeNull(); + const children = Array.from(subMrow!.children); + expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mi:n', 'mo:+', 'mn:1']); + }); + + it('preserves m:rPr mathvariant across every atom of a split run (SD-2632)', () => { + // When m:sty="b" (bold) applies to the whole run, every atom emitted + // from the split inherits it. + const omml = { + name: 'm:oMath', + elements: [ + { + name: 'm:r', + elements: [ + { name: 'm:rPr', elements: [{ name: 'm:sty', attributes: { 'm:val': 'b' } }] }, + { name: 'm:t', elements: [{ type: 'text', text: 'x+1' }] }, + ], + }, + ], + }; + const result = convertOmmlToMathml(omml, doc); + const variants = Array.from(result!.children).map((c) => c.getAttribute('mathvariant')); + expect(variants).toEqual(['bold', 'bold', 'bold']); + }); + + it('keeps multi-letter function names whole inside m:func > m:fName (SD-2632 exception)', () => { + // Word's OMML2MML.XSL keeps "sin" as one when nested in m:fName, + // even though it would otherwise per-char split a bare m:r. Exception is + // applied by convertFunction. + const omml = { + name: 'm:oMath', + elements: [ + { + name: 'm:func', + elements: [ + { + name: 'm:fName', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'sin' }] }] }], + }, + { + name: 'm:e', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x' }] }] }], + }, + ], + }, + ], + }; + const result = convertOmmlToMathml(omml, doc); + const functionName = result!.querySelector('mrow > mrow:first-child > mi'); + expect(functionName!.textContent).toBe('sin'); + expect(functionName!.getAttribute('mathvariant')).toBe('normal'); + }); }); describe('m:bar converter', () => { diff --git a/packages/layout-engine/painters/dom/src/features/math/types.ts b/packages/layout-engine/painters/dom/src/features/math/types.ts index f1e8d90b71..00c31ea60f 100644 --- a/packages/layout-engine/painters/dom/src/features/math/types.ts +++ b/packages/layout-engine/painters/dom/src/features/math/types.ts @@ -43,4 +43,4 @@ export type MathObjectConverter = ( node: OmmlJsonNode, doc: Document, convertChildren: (children: OmmlJsonNode[]) => DocumentFragment, -) => Element | null; +) => Node | null; diff --git a/tests/behavior/tests/importing/math-equations.spec.ts b/tests/behavior/tests/importing/math-equations.spec.ts index 647c9d8342..def4590cdc 100644 --- a/tests/behavior/tests/importing/math-equations.spec.ts +++ b/tests/behavior/tests/importing/math-equations.spec.ts @@ -821,6 +821,28 @@ test.describe('m:limLow / m:limUpp (limit object) rendering', () => { }); expect(leaked).toEqual([]); }); + + test('splits multi-char operator runs in m:lim content (SD-2632)', async ({ superdoc }) => { + await superdoc.loadDocument(LIMIT_DOC); + await superdoc.waitForStable(); + + // Case 1: lim_(n→∞). Word emits the "→∞" as a single m:r. Previously we + // rendered it as one →∞; now per Word's OMML2MML.XSL it splits + // into separate atoms. + const limExpressionAtoms = await superdoc.page.evaluate(() => { + const munders = Array.from(document.querySelectorAll('munder')); + const limMunder = munders.find((m) => m.children[0]?.querySelector('mi')?.textContent === 'lim'); + const limExpr = limMunder?.children[1]; + return Array.from(limExpr?.children ?? []).map((c) => ({ tag: c.localName, text: c.textContent })); + }); + + // At minimum the limit expression contains the identifier "n" and an + // operator "→" — they must be separate atoms, not bundled as one . + const nAtom = limExpressionAtoms.find((a) => a.text === 'n'); + const arrowAtom = limExpressionAtoms.find((a) => a.text === '\u2192'); + expect(nAtom).toEqual({ tag: 'mi', text: 'n' }); + expect(arrowAtom).toEqual({ tag: 'mo', text: '\u2192' }); + }); }); test.describe('m:eqArr (equation array) rendering', () => { @@ -1341,9 +1363,12 @@ test.describe('m:groupChr (group character) rendering', () => { await superdoc.loadDocument(GROUPCHR_DOC); await superdoc.waitForStable(); + // Use `:scope > mo` to target the group character directly — the base + // expression may itself contain atoms (e.g. "a+b" splits to + // a+b per Word's OMML2MML.XSL). const firstMunder = await superdoc.page.evaluate(() => { const munder = document.querySelector('munder'); - const mo = munder?.querySelector('mo'); + const mo = munder?.querySelector(':scope > mo'); return mo ? { text: mo.textContent, stretchy: mo.getAttribute('stretchy') } : null; }); @@ -1359,7 +1384,7 @@ test.describe('m:groupChr (group character) rendering', () => { // Variant 2 — second munder in DOM order. const hiddenChar = await superdoc.page.evaluate(() => { const munders = document.querySelectorAll('munder'); - const mo = munders[1]?.querySelector('mo'); + const mo = munders[1]?.querySelector(':scope > mo'); return mo?.textContent; }); @@ -1372,7 +1397,7 @@ test.describe('m:groupChr (group character) rendering', () => { const chars = await superdoc.page.evaluate(() => { const wrappers = document.querySelectorAll('munder, mover'); - return Array.from(wrappers).map((w) => w.querySelector('mo')?.textContent ?? null); + return Array.from(wrappers).map((w) => w.querySelector(':scope > mo')?.textContent ?? null); }); // Variants 4 (U+23DE), 5 (U+2190), 6 (U+2192). From 799c4efca665006f8a1de942ff6f9c85eed0c3ae Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 20 Apr 2026 18:18:12 -0300 Subject: [PATCH 2/3] fix(math): group letters inside m:fName mixed runs + cover mmultiscripts (SD-2632) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to /review feedback backed by fresh Word OMML2MML.XSL evidence: - `convertMathRunAsFunctionName` (renamed from `convertMathRunWhole` since "whole" no longer fits) now groups consecutive non-digit / non-operator characters into one while still splitting digits and operators. Word's XSL for `log_2` produces `log_2` — not `log…`. - `BASE_BEARING_ELEMENTS` gains `mmultiscripts` — Word emits it when an `m:sPre` sits inside `m:fName`; our base-collapse pass needs to know to merge the first-child run. - CONTRIBUTING.md now documents the widened `Node | null` return type. Tests added: - Direct `tokenizeMathText` edge cases: `.5` / `5.` / `1.2.3` / `2x+1` / consecutive operators / empty / standalone ∞. - m:fName mixed-content: `log_2` stays `log_2`. - Base collapse inside nested `m:sSub` under `m:fName`. - Base collapse inside nested `m:sPre` (mmultiscripts) under `m:fName`. - Behavior test tightened to pin the full 3-atom sequence for `n→∞`. Disputed during review and deferred with evidence: - Opus claim that standalone `m:limLow` with "lim" base regresses to italic: Word XSL itself splits "lim" per-char in that shape (with or without m:sty=p), so our output matches Word. - Codex claim that Arabic-Indic digits should be ``: Word XSL also classifies them as ``, so our behavior matches. - Non-BMP surrogate-pair support: edge case in extreme mathematical alphanumerics; Word XSL itself errored on U+1D465. Separate ticket worth. --- .../dom/src/features/math/CONTRIBUTING.md | 4 +- .../src/features/math/converters/function.ts | 16 +- .../src/features/math/converters/math-run.ts | 91 ++++++--- .../src/features/math/omml-to-mathml.test.ts | 184 ++++++++++++++++++ .../tests/importing/math-equations.spec.ts | 12 +- 5 files changed, 266 insertions(+), 41 deletions(-) diff --git a/packages/layout-engine/painters/dom/src/features/math/CONTRIBUTING.md b/packages/layout-engine/painters/dom/src/features/math/CONTRIBUTING.md index 8c002d976d..3764f6da11 100644 --- a/packages/layout-engine/painters/dom/src/features/math/CONTRIBUTING.md +++ b/packages/layout-engine/painters/dom/src/features/math/CONTRIBUTING.md @@ -36,7 +36,9 @@ type MathObjectConverter = ( doc: Document, // For creating DOM elements convertChildren: (children: OmmlJsonNode[]) => DocumentFragment, // Recursively converts nested OMML content -) => Element | null; +) => Node | null; // Return a single Element for one atom, or a + // DocumentFragment when your converter produces + // multiple sibling elements (see m:r / math-run). ``` `convertChildren` is the important one. Pass it any child elements that contain nested math content (`m:e`, `m:num`, `m:sub`, etc.). It handles everything inside them, including other math objects. diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/function.ts b/packages/layout-engine/painters/dom/src/features/math/converters/function.ts index b8f8697f01..6aa879340b 100644 --- a/packages/layout-engine/painters/dom/src/features/math/converters/function.ts +++ b/packages/layout-engine/painters/dom/src/features/math/converters/function.ts @@ -1,5 +1,5 @@ import type { MathObjectConverter } from '../types.js'; -import { convertMathRunWhole } from './math-run.js'; +import { convertMathRunAsFunctionName } from './math-run.js'; const MATHML_NS = 'http://www.w3.org/1998/Math/MathML'; const FUNCTION_APPLY_OPERATOR = '\u2061'; @@ -44,7 +44,15 @@ function forceNormalMathVariant(root: ParentNode): void { * m:sSub → , etc.). Word's OMML2MML.XSL keeps the base text whole * (e.g. "lim" as one ) even though it splits regular runs per-character. */ -const BASE_BEARING_ELEMENTS = new Set(['munder', 'mover', 'munderover', 'msub', 'msup', 'msubsup']); +const BASE_BEARING_ELEMENTS = new Set([ + 'munder', + 'mover', + 'munderover', + 'msub', + 'msup', + 'msubsup', + 'mmultiscripts', // m:sPre inside m:fName +]); /** * After per-character splitting in convertMathRun, the base of a nested @@ -107,8 +115,8 @@ export const convertFunction: MathObjectConverter = (node, doc, convertChildren) // like a nested m:limLow — go through the normal recursive path. for (const child of functionName?.elements ?? []) { if (child.name === 'm:r') { - const whole = convertMathRunWhole(child, doc); - if (whole) functionNameRow.appendChild(whole); + const atom = convertMathRunAsFunctionName(child, doc); + if (atom) functionNameRow.appendChild(atom); } else { const converted = convertChildren([child]); if (converted.childNodes.length > 0) functionNameRow.appendChild(converted); diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts index 49884e3ca9..4ce5a2a74e 100644 --- a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts +++ b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts @@ -209,47 +209,82 @@ export const convertMathRun: MathObjectConverter = (node, doc) => { }; /** - * Convert an m:r to a single `` element, preserving the entire run text. + * Tokenize a math run's text for the m:fName context: consecutive non-digit, + * non-operator characters stay grouped in one `` (so "log" in "log_2" + * remains a single identifier), while digits still group into `` and + * each operator character is its own ``. * - * Used by m:func when processing m:fName children — Word's OMML2MML.XSL treats - * multi-letter function names (e.g. "sin", "lim", "max") as one identifier - * rather than splitting per character. See `convertFunction` for the calling - * context. + * Matches Word's OMML2MML.XSL run-internal classification for m:fName + * content: `log_2` → `log_2`. + */ +function tokenizeFunctionNameText(text: string): Array<{ tag: MathAtomTag; content: string }> { + const atoms: Array<{ tag: MathAtomTag; content: string }> = []; + let i = 0; + while (i < text.length) { + const ch = text[i]!; + if (isDigit(ch)) { + let end = i + 1; + let sawDot = false; + while (end < text.length) { + const c = text[end]!; + if (isDigit(c)) { + end++; + continue; + } + if (c === '.' && !sawDot && end + 1 < text.length && isDigit(text[end + 1]!)) { + sawDot = true; + end++; + continue; + } + break; + } + atoms.push({ tag: 'mn', content: text.slice(i, end) }); + i = end; + } else if (OPERATOR_CHARS.has(ch)) { + atoms.push({ tag: 'mo', content: ch }); + i++; + } else { + // Group consecutive non-digit, non-operator characters into a single . + let end = i + 1; + while (end < text.length) { + const c = text[end]!; + if (isDigit(c) || OPERATOR_CHARS.has(c)) break; + end++; + } + atoms.push({ tag: 'mi', content: text.slice(i, end) }); + i = end; + } + } + return atoms; +} + +/** + * Convert an m:r inside m:fName (m:func's function-name slot). Word's + * OMML2MML.XSL keeps each letter-sequence whole while still splitting out + * digits and operators — so `sin` stays `sin`, but `log_2` becomes + * `log_2`. * - * Returns null if the run has no text. If the run contains non-letter - * characters (digits or operators), falls back to the splitting path so - * composite function names still render correctly. + * Returns a single Element for single-atom runs or a DocumentFragment when + * the run emits multiple atoms. Returns null for empty text. */ -export function convertMathRunWhole(node: OmmlJsonNode, doc: Document): Node | null { +export function convertMathRunAsFunctionName(node: OmmlJsonNode, doc: Document): Node | null { const text = extractText(node); if (!text) return null; const rPr = (node.elements ?? []).find((el) => el.name === 'm:rPr'); const variant = resolveMathVariant(rPr); - const atoms = tokenizeMathText(text); + const atoms = tokenizeFunctionNameText(text); - if (atoms.every((a) => a.tag === 'mi')) { - const el = doc.createElementNS(MATHML_NS, 'mi'); - el.textContent = text; - if (variant) el.setAttribute('mathvariant', variant); - return el; - } - - // Mixed content inside m:fName is rare (e.g. "log_2"). Fall through to the - // per-atom path so operators and numbers render with correct semantics. - if (atoms.length === 1) { - const atom = atoms[0]!; + const createAtom = (atom: { tag: MathAtomTag; content: string }): Element => { const el = doc.createElementNS(MATHML_NS, atom.tag); el.textContent = atom.content; if (variant) el.setAttribute('mathvariant', variant); return el; - } + }; + + if (atoms.length === 1) return createAtom(atoms[0]!); + const fragment = doc.createDocumentFragment(); - for (const atom of atoms) { - const el = doc.createElementNS(MATHML_NS, atom.tag); - el.textContent = atom.content; - if (variant) el.setAttribute('mathvariant', variant); - fragment.appendChild(el); - } + for (const atom of atoms) fragment.appendChild(createAtom(atom)); return fragment; } diff --git a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts index 138824639e..2d9cf3cd8b 100644 --- a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts +++ b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts @@ -1,6 +1,7 @@ import { describe, it, expect } from 'vitest'; import { JSDOM } from 'jsdom'; import { convertOmmlToMathml, MATHML_NS } from './omml-to-mathml.js'; +import { tokenizeMathText } from './converters/math-run.js'; const dom = new JSDOM(''); const doc = dom.window.document; @@ -344,6 +345,57 @@ describe('convertOmmlToMathml', () => { expect(children.some((c) => c.localName === 'mn')).toBe(true); // 1 }); + // ─── tokenizeMathText direct coverage (SD-2632) ──────────────────────────── + + it('tokenizes leading-decimal content with . as an operator followed by digits', () => { + // ".5" has no leading digit, so the "." is not part of a number. + expect(tokenizeMathText('.5')).toEqual([ + { tag: 'mo', content: '.' }, + { tag: 'mn', content: '5' }, + ]); + }); + + it('tokenizes a trailing decimal point as a separate operator', () => { + // "5." — the digit run ends at "5" because a lookahead digit is required. + expect(tokenizeMathText('5.')).toEqual([ + { tag: 'mn', content: '5' }, + { tag: 'mo', content: '.' }, + ]); + }); + + it('tokenizes "1.2.3" as number, operator, number — only first dot is inline', () => { + expect(tokenizeMathText('1.2.3')).toEqual([ + { tag: 'mn', content: '1.2' }, + { tag: 'mo', content: '.' }, + { tag: 'mn', content: '3' }, + ]); + }); + + it('tokenizes "2x+1" — number-identifier-operator-number', () => { + expect(tokenizeMathText('2x+1')).toEqual([ + { tag: 'mn', content: '2' }, + { tag: 'mi', content: 'x' }, + { tag: 'mo', content: '+' }, + { tag: 'mn', content: '1' }, + ]); + }); + + it('tokenizes consecutive operator characters as separate atoms', () => { + expect(tokenizeMathText('\u2264\u2265')).toEqual([ + { tag: 'mo', content: '\u2264' }, + { tag: 'mo', content: '\u2265' }, + ]); + }); + + it('tokenizes empty text as an empty list', () => { + expect(tokenizeMathText('')).toEqual([]); + }); + + it('tokenizes standalone ∞ as identifier, not operator (SD-2632)', () => { + // U+221E was removed from OPERATOR_CHARS; Word classifies it as . + expect(tokenizeMathText('\u221E')).toEqual([{ tag: 'mi', content: '\u221E' }]); + }); + // ─── SD-2632: per-character split of multi-char m:r text ────────────────── it('splits a single m:r containing operator + identifier into + (SD-2632)', () => { @@ -426,6 +478,138 @@ describe('convertOmmlToMathml', () => { expect(variants).toEqual(['bold', 'bold', 'bold']); }); + it('keeps "log" whole but splits operators and digits for m:fName with mixed content (SD-2632)', () => { + // Word's OMML2MML.XSL for log_2: letters group + // into one , operators and digits still split. + const omml = { + name: 'm:oMath', + elements: [ + { + name: 'm:func', + elements: [ + { + name: 'm:fName', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'log_2' }] }] }], + }, + { + name: 'm:e', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x' }] }] }], + }, + ], + }, + ], + }; + const result = convertOmmlToMathml(omml, doc); + const fnameRow = result!.querySelector('mrow > mrow:first-child'); + const children = Array.from(fnameRow!.children); + expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mi:log', 'mo:_', 'mn:2']); + }); + + it('collapses a multi-char base inside nested m:sSub wrapped by m:fName (SD-2632)', () => { + // Ensures the msub/msup entries of BASE_BEARING_ELEMENTS are actually pinned. + // fi should + // keep "f" as a single inside the subscript wrapper's base slot. + const omml = { + name: 'm:oMath', + elements: [ + { + name: 'm:func', + elements: [ + { + name: 'm:fName', + elements: [ + { + name: 'm:sSub', + elements: [ + { + name: 'm:e', + elements: [ + { + name: 'm:r', + elements: [ + { name: 'm:rPr', elements: [{ name: 'm:sty', attributes: { 'm:val': 'p' } }] }, + { name: 'm:t', elements: [{ type: 'text', text: 'log' }] }, + ], + }, + ], + }, + { + name: 'm:sub', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: '2' }] }] }], + }, + ], + }, + ], + }, + { + name: 'm:e', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x' }] }] }], + }, + ], + }, + ], + }; + const result = convertOmmlToMathml(omml, doc); + const msub = result!.querySelector('msub'); + expect(msub).not.toBeNull(); + const baseMi = msub!.children[0]!.querySelector('mi'); + expect(baseMi!.textContent).toBe('log'); + expect(baseMi!.getAttribute('mathvariant')).toBe('normal'); + }); + + it('collapses multi-char base inside nested m:sPre (mmultiscripts) wrapped by m:fName (SD-2632)', () => { + const omml = { + name: 'm:oMath', + elements: [ + { + name: 'm:func', + elements: [ + { + name: 'm:fName', + elements: [ + { + name: 'm:sPre', + elements: [ + { + name: 'm:sub', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: '2' }] }] }], + }, + { + name: 'm:sup', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'n' }] }] }], + }, + { + name: 'm:e', + elements: [ + { + name: 'm:r', + elements: [ + { name: 'm:rPr', elements: [{ name: 'm:sty', attributes: { 'm:val': 'p' } }] }, + { name: 'm:t', elements: [{ type: 'text', text: 'log' }] }, + ], + }, + ], + }, + ], + }, + ], + }, + { + name: 'm:e', + elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x' }] }] }], + }, + ], + }, + ], + }; + const result = convertOmmlToMathml(omml, doc); + const mms = result!.querySelector('mmultiscripts'); + expect(mms).not.toBeNull(); + const baseMi = mms!.children[0]!.querySelector('mi'); + expect(baseMi!.textContent).toBe('log'); + expect(baseMi!.getAttribute('mathvariant')).toBe('normal'); + }); + it('keeps multi-letter function names whole inside m:func > m:fName (SD-2632 exception)', () => { // Word's OMML2MML.XSL keeps "sin" as one when nested in m:fName, // even though it would otherwise per-char split a bare m:r. Exception is diff --git a/tests/behavior/tests/importing/math-equations.spec.ts b/tests/behavior/tests/importing/math-equations.spec.ts index def4590cdc..650c6385be 100644 --- a/tests/behavior/tests/importing/math-equations.spec.ts +++ b/tests/behavior/tests/importing/math-equations.spec.ts @@ -828,20 +828,16 @@ test.describe('m:limLow / m:limUpp (limit object) rendering', () => { // Case 1: lim_(n→∞). Word emits the "→∞" as a single m:r. Previously we // rendered it as one →∞; now per Word's OMML2MML.XSL it splits - // into separate atoms. + // into separate atoms. Assert the full ordered sequence so a regression + // that drops or misclassifies any atom is caught. const limExpressionAtoms = await superdoc.page.evaluate(() => { const munders = Array.from(document.querySelectorAll('munder')); const limMunder = munders.find((m) => m.children[0]?.querySelector('mi')?.textContent === 'lim'); const limExpr = limMunder?.children[1]; - return Array.from(limExpr?.children ?? []).map((c) => ({ tag: c.localName, text: c.textContent })); + return Array.from(limExpr?.children ?? []).map((c) => `${c.localName}:${c.textContent}`); }); - // At minimum the limit expression contains the identifier "n" and an - // operator "→" — they must be separate atoms, not bundled as one . - const nAtom = limExpressionAtoms.find((a) => a.text === 'n'); - const arrowAtom = limExpressionAtoms.find((a) => a.text === '\u2192'); - expect(nAtom).toEqual({ tag: 'mi', text: 'n' }); - expect(arrowAtom).toEqual({ tag: 'mo', text: '\u2192' }); + expect(limExpressionAtoms).toEqual(['mi:n', 'mo:\u2192', 'mi:\u221E']); }); }); From 35f0bb5221aca7384c230bde03704efba322015b Mon Sep 17 00:00:00 2001 From: Caio Pizzol Date: Mon, 20 Apr 2026 18:52:07 -0300 Subject: [PATCH 3/3] fix(math): iterate tokenizer by code point to preserve surrogate pairs Addresses Codex bot review on PR #2875. Astral-plane mathematical alphanumerics (e.g. U+1D465 mathematical italic x, U+1D7D9 mathematical double-struck 1) are UTF-16 surrogate pairs. Walking text by code unit split them into two half-pair atoms with invalid content. `codePointUnitLength` returns 2 when the current position starts a surrogate pair so tokenizeMathText and tokenizeFunctionNameText step across the full code point. --- .../src/features/math/converters/math-run.ts | 41 +++++++++++++------ .../src/features/math/omml-to-mathml.test.ts | 11 +++++ 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts index 4ce5a2a74e..7f979563f9 100644 --- a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts +++ b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts @@ -70,6 +70,20 @@ function isDigit(ch: string): boolean { return ch >= '0' && ch <= '9'; } +/** + * Length in UTF-16 code units of the code point starting at `text[i]`. + * Handles surrogate pairs so astral-plane characters (e.g. mathematical + * italic U+1D465) don't get split into two bogus atoms. + */ +function codePointUnitLength(text: string, i: number): number { + const hi = text.charCodeAt(i); + if (hi >= 0xd800 && hi <= 0xdbff && i + 1 < text.length) { + const lo = text.charCodeAt(i + 1); + if (lo >= 0xdc00 && lo <= 0xdfff) return 2; + } + return 1; +} + /** * Split a math run's text into MathML atoms, matching Word's OMML2MML.XSL. * @@ -85,8 +99,9 @@ export function tokenizeMathText(text: string): Array<{ tag: MathAtomTag; conten const atoms: Array<{ tag: MathAtomTag; content: string }> = []; let i = 0; while (i < text.length) { - const ch = text[i]!; - if (isDigit(ch)) { + const step = codePointUnitLength(text, i); + const ch = text.slice(i, i + step); + if (step === 1 && isDigit(ch)) { let end = i + 1; let sawDot = false; while (end < text.length) { @@ -104,12 +119,12 @@ export function tokenizeMathText(text: string): Array<{ tag: MathAtomTag; conten } atoms.push({ tag: 'mn', content: text.slice(i, end) }); i = end; - } else if (OPERATOR_CHARS.has(ch)) { + } else if (step === 1 && OPERATOR_CHARS.has(ch)) { atoms.push({ tag: 'mo', content: ch }); i++; } else { atoms.push({ tag: 'mi', content: ch }); - i++; + i += step; } } return atoms; @@ -221,8 +236,9 @@ function tokenizeFunctionNameText(text: string): Array<{ tag: MathAtomTag; conte const atoms: Array<{ tag: MathAtomTag; content: string }> = []; let i = 0; while (i < text.length) { - const ch = text[i]!; - if (isDigit(ch)) { + const step = codePointUnitLength(text, i); + const ch = text.slice(i, i + step); + if (step === 1 && isDigit(ch)) { let end = i + 1; let sawDot = false; while (end < text.length) { @@ -240,16 +256,17 @@ function tokenizeFunctionNameText(text: string): Array<{ tag: MathAtomTag; conte } atoms.push({ tag: 'mn', content: text.slice(i, end) }); i = end; - } else if (OPERATOR_CHARS.has(ch)) { + } else if (step === 1 && OPERATOR_CHARS.has(ch)) { atoms.push({ tag: 'mo', content: ch }); i++; } else { - // Group consecutive non-digit, non-operator characters into a single . - let end = i + 1; + // Group consecutive non-digit, non-operator code points into one . + let end = i + step; while (end < text.length) { - const c = text[end]!; - if (isDigit(c) || OPERATOR_CHARS.has(c)) break; - end++; + const s = codePointUnitLength(text, end); + const c = text.slice(end, end + s); + if (s === 1 && (isDigit(c) || OPERATOR_CHARS.has(c))) break; + end += s; } atoms.push({ tag: 'mi', content: text.slice(i, end) }); i = end; diff --git a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts index 2d9cf3cd8b..32930a01eb 100644 --- a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts +++ b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts @@ -396,6 +396,17 @@ describe('convertOmmlToMathml', () => { expect(tokenizeMathText('\u221E')).toEqual([{ tag: 'mi', content: '\u221E' }]); }); + it('keeps astral-plane characters whole (does not split surrogate pairs)', () => { + // 𝑥 (U+1D465, mathematical italic small x) is a UTF-16 surrogate pair. + // Splitting by code unit would emit two bogus half-pair s. + const text = '\u{1D465}+1'; + expect(tokenizeMathText(text)).toEqual([ + { tag: 'mi', content: '\u{1D465}' }, + { tag: 'mo', content: '+' }, + { tag: 'mn', content: '1' }, + ]); + }); + // ─── SD-2632: per-character split of multi-char m:r text ────────────────── it('splits a single m:r containing operator + identifier into + (SD-2632)', () => {