From 9ceec55257188fd5f1e368aa5e7c4680ec25f021 Mon Sep 17 00:00:00 2001
From: Caio Pizzol <caio@harbourshare.com>
Date: Mon, 20 Apr 2026 17:54:13 -0300
Subject: [PATCH 1/3] fix(math): split multi-char m:r text per character to
 match Word (SD-2632)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Word's OMML2MML.XSL classifies each character in an m:r run individually —
digits group into a single <mn> (with one optional decimal point between
digits), operator characters each become their own <mo>, and everything
else becomes its own <mi>. SuperDoc was emitting the entire run text as
one element, so runs like "→∞" or "x+1" rendered as a single <mi>, losing
operator spacing and semantics.

tokenizeMathText implements the per-character classification. convertMathRun
returns a single Element for one-atom runs and a DocumentFragment when
multiple atoms are emitted, so siblings flow directly into the parent's
<mrow> without an extra wrapper.

m:fName is the documented exception — Word keeps multi-letter function
names like "sin" or "lim" as one <mi> inside the function-name slot.
convertFunction routes m:r children through convertMathRunWhole (no
splitting), and a new collapseFunctionNameBases pass re-merges the base
slot of any structural MathML element (munder/mover/msub/…) that Word
nested inside m:fName — without this, "lim" inside m:limLow would
incorrectly split to three <mi>.

Also drops U+221E (∞) from OPERATOR_CHARS — it's a mathematical constant
per Word's XSL, not an operator.

MathObjectConverter's return type widens from Element|null to Node|null
so convertMathRun can return a DocumentFragment. All other converters
already return Element, which is assignable to Node — no other changes.

Verified against real Word-native fixtures: `→∞` in the limit-tests
fixture case 1 now renders as <mi>n</mi><mo>→</mo><mi>∞</mi> (matches
Word OMML2MML.XSL byte-for-byte), and nested limits keep their function
names intact.

Ref ECMA-376 §22.1.2.116, Annex L.6.1.13, §22.1.2.58.
---
 .../src/features/math/converters/function.ts  |  57 +++++-
 .../src/features/math/converters/math-run.ts  | 168 ++++++++++++++----
 .../src/features/math/omml-to-mathml.test.ts  | 110 ++++++++++++
 .../painters/dom/src/features/math/types.ts   |   2 +-
 .../tests/importing/math-equations.spec.ts    |  31 +++-
 5 files changed, 326 insertions(+), 42 deletions(-)
diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/function.ts b/packages/layout-engine/painters/dom/src/features/math/converters/function.ts
index 145c183af9..b8f8697f01 100644
--- a/packages/layout-engine/painters/dom/src/features/math/converters/function.ts
+++ b/packages/layout-engine/painters/dom/src/features/math/converters/function.ts
@@ -1,4 +1,5 @@
 import type { MathObjectConverter } from '../types.js';
+import { convertMathRunWhole } from './math-run.js';
 
 const MATHML_NS = 'http://www.w3.org/1998/Math/MathML';
 const FUNCTION_APPLY_OPERATOR = '\u2061';
@@ -37,6 +38,48 @@ function forceNormalMathVariant(root: ParentNode): void {
   }
 }
 
+/**
+ * Structural MathML elements whose FIRST child is the "function-name base"
+ * when nested inside m:fName (e.g. m:limLow → <munder>, m:limUpp → <mover>,
+ * m:sSub → <msub>, etc.). Word's OMML2MML.XSL keeps the base text whole
+ * (e.g. "lim" as one <mi>) even though it splits regular runs per-character.
+ */
+const BASE_BEARING_ELEMENTS = new Set(['munder', 'mover', 'munderover', 'msub', 'msup', 'msubsup']);
+
+/**
+ * After per-character splitting in convertMathRun, the base of a nested
+ * limit/script inside m:fName comes out as multiple single-char <mi> siblings
+ * wrapped in an <mrow>. Word's XSL keeps that base whole — merge the siblings
+ * back into a single <mi> if they all share the same (or no) mathvariant.
+ */
+function collapseFunctionNameBases(root: ParentNode): void {
+  for (const child of Array.from(root.children)) {
+    if (BASE_BEARING_ELEMENTS.has(child.localName)) {
+      const base = child.children[0];
+      if (base) {
+        collapseMrowToSingleMi(base);
+        collapseFunctionNameBases(base);
+      }
+    } else {
+      collapseFunctionNameBases(child);
+    }
+  }
+}
+
+function collapseMrowToSingleMi(container: Element): void {
+  const children = Array.from(container.children);
+  if (children.length < 2) return;
+  if (!children.every((c) => c.localName === 'mi')) return;
+  const variant = children[0]!.getAttribute('mathvariant');
+  if (!children.every((c) => c.getAttribute('mathvariant') === variant)) return;
+
+  const merged = container.ownerDocument!.createElementNS(MATHML_NS, 'mi');
+  merged.textContent = children.map((c) => c.textContent ?? '').join('');
+  if (variant) merged.setAttribute('mathvariant', variant);
+  container.insertBefore(merged, children[0]!);
+  for (const c of children) c.remove();
+}
+
 /**
  * Convert m:func (function apply) to MathML.
  *
@@ -59,7 +102,19 @@ export const convertFunction: MathObjectConverter = (node, doc, convertChildren)
   const wrapper = doc.createElementNS(MATHML_NS, 'mrow');
 
   const functionNameRow = doc.createElementNS(MATHML_NS, 'mrow');
-  functionNameRow.appendChild(convertChildren(functionName?.elements ?? []));
+  // m:r children of m:fName stay whole (Word's OMML2MML.XSL keeps multi-letter
+  // function names like "sin" or "lim" as a single <mi>). Non-m:r children —
+  // like a nested m:limLow — go through the normal recursive path.
+  for (const child of functionName?.elements ?? []) {
+    if (child.name === 'm:r') {
+      const whole = convertMathRunWhole(child, doc);
+      if (whole) functionNameRow.appendChild(whole);
+    } else {
+      const converted = convertChildren([child]);
+      if (converted.childNodes.length > 0) functionNameRow.appendChild(converted);
+    }
+  }
+  collapseFunctionNameBases(functionNameRow);
   forceNormalMathVariant(functionNameRow);
 
   if (functionNameRow.childNodes.length > 0) {
diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts
index 79fddd2375..49884e3ca9 100644
--- a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts
+++ b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts
@@ -46,8 +46,7 @@ const OPERATOR_CHARS = new Set([
   '\u220C', // ∈, ∉, ∋, ∌
   '\u2211',
   '\u220F', // ∑, ∏
-  '\u221A',
-  '\u221E', // √, ∞
+  '\u221A', // √ (radical sign — prefix operator)
   '\u2227',
   '\u2228',
   '\u2229',
@@ -65,16 +64,55 @@ const OPERATOR_CHARS = new Set([
   '\u2287', // ⊂, ⊃, ⊆, ⊇
 ]);
 
+type MathAtomTag = 'mi' | 'mo' | 'mn';
+
+function isDigit(ch: string): boolean {
+  return ch >= '0' && ch <= '9';
+}
+
 /**
- * Classify a text string into MathML element type.
- * - All-digit strings → <mn> (number)
- * - Known operators → <mo> (operator)
- * - Everything else → <mi> (identifier)
+ * Split a math run's text into MathML atoms, matching Word's OMML2MML.XSL.
+ *
+ * Rules (ECMA-376 §22.1.2.116 example + Annex L.6.1.13):
+ * - Consecutive digits — optionally containing one decimal point between digits —
+ *   group into a single `<mn>`.
+ * - Each recognized operator character becomes its own `<mo>`.
+ * - Every other character becomes its own `<mi>`.
+ *
+ * Example: `"n+1"` → `[<mi>n</mi>, <mo>+</mo>, <mn>1</mn>]`.
  */
-function classifyMathText(text: string): 'mn' | 'mo' | 'mi' {
-  if (/^\d*\.?\d+$/.test(text)) return 'mn';
-  if (text.length === 1 && OPERATOR_CHARS.has(text)) return 'mo';
-  return 'mi';
+export function tokenizeMathText(text: string): Array<{ tag: MathAtomTag; content: string }> {
+  const atoms: Array<{ tag: MathAtomTag; content: string }> = [];
+  let i = 0;
+  while (i < text.length) {
+    const ch = text[i]!;
+    if (isDigit(ch)) {
+      let end = i + 1;
+      let sawDot = false;
+      while (end < text.length) {
+        const c = text[end]!;
+        if (isDigit(c)) {
+          end++;
+          continue;
+        }
+        if (c === '.' && !sawDot && end + 1 < text.length && isDigit(text[end + 1]!)) {
+          sawDot = true;
+          end++;
+          continue;
+        }
+        break;
+      }
+      atoms.push({ tag: 'mn', content: text.slice(i, end) });
+      i = end;
+    } else if (OPERATOR_CHARS.has(ch)) {
+      atoms.push({ tag: 'mo', content: ch });
+      i++;
+    } else {
+      atoms.push({ tag: 'mi', content: ch });
+      i++;
+    }
+  }
+  return atoms;
 }
 
 /** ECMA-376 m:sty → MathML mathvariant (§22.1.2 math run properties). */
@@ -115,47 +153,103 @@ function resolveMathVariant(rPr: OmmlJsonNode | undefined): string | null {
   return null;
 }
 
+function extractText(node: OmmlJsonNode): string {
+  let text = '';
+  for (const child of node.elements ?? []) {
+    if (child.name === 'm:t') {
+      for (const tc of child.elements ?? []) {
+        if (tc.type === 'text' && typeof tc.text === 'string') text += tc.text;
+      }
+    }
+  }
+  return text;
+}
+
 /**
- * Convert an m:r (math run) element to MathML.
+ * Convert an m:r (math run) element to MathML atoms.
  *
  * m:r contains:
  * - m:rPr (math run properties: script, style, normal text flag)
  * - m:t (text content)
  * - Optionally w:rPr (WordprocessingML run properties for formatting)
  *
- * The text is classified as <mi>, <mo>, or <mn> based on content.
+ * The run's text is split per-character into `<mi>` / `<mo>` / `<mn>` atoms
+ * per Word's OMML2MML.XSL. For a single-atom run (common case — a one-letter
+ * variable, single operator, or an all-digit number) the converter returns a
+ * single Element. For a multi-atom run (e.g. "→∞", "x+1") it returns a
+ * DocumentFragment whose children become siblings of the parent mrow.
+ *
+ * @spec ECMA-376 §22.1.2.116 (t) — example shows multi-char mixed runs as the
+ *   normal authored shape; §22.1.2.58 (lit) implies operators are classified
+ *   per-character by default.
  */
 export const convertMathRun: MathObjectConverter = (node, doc) => {
-  const elements = node.elements ?? [];
+  const text = extractText(node);
+  if (!text) return null;
 
-  // Extract text from m:t children
-  let text = '';
-  for (const child of elements) {
-    if (child.name === 'm:t') {
-      const textChildren = child.elements ?? [];
-      for (const tc of textChildren) {
-        if (tc.type === 'text' && typeof tc.text === 'string') {
-          text += tc.text;
-        }
-      }
-    }
-  }
+  const rPr = (node.elements ?? []).find((el) => el.name === 'm:rPr');
+  const variant = resolveMathVariant(rPr);
+  const atoms = tokenizeMathText(text);
+
+  const createAtom = (atom: { tag: MathAtomTag; content: string }): Element => {
+    const el = doc.createElementNS(MATHML_NS, atom.tag);
+    el.textContent = atom.content;
+    // Apply m:rPr-derived variant to every atom in the run. Omitted attribute
+    // means "use the MathML default" (italic for single-char <mi>, normal
+    // for multi-char <mi>/<mo>/<mn>).
+    if (variant) el.setAttribute('mathvariant', variant);
+    return el;
+  };
+
+  if (atoms.length === 1) return createAtom(atoms[0]!);
+
+  const fragment = doc.createDocumentFragment();
+  for (const atom of atoms) fragment.appendChild(createAtom(atom));
+  return fragment;
+};
 
+/**
+ * Convert an m:r to a single `<mi>` element, preserving the entire run text.
+ *
+ * Used by m:func when processing m:fName children — Word's OMML2MML.XSL treats
+ * multi-letter function names (e.g. "sin", "lim", "max") as one identifier
+ * rather than splitting per character. See `convertFunction` for the calling
+ * context.
+ *
+ * Returns null if the run has no text. If the run contains non-letter
+ * characters (digits or operators), falls back to the splitting path so
+ * composite function names still render correctly.
+ */
+export function convertMathRunWhole(node: OmmlJsonNode, doc: Document): Node | null {
+  const text = extractText(node);
   if (!text) return null;
 
-  const rPr = elements.find((el) => el.name === 'm:rPr');
+  const rPr = (node.elements ?? []).find((el) => el.name === 'm:rPr');
   const variant = resolveMathVariant(rPr);
-  const tag = classifyMathText(text);
-
-  const el = doc.createElementNS(MATHML_NS, tag);
-  el.textContent = text;
+  const atoms = tokenizeMathText(text);
 
-  // Apply mathvariant when the spec properties resolve to one. The default
-  // for single-char <mi> is italic and for multi-char <mi>/<mo>/<mn> is
-  // normal — we only set an attribute when m:rPr explicitly specifies it.
-  if (variant) {
-    el.setAttribute('mathvariant', variant);
+  if (atoms.every((a) => a.tag === 'mi')) {
+    const el = doc.createElementNS(MATHML_NS, 'mi');
+    el.textContent = text;
+    if (variant) el.setAttribute('mathvariant', variant);
+    return el;
   }
 
-  return el;
-};
+  // Mixed content inside m:fName is rare (e.g. "log_2"). Fall through to the
+  // per-atom path so operators and numbers render with correct semantics.
+  if (atoms.length === 1) {
+    const atom = atoms[0]!;
+    const el = doc.createElementNS(MATHML_NS, atom.tag);
+    el.textContent = atom.content;
+    if (variant) el.setAttribute('mathvariant', variant);
+    return el;
+  }
+  const fragment = doc.createDocumentFragment();
+  for (const atom of atoms) {
+    const el = doc.createElementNS(MATHML_NS, atom.tag);
+    el.textContent = atom.content;
+    if (variant) el.setAttribute('mathvariant', variant);
+    fragment.appendChild(el);
+  }
+  return fragment;
+}
diff --git a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts
index 010d6b9218..138824639e 100644
--- a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts
+++ b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts
@@ -343,6 +343,116 @@ describe('convertOmmlToMathml', () => {
     expect(children.some((c) => c.localName === 'mo')).toBe(true); // +
     expect(children.some((c) => c.localName === 'mn')).toBe(true); // 1
   });
+
+  // ─── SD-2632: per-character split of multi-char m:r text ──────────────────
+
+  it('splits a single m:r containing operator + identifier into <mo> + <mi> (SD-2632)', () => {
+    // Fixture case 1 of math-limit-tests.docx has m:r "→∞" as one run inside
+    // m:limLow's m:lim. Word's OMML2MML.XSL splits it to <mo>→</mo><mi>∞</mi>.
+    const omml = {
+      name: 'm:oMath',
+      elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: '\u2192\u221E' }] }] }],
+    };
+    const result = convertOmmlToMathml(omml, doc);
+    const children = Array.from(result!.children);
+    expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mo:\u2192', 'mi:\u221E']);
+  });
+
+  it('splits "x+1=2" per character with digits grouped (SD-2632)', () => {
+    const omml = {
+      name: 'm:oMath',
+      elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x+1=2' }] }] }],
+    };
+    const result = convertOmmlToMathml(omml, doc);
+    const children = Array.from(result!.children);
+    expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mi:x', 'mo:+', 'mn:1', 'mo:=', 'mn:2']);
+  });
+
+  it('groups consecutive digits with an interior decimal point into one <mn> (SD-2632)', () => {
+    const omml = {
+      name: 'm:oMath',
+      elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: '123.45+67' }] }] }],
+    };
+    const result = convertOmmlToMathml(omml, doc);
+    const children = Array.from(result!.children);
+    expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mn:123.45', 'mo:+', 'mn:67']);
+  });
+
+  it('splits m:r content inside m:sub of an m:sSub (SD-2632 F3)', () => {
+    // Word's built-up "b_(n+1)" has "n+1" as a single m:r inside m:sub.
+    // The subscript should contain separate <mi>n</mi><mo>+</mo><mn>1</mn>.
+    const omml = {
+      name: 'm:oMath',
+      elements: [
+        {
+          name: 'm:sSub',
+          elements: [
+            {
+              name: 'm:e',
+              elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'b' }] }] }],
+            },
+            {
+              name: 'm:sub',
+              elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'n+1' }] }] }],
+            },
+          ],
+        },
+      ],
+    };
+    const result = convertOmmlToMathml(omml, doc);
+    const subMrow = result!.querySelector('msub > mrow:nth-child(2)');
+    expect(subMrow).not.toBeNull();
+    const children = Array.from(subMrow!.children);
+    expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mi:n', 'mo:+', 'mn:1']);
+  });
+
+  it('preserves m:rPr mathvariant across every atom of a split run (SD-2632)', () => {
+    // When m:sty="b" (bold) applies to the whole run, every atom emitted
+    // from the split inherits it.
+    const omml = {
+      name: 'm:oMath',
+      elements: [
+        {
+          name: 'm:r',
+          elements: [
+            { name: 'm:rPr', elements: [{ name: 'm:sty', attributes: { 'm:val': 'b' } }] },
+            { name: 'm:t', elements: [{ type: 'text', text: 'x+1' }] },
+          ],
+        },
+      ],
+    };
+    const result = convertOmmlToMathml(omml, doc);
+    const variants = Array.from(result!.children).map((c) => c.getAttribute('mathvariant'));
+    expect(variants).toEqual(['bold', 'bold', 'bold']);
+  });
+
+  it('keeps multi-letter function names whole inside m:func > m:fName (SD-2632 exception)', () => {
+    // Word's OMML2MML.XSL keeps "sin" as one <mi> when nested in m:fName,
+    // even though it would otherwise per-char split a bare m:r. Exception is
+    // applied by convertFunction.
+    const omml = {
+      name: 'm:oMath',
+      elements: [
+        {
+          name: 'm:func',
+          elements: [
+            {
+              name: 'm:fName',
+              elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'sin' }] }] }],
+            },
+            {
+              name: 'm:e',
+              elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x' }] }] }],
+            },
+          ],
+        },
+      ],
+    };
+    const result = convertOmmlToMathml(omml, doc);
+    const functionName = result!.querySelector('mrow > mrow:first-child > mi');
+    expect(functionName!.textContent).toBe('sin');
+    expect(functionName!.getAttribute('mathvariant')).toBe('normal');
+  });
 });
 
 describe('m:bar converter', () => {
diff --git a/packages/layout-engine/painters/dom/src/features/math/types.ts b/packages/layout-engine/painters/dom/src/features/math/types.ts
index f1e8d90b71..00c31ea60f 100644
--- a/packages/layout-engine/painters/dom/src/features/math/types.ts
+++ b/packages/layout-engine/painters/dom/src/features/math/types.ts
@@ -43,4 +43,4 @@ export type MathObjectConverter = (
   node: OmmlJsonNode,
   doc: Document,
   convertChildren: (children: OmmlJsonNode[]) => DocumentFragment,
-) => Element | null;
+) => Node | null;
diff --git a/tests/behavior/tests/importing/math-equations.spec.ts b/tests/behavior/tests/importing/math-equations.spec.ts
index 647c9d8342..def4590cdc 100644
--- a/tests/behavior/tests/importing/math-equations.spec.ts
+++ b/tests/behavior/tests/importing/math-equations.spec.ts
@@ -821,6 +821,28 @@ test.describe('m:limLow / m:limUpp (limit object) rendering', () => {
     });
     expect(leaked).toEqual([]);
   });
+
+  test('splits multi-char operator runs in m:lim content (SD-2632)', async ({ superdoc }) => {
+    await superdoc.loadDocument(LIMIT_DOC);
+    await superdoc.waitForStable();
+
+    // Case 1: lim_(n→∞). Word emits the "→∞" as a single m:r. Previously we
+    // rendered it as one <mi>→∞</mi>; now per Word's OMML2MML.XSL it splits
+    // into separate atoms.
+    const limExpressionAtoms = await superdoc.page.evaluate(() => {
+      const munders = Array.from(document.querySelectorAll('munder'));
+      const limMunder = munders.find((m) => m.children[0]?.querySelector('mi')?.textContent === 'lim');
+      const limExpr = limMunder?.children[1];
+      return Array.from(limExpr?.children ?? []).map((c) => ({ tag: c.localName, text: c.textContent }));
+    });
+
+    // At minimum the limit expression contains the identifier "n" and an
+    // operator "→" — they must be separate atoms, not bundled as one <mi>.
+    const nAtom = limExpressionAtoms.find((a) => a.text === 'n');
+    const arrowAtom = limExpressionAtoms.find((a) => a.text === '\u2192');
+    expect(nAtom).toEqual({ tag: 'mi', text: 'n' });
+    expect(arrowAtom).toEqual({ tag: 'mo', text: '\u2192' });
+  });
 });
 
 test.describe('m:eqArr (equation array) rendering', () => {
@@ -1341,9 +1363,12 @@ test.describe('m:groupChr (group character) rendering', () => {
     await superdoc.loadDocument(GROUPCHR_DOC);
     await superdoc.waitForStable();
 
+    // Use `:scope > mo` to target the group character directly — the base
+    // expression may itself contain <mo> atoms (e.g. "a+b" splits to
+    // <mi>a</mi><mo>+</mo><mi>b</mi> per Word's OMML2MML.XSL).
     const firstMunder = await superdoc.page.evaluate(() => {
       const munder = document.querySelector('munder');
-      const mo = munder?.querySelector('mo');
+      const mo = munder?.querySelector(':scope > mo');
       return mo ? { text: mo.textContent, stretchy: mo.getAttribute('stretchy') } : null;
     });
 
@@ -1359,7 +1384,7 @@ test.describe('m:groupChr (group character) rendering', () => {
     // Variant 2 — second munder in DOM order.
     const hiddenChar = await superdoc.page.evaluate(() => {
       const munders = document.querySelectorAll('munder');
-      const mo = munders[1]?.querySelector('mo');
+      const mo = munders[1]?.querySelector(':scope > mo');
       return mo?.textContent;
     });
 
@@ -1372,7 +1397,7 @@ test.describe('m:groupChr (group character) rendering', () => {
 
     const chars = await superdoc.page.evaluate(() => {
       const wrappers = document.querySelectorAll('munder, mover');
-      return Array.from(wrappers).map((w) => w.querySelector('mo')?.textContent ?? null);
+      return Array.from(wrappers).map((w) => w.querySelector(':scope > mo')?.textContent ?? null);
     });
 
     // Variants 4 (U+23DE), 5 (U+2190), 6 (U+2192).

From 799c4efca665006f8a1de942ff6f9c85eed0c3ae Mon Sep 17 00:00:00 2001
From: Caio Pizzol <caio@harbourshare.com>
Date: Mon, 20 Apr 2026 18:18:12 -0300
Subject: [PATCH 2/3] fix(math): group letters inside m:fName mixed runs +
 cover mmultiscripts (SD-2632)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to /review feedback backed by fresh Word OMML2MML.XSL evidence:

- `convertMathRunAsFunctionName` (renamed from `convertMathRunWhole` since
  "whole" no longer fits) now groups consecutive non-digit / non-operator
  characters into one <mi> while still splitting digits and operators. Word's
  XSL for `<m:fName><m:r>log_2</m:r></m:fName>` produces
  `<mi>log</mi><mo>_</mo><mn>2</mn>` — not `<mi>l</mi><mi>o</mi><mi>g</mi>…`.
- `BASE_BEARING_ELEMENTS` gains `mmultiscripts` — Word emits it when an
  `m:sPre` sits inside `m:fName`; our base-collapse pass needs to know to
  merge the first-child <mi> run.
- CONTRIBUTING.md now documents the widened `Node | null` return type.

Tests added:
- Direct `tokenizeMathText` edge cases: `.5` / `5.` / `1.2.3` / `2x+1` /
  consecutive operators / empty / standalone ∞.
- m:fName mixed-content: `log_2` stays `<mi>log</mi><mo>_</mo><mn>2</mn>`.
- Base collapse inside nested `m:sSub` under `m:fName`.
- Base collapse inside nested `m:sPre` (mmultiscripts) under `m:fName`.
- Behavior test tightened to pin the full 3-atom sequence for `n→∞`.

Disputed during review and deferred with evidence:
- Opus claim that standalone `m:limLow` with "lim" base regresses to italic:
  Word XSL itself splits "lim" per-char in that shape (with or without
  m:sty=p), so our output matches Word.
- Codex claim that Arabic-Indic digits should be `<mn>`: Word XSL also
  classifies them as `<mi>`, so our behavior matches.
- Non-BMP surrogate-pair support: edge case in extreme mathematical
  alphanumerics; Word XSL itself errored on U+1D465. Separate ticket worth.
---
 .../dom/src/features/math/CONTRIBUTING.md     |   4 +-
 .../src/features/math/converters/function.ts  |  16 +-
 .../src/features/math/converters/math-run.ts  |  91 ++++++---
 .../src/features/math/omml-to-mathml.test.ts  | 184 ++++++++++++++++++
 .../tests/importing/math-equations.spec.ts    |  12 +-
 5 files changed, 266 insertions(+), 41 deletions(-)

diff --git a/packages/layout-engine/painters/dom/src/features/math/CONTRIBUTING.md b/packages/layout-engine/painters/dom/src/features/math/CONTRIBUTING.md
index 8c002d976d..3764f6da11 100644
--- a/packages/layout-engine/painters/dom/src/features/math/CONTRIBUTING.md
+++ b/packages/layout-engine/painters/dom/src/features/math/CONTRIBUTING.md
@@ -36,7 +36,9 @@ type MathObjectConverter = (
   doc: Document,            // For creating DOM elements
   convertChildren: (children: OmmlJsonNode[]) => DocumentFragment,
                              // Recursively converts nested OMML content
-) => Element | null;
+) => Node | null;           // Return a single Element for one atom, or a
+                             // DocumentFragment when your converter produces
+                             // multiple sibling elements (see m:r / math-run).
 ```
 
 `convertChildren` is the important one. Pass it any child elements that contain nested math content (`m:e`, `m:num`, `m:sub`, etc.). It handles everything inside them, including other math objects.
diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/function.ts b/packages/layout-engine/painters/dom/src/features/math/converters/function.ts
index b8f8697f01..6aa879340b 100644
--- a/packages/layout-engine/painters/dom/src/features/math/converters/function.ts
+++ b/packages/layout-engine/painters/dom/src/features/math/converters/function.ts
@@ -1,5 +1,5 @@
 import type { MathObjectConverter } from '../types.js';
-import { convertMathRunWhole } from './math-run.js';
+import { convertMathRunAsFunctionName } from './math-run.js';
 
 const MATHML_NS = 'http://www.w3.org/1998/Math/MathML';
 const FUNCTION_APPLY_OPERATOR = '\u2061';
@@ -44,7 +44,15 @@ function forceNormalMathVariant(root: ParentNode): void {
  * m:sSub → <msub>, etc.). Word's OMML2MML.XSL keeps the base text whole
  * (e.g. "lim" as one <mi>) even though it splits regular runs per-character.
  */
-const BASE_BEARING_ELEMENTS = new Set(['munder', 'mover', 'munderover', 'msub', 'msup', 'msubsup']);
+const BASE_BEARING_ELEMENTS = new Set([
+  'munder',
+  'mover',
+  'munderover',
+  'msub',
+  'msup',
+  'msubsup',
+  'mmultiscripts', // m:sPre inside m:fName
+]);
 
 /**
  * After per-character splitting in convertMathRun, the base of a nested
@@ -107,8 +115,8 @@ export const convertFunction: MathObjectConverter = (node, doc, convertChildren)
   // like a nested m:limLow — go through the normal recursive path.
   for (const child of functionName?.elements ?? []) {
     if (child.name === 'm:r') {
-      const whole = convertMathRunWhole(child, doc);
-      if (whole) functionNameRow.appendChild(whole);
+      const atom = convertMathRunAsFunctionName(child, doc);
+      if (atom) functionNameRow.appendChild(atom);
     } else {
       const converted = convertChildren([child]);
       if (converted.childNodes.length > 0) functionNameRow.appendChild(converted);
diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts
index 49884e3ca9..4ce5a2a74e 100644
--- a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts
+++ b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts
@@ -209,47 +209,82 @@ export const convertMathRun: MathObjectConverter = (node, doc) => {
 };
 
 /**
- * Convert an m:r to a single `<mi>` element, preserving the entire run text.
+ * Tokenize a math run's text for the m:fName context: consecutive non-digit,
+ * non-operator characters stay grouped in one `<mi>` (so "log" in "log_2"
+ * remains a single identifier), while digits still group into `<mn>` and
+ * each operator character is its own `<mo>`.
  *
- * Used by m:func when processing m:fName children — Word's OMML2MML.XSL treats
- * multi-letter function names (e.g. "sin", "lim", "max") as one identifier
- * rather than splitting per character. See `convertFunction` for the calling
- * context.
+ * Matches Word's OMML2MML.XSL run-internal classification for m:fName
+ * content: `log_2` → `<mi>log</mi><mo>_</mo><mn>2</mn>`.
+ */
+function tokenizeFunctionNameText(text: string): Array<{ tag: MathAtomTag; content: string }> {
+  const atoms: Array<{ tag: MathAtomTag; content: string }> = [];
+  let i = 0;
+  while (i < text.length) {
+    const ch = text[i]!;
+    if (isDigit(ch)) {
+      let end = i + 1;
+      let sawDot = false;
+      while (end < text.length) {
+        const c = text[end]!;
+        if (isDigit(c)) {
+          end++;
+          continue;
+        }
+        if (c === '.' && !sawDot && end + 1 < text.length && isDigit(text[end + 1]!)) {
+          sawDot = true;
+          end++;
+          continue;
+        }
+        break;
+      }
+      atoms.push({ tag: 'mn', content: text.slice(i, end) });
+      i = end;
+    } else if (OPERATOR_CHARS.has(ch)) {
+      atoms.push({ tag: 'mo', content: ch });
+      i++;
+    } else {
+      // Group consecutive non-digit, non-operator characters into a single <mi>.
+      let end = i + 1;
+      while (end < text.length) {
+        const c = text[end]!;
+        if (isDigit(c) || OPERATOR_CHARS.has(c)) break;
+        end++;
+      }
+      atoms.push({ tag: 'mi', content: text.slice(i, end) });
+      i = end;
+    }
+  }
+  return atoms;
+}
+
+/**
+ * Convert an m:r inside m:fName (m:func's function-name slot). Word's
+ * OMML2MML.XSL keeps each letter-sequence whole while still splitting out
+ * digits and operators — so `sin` stays `<mi>sin</mi>`, but `log_2` becomes
+ * `<mi>log</mi><mo>_</mo><mn>2</mn>`.
  *
- * Returns null if the run has no text. If the run contains non-letter
- * characters (digits or operators), falls back to the splitting path so
- * composite function names still render correctly.
+ * Returns a single Element for single-atom runs or a DocumentFragment when
+ * the run emits multiple atoms. Returns null for empty text.
  */
-export function convertMathRunWhole(node: OmmlJsonNode, doc: Document): Node | null {
+export function convertMathRunAsFunctionName(node: OmmlJsonNode, doc: Document): Node | null {
   const text = extractText(node);
   if (!text) return null;
 
   const rPr = (node.elements ?? []).find((el) => el.name === 'm:rPr');
   const variant = resolveMathVariant(rPr);
-  const atoms = tokenizeMathText(text);
+  const atoms = tokenizeFunctionNameText(text);
 
-  if (atoms.every((a) => a.tag === 'mi')) {
-    const el = doc.createElementNS(MATHML_NS, 'mi');
-    el.textContent = text;
-    if (variant) el.setAttribute('mathvariant', variant);
-    return el;
-  }
-
-  // Mixed content inside m:fName is rare (e.g. "log_2"). Fall through to the
-  // per-atom path so operators and numbers render with correct semantics.
-  if (atoms.length === 1) {
-    const atom = atoms[0]!;
+  const createAtom = (atom: { tag: MathAtomTag; content: string }): Element => {
     const el = doc.createElementNS(MATHML_NS, atom.tag);
     el.textContent = atom.content;
     if (variant) el.setAttribute('mathvariant', variant);
     return el;
-  }
+  };
+
+  if (atoms.length === 1) return createAtom(atoms[0]!);
+
   const fragment = doc.createDocumentFragment();
-  for (const atom of atoms) {
-    const el = doc.createElementNS(MATHML_NS, atom.tag);
-    el.textContent = atom.content;
-    if (variant) el.setAttribute('mathvariant', variant);
-    fragment.appendChild(el);
-  }
+  for (const atom of atoms) fragment.appendChild(createAtom(atom));
   return fragment;
 }
diff --git a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts
index 138824639e..2d9cf3cd8b 100644
--- a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts
+++ b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts
@@ -1,6 +1,7 @@
 import { describe, it, expect } from 'vitest';
 import { JSDOM } from 'jsdom';
 import { convertOmmlToMathml, MATHML_NS } from './omml-to-mathml.js';
+import { tokenizeMathText } from './converters/math-run.js';
 
 const dom = new JSDOM('<!DOCTYPE html><html><body></body></html>');
 const doc = dom.window.document;
@@ -344,6 +345,57 @@ describe('convertOmmlToMathml', () => {
     expect(children.some((c) => c.localName === 'mn')).toBe(true); // 1
   });
 
+  // ─── tokenizeMathText direct coverage (SD-2632) ────────────────────────────
+
+  it('tokenizes leading-decimal content with . as an operator followed by digits', () => {
+    // ".5" has no leading digit, so the "." is not part of a number.
+    expect(tokenizeMathText('.5')).toEqual([
+      { tag: 'mo', content: '.' },
+      { tag: 'mn', content: '5' },
+    ]);
+  });
+
+  it('tokenizes a trailing decimal point as a separate operator', () => {
+    // "5." — the digit run ends at "5" because a lookahead digit is required.
+    expect(tokenizeMathText('5.')).toEqual([
+      { tag: 'mn', content: '5' },
+      { tag: 'mo', content: '.' },
+    ]);
+  });
+
+  it('tokenizes "1.2.3" as number, operator, number — only first dot is inline', () => {
+    expect(tokenizeMathText('1.2.3')).toEqual([
+      { tag: 'mn', content: '1.2' },
+      { tag: 'mo', content: '.' },
+      { tag: 'mn', content: '3' },
+    ]);
+  });
+
+  it('tokenizes "2x+1" — number-identifier-operator-number', () => {
+    expect(tokenizeMathText('2x+1')).toEqual([
+      { tag: 'mn', content: '2' },
+      { tag: 'mi', content: 'x' },
+      { tag: 'mo', content: '+' },
+      { tag: 'mn', content: '1' },
+    ]);
+  });
+
+  it('tokenizes consecutive operator characters as separate <mo> atoms', () => {
+    expect(tokenizeMathText('\u2264\u2265')).toEqual([
+      { tag: 'mo', content: '\u2264' },
+      { tag: 'mo', content: '\u2265' },
+    ]);
+  });
+
+  it('tokenizes empty text as an empty list', () => {
+    expect(tokenizeMathText('')).toEqual([]);
+  });
+
+  it('tokenizes standalone ∞ as identifier, not operator (SD-2632)', () => {
+    // U+221E was removed from OPERATOR_CHARS; Word classifies it as <mi>.
+    expect(tokenizeMathText('\u221E')).toEqual([{ tag: 'mi', content: '\u221E' }]);
+  });
+
   // ─── SD-2632: per-character split of multi-char m:r text ──────────────────
 
   it('splits a single m:r containing operator + identifier into <mo> + <mi> (SD-2632)', () => {
@@ -426,6 +478,138 @@ describe('convertOmmlToMathml', () => {
     expect(variants).toEqual(['bold', 'bold', 'bold']);
   });
 
+  it('keeps "log" whole but splits operators and digits for m:fName with mixed content (SD-2632)', () => {
+    // Word's OMML2MML.XSL for <m:fName><m:r>log_2</m:r></m:fName>: letters group
+    // into one <mi>, operators and digits still split.
+    const omml = {
+      name: 'm:oMath',
+      elements: [
+        {
+          name: 'm:func',
+          elements: [
+            {
+              name: 'm:fName',
+              elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'log_2' }] }] }],
+            },
+            {
+              name: 'm:e',
+              elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x' }] }] }],
+            },
+          ],
+        },
+      ],
+    };
+    const result = convertOmmlToMathml(omml, doc);
+    const fnameRow = result!.querySelector('mrow > mrow:first-child');
+    const children = Array.from(fnameRow!.children);
+    expect(children.map((c) => `${c.localName}:${c.textContent}`)).toEqual(['mi:log', 'mo:_', 'mn:2']);
+  });
+
+  it('collapses a multi-char base inside nested m:sSub wrapped by m:fName (SD-2632)', () => {
+    // Ensures the msub/msup entries of BASE_BEARING_ELEMENTS are actually pinned.
+    // <m:fName><m:sSub><m:e>f</m:e><m:sub>i</m:sub></m:sSub></m:fName> should
+    // keep "f" as a single <mi> inside the subscript wrapper's base slot.
+    const omml = {
+      name: 'm:oMath',
+      elements: [
+        {
+          name: 'm:func',
+          elements: [
+            {
+              name: 'm:fName',
+              elements: [
+                {
+                  name: 'm:sSub',
+                  elements: [
+                    {
+                      name: 'm:e',
+                      elements: [
+                        {
+                          name: 'm:r',
+                          elements: [
+                            { name: 'm:rPr', elements: [{ name: 'm:sty', attributes: { 'm:val': 'p' } }] },
+                            { name: 'm:t', elements: [{ type: 'text', text: 'log' }] },
+                          ],
+                        },
+                      ],
+                    },
+                    {
+                      name: 'm:sub',
+                      elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: '2' }] }] }],
+                    },
+                  ],
+                },
+              ],
+            },
+            {
+              name: 'm:e',
+              elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x' }] }] }],
+            },
+          ],
+        },
+      ],
+    };
+    const result = convertOmmlToMathml(omml, doc);
+    const msub = result!.querySelector('msub');
+    expect(msub).not.toBeNull();
+    const baseMi = msub!.children[0]!.querySelector('mi');
+    expect(baseMi!.textContent).toBe('log');
+    expect(baseMi!.getAttribute('mathvariant')).toBe('normal');
+  });
+
+  it('collapses multi-char base inside nested m:sPre (mmultiscripts) wrapped by m:fName (SD-2632)', () => {
+    const omml = {
+      name: 'm:oMath',
+      elements: [
+        {
+          name: 'm:func',
+          elements: [
+            {
+              name: 'm:fName',
+              elements: [
+                {
+                  name: 'm:sPre',
+                  elements: [
+                    {
+                      name: 'm:sub',
+                      elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: '2' }] }] }],
+                    },
+                    {
+                      name: 'm:sup',
+                      elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'n' }] }] }],
+                    },
+                    {
+                      name: 'm:e',
+                      elements: [
+                        {
+                          name: 'm:r',
+                          elements: [
+                            { name: 'm:rPr', elements: [{ name: 'm:sty', attributes: { 'm:val': 'p' } }] },
+                            { name: 'm:t', elements: [{ type: 'text', text: 'log' }] },
+                          ],
+                        },
+                      ],
+                    },
+                  ],
+                },
+              ],
+            },
+            {
+              name: 'm:e',
+              elements: [{ name: 'm:r', elements: [{ name: 'm:t', elements: [{ type: 'text', text: 'x' }] }] }],
+            },
+          ],
+        },
+      ],
+    };
+    const result = convertOmmlToMathml(omml, doc);
+    const mms = result!.querySelector('mmultiscripts');
+    expect(mms).not.toBeNull();
+    const baseMi = mms!.children[0]!.querySelector('mi');
+    expect(baseMi!.textContent).toBe('log');
+    expect(baseMi!.getAttribute('mathvariant')).toBe('normal');
+  });
+
   it('keeps multi-letter function names whole inside m:func > m:fName (SD-2632 exception)', () => {
     // Word's OMML2MML.XSL keeps "sin" as one <mi> when nested in m:fName,
     // even though it would otherwise per-char split a bare m:r. Exception is
diff --git a/tests/behavior/tests/importing/math-equations.spec.ts b/tests/behavior/tests/importing/math-equations.spec.ts
index def4590cdc..650c6385be 100644
--- a/tests/behavior/tests/importing/math-equations.spec.ts
+++ b/tests/behavior/tests/importing/math-equations.spec.ts
@@ -828,20 +828,16 @@ test.describe('m:limLow / m:limUpp (limit object) rendering', () => {
 
     // Case 1: lim_(n→∞). Word emits the "→∞" as a single m:r. Previously we
     // rendered it as one <mi>→∞</mi>; now per Word's OMML2MML.XSL it splits
-    // into separate atoms.
+    // into separate atoms. Assert the full ordered sequence so a regression
+    // that drops or misclassifies any atom is caught.
     const limExpressionAtoms = await superdoc.page.evaluate(() => {
       const munders = Array.from(document.querySelectorAll('munder'));
       const limMunder = munders.find((m) => m.children[0]?.querySelector('mi')?.textContent === 'lim');
       const limExpr = limMunder?.children[1];
-      return Array.from(limExpr?.children ?? []).map((c) => ({ tag: c.localName, text: c.textContent }));
+      return Array.from(limExpr?.children ?? []).map((c) => `${c.localName}:${c.textContent}`);
     });
 
-    // At minimum the limit expression contains the identifier "n" and an
-    // operator "→" — they must be separate atoms, not bundled as one <mi>.
-    const nAtom = limExpressionAtoms.find((a) => a.text === 'n');
-    const arrowAtom = limExpressionAtoms.find((a) => a.text === '\u2192');
-    expect(nAtom).toEqual({ tag: 'mi', text: 'n' });
-    expect(arrowAtom).toEqual({ tag: 'mo', text: '\u2192' });
+    expect(limExpressionAtoms).toEqual(['mi:n', 'mo:\u2192', 'mi:\u221E']);
   });
 });
 

From 35f0bb5221aca7384c230bde03704efba322015b Mon Sep 17 00:00:00 2001
From: Caio Pizzol <caio@harbourshare.com>
Date: Mon, 20 Apr 2026 18:52:07 -0300
Subject: [PATCH 3/3] fix(math): iterate tokenizer by code point to preserve
 surrogate pairs

Addresses Codex bot review on PR #2875. Astral-plane mathematical
alphanumerics (e.g. U+1D465 mathematical italic x, U+1D7D9
mathematical double-struck 1) are UTF-16 surrogate pairs. Walking
text by code unit split them into two half-pair <mi> atoms with
invalid content. `codePointUnitLength` returns 2 when the current
position starts a surrogate pair so tokenizeMathText and
tokenizeFunctionNameText step across the full code point.
---
 .../src/features/math/converters/math-run.ts  | 41 +++++++++++++------
 .../src/features/math/omml-to-mathml.test.ts  | 11 +++++
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts
index 4ce5a2a74e..7f979563f9 100644
--- a/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts
+++ b/packages/layout-engine/painters/dom/src/features/math/converters/math-run.ts
@@ -70,6 +70,20 @@ function isDigit(ch: string): boolean {
   return ch >= '0' && ch <= '9';
 }
 
+/**
+ * Length in UTF-16 code units of the code point starting at `text[i]`.
+ * Handles surrogate pairs so astral-plane characters (e.g. mathematical
+ * italic U+1D465) don't get split into two bogus <mi> atoms.
+ */
+function codePointUnitLength(text: string, i: number): number {
+  const hi = text.charCodeAt(i);
+  if (hi >= 0xd800 && hi <= 0xdbff && i + 1 < text.length) {
+    const lo = text.charCodeAt(i + 1);
+    if (lo >= 0xdc00 && lo <= 0xdfff) return 2;
+  }
+  return 1;
+}
+
 /**
  * Split a math run's text into MathML atoms, matching Word's OMML2MML.XSL.
  *
@@ -85,8 +99,9 @@ export function tokenizeMathText(text: string): Array<{ tag: MathAtomTag; conten
   const atoms: Array<{ tag: MathAtomTag; content: string }> = [];
   let i = 0;
   while (i < text.length) {
-    const ch = text[i]!;
-    if (isDigit(ch)) {
+    const step = codePointUnitLength(text, i);
+    const ch = text.slice(i, i + step);
+    if (step === 1 && isDigit(ch)) {
       let end = i + 1;
       let sawDot = false;
       while (end < text.length) {
@@ -104,12 +119,12 @@ export function tokenizeMathText(text: string): Array<{ tag: MathAtomTag; conten
       }
       atoms.push({ tag: 'mn', content: text.slice(i, end) });
       i = end;
-    } else if (OPERATOR_CHARS.has(ch)) {
+    } else if (step === 1 && OPERATOR_CHARS.has(ch)) {
       atoms.push({ tag: 'mo', content: ch });
       i++;
     } else {
       atoms.push({ tag: 'mi', content: ch });
-      i++;
+      i += step;
     }
   }
   return atoms;
@@ -221,8 +236,9 @@ function tokenizeFunctionNameText(text: string): Array<{ tag: MathAtomTag; conte
   const atoms: Array<{ tag: MathAtomTag; content: string }> = [];
   let i = 0;
   while (i < text.length) {
-    const ch = text[i]!;
-    if (isDigit(ch)) {
+    const step = codePointUnitLength(text, i);
+    const ch = text.slice(i, i + step);
+    if (step === 1 && isDigit(ch)) {
       let end = i + 1;
       let sawDot = false;
       while (end < text.length) {
@@ -240,16 +256,17 @@ function tokenizeFunctionNameText(text: string): Array<{ tag: MathAtomTag; conte
       }
       atoms.push({ tag: 'mn', content: text.slice(i, end) });
       i = end;
-    } else if (OPERATOR_CHARS.has(ch)) {
+    } else if (step === 1 && OPERATOR_CHARS.has(ch)) {
       atoms.push({ tag: 'mo', content: ch });
       i++;
     } else {
-      // Group consecutive non-digit, non-operator characters into a single <mi>.
-      let end = i + 1;
+      // Group consecutive non-digit, non-operator code points into one <mi>.
+      let end = i + step;
       while (end < text.length) {
-        const c = text[end]!;
-        if (isDigit(c) || OPERATOR_CHARS.has(c)) break;
-        end++;
+        const s = codePointUnitLength(text, end);
+        const c = text.slice(end, end + s);
+        if (s === 1 && (isDigit(c) || OPERATOR_CHARS.has(c))) break;
+        end += s;
       }
       atoms.push({ tag: 'mi', content: text.slice(i, end) });
       i = end;
diff --git a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts
index 2d9cf3cd8b..32930a01eb 100644
--- a/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts
+++ b/packages/layout-engine/painters/dom/src/features/math/omml-to-mathml.test.ts
@@ -396,6 +396,17 @@ describe('convertOmmlToMathml', () => {
     expect(tokenizeMathText('\u221E')).toEqual([{ tag: 'mi', content: '\u221E' }]);
   });
 
+  it('keeps astral-plane characters whole (does not split surrogate pairs)', () => {
+    // 𝑥 (U+1D465, mathematical italic small x) is a UTF-16 surrogate pair.
+    // Splitting by code unit would emit two bogus half-pair <mi>s.
+    const text = '\u{1D465}+1';
+    expect(tokenizeMathText(text)).toEqual([
+      { tag: 'mi', content: '\u{1D465}' },
+      { tag: 'mo', content: '+' },
+      { tag: 'mn', content: '1' },
+    ]);
+  });
+
   // ─── SD-2632: per-character split of multi-char m:r text ──────────────────
 
   it('splits a single m:r containing operator + identifier into <mo> + <mi> (SD-2632)', () => {