|
| 1 | +"""PROTOTYPE: Markdown -> Substack ProseMirror via markdown-it-py. |
| 2 | +
|
| 3 | +This replaces the hand-rolled parser in Post.from_markdown() with a real |
| 4 | +CommonMark parser (markdown-it-py) plus the standard footnote plugin, and a |
| 5 | +small renderer that walks the syntax tree into Substack's node schema. |
| 6 | +
|
| 7 | +Node construction goes through ``substack.nodes`` so the (undocumented) schema |
| 8 | +lives in exactly one place. |
| 9 | +
|
| 10 | +Not wired for production; imported by Post.from_markdown() when available so the |
| 11 | +existing test-suite can be run against it for evaluation. |
| 12 | +""" |
| 13 | + |
| 14 | +from __future__ import annotations |
| 15 | + |
| 16 | +from typing import Dict, List, Optional |
| 17 | + |
| 18 | +from markdown_it import MarkdownIt |
| 19 | +from markdown_it.tree import SyntaxTreeNode |
| 20 | +from mdit_py_plugins.footnote import footnote_plugin |
| 21 | + |
| 22 | +from substack import nodes |
| 23 | +from substack.nodes import MarkType, NodeType |
| 24 | + |
# Inline container node types that translate directly into a single mark.
# Keys are markdown-it syntax-tree node types; values are the mark dicts that
# get added to the active mark list while the node's children are rendered.
_MARK_FOR = dict(
    strong={"type": MarkType.STRONG},
    em={"type": MarkType.EM},
    s={"type": MarkType.STRIKETHROUGH},
)
| 30 | + |
| 31 | + |
def _make_parser() -> MarkdownIt:
    """Build a CommonMark parser with footnotes and strikethrough switched on."""
    parser = MarkdownIt("commonmark")
    parser.use(footnote_plugin)
    parser.enable("strikethrough")
    return parser
| 34 | + |
| 35 | + |
def _coalesce(out_nodes: List[Dict]) -> List[Dict]:
    """Merge adjacent text nodes that carry identical marks (e.g. softbreaks).

    Two neighbouring nodes are merged when both have type ``NodeType.TEXT``
    and their ``marks`` values compare equal (a missing ``marks`` key on both
    sides counts as equal). Every other node is passed through unchanged and
    in its original position.

    Args:
        out_nodes: Flat list of inline node dicts, in document order.

    Returns:
        A new list with runs of mergeable text nodes collapsed into one node.
        The dicts in ``out_nodes`` are never modified: a merged node is a
        fresh dict, while nodes that were not merged are the same objects
        that were passed in.
    """
    merged: List[Dict] = []
    for node in out_nodes:
        previous = merged[-1] if merged else None
        if (
            previous is not None
            and node.get("type") == NodeType.TEXT
            and previous.get("type") == NodeType.TEXT
            and node.get("marks") == previous.get("marks")
        ):
            # Replace the tail with a new dict instead of appending to its
            # text in place, so the caller's node dicts are left untouched.
            merged[-1] = {**previous, "text": previous["text"] + node["text"]}
        else:
            merged.append(node)
    return merged
| 50 | + |
| 51 | + |
def _plain_text(node: SyntaxTreeNode) -> str:
    """Flatten an inline subtree into plain text, discarding all markup."""
    pieces: List[str] = []
    for child in node.children:
        if child.type in ("text", "code_inline"):
            pieces.append(child.content)
        elif child.type in ("softbreak", "hardbreak"):
            pieces.append(" ")
        else:
            # Containers (em, strong, link, nested image, ...) contribute the
            # text of their descendants; childless nodes contribute nothing.
            pieces.append(_plain_text(child))
    return "".join(pieces)


def _render_inline(node: SyntaxTreeNode, marks: List[Dict]) -> List[Dict]:
    """Render an inline subtree into a flat list of text / anchor nodes.

    Args:
        node: Inline syntax-tree node whose children are rendered.
        marks: Marks inherited from enclosing inline containers. The list is
            not modified; nested containers receive an extended copy.

    Returns:
        The rendered nodes in document order, with adjacent text nodes that
        share the same marks merged by ``_coalesce``.

    Child node types are handled as follows; any type not listed is skipped:

    * ``text``: a text node (empty content is dropped).
    * ``code_inline``: a text node with an extra code mark.
    * ``strong`` / ``em`` / ``s``: children rendered with the mark added.
    * ``link``: children rendered with a link mark for the ``href`` attr.
    * ``softbreak`` / ``hardbreak``: a single space.
    * ``footnote_ref``: a footnote anchor numbered ``meta["id"] + 1``.
    * ``image``: the alt text as a plain text node.
    """
    out: List[Dict] = []
    for child in node.children:
        t = child.type
        if t == "text":
            if child.content:
                out.append(nodes.text(child.content, marks))
        elif t == "code_inline":
            out.append(nodes.text(child.content, marks + [nodes.code_mark()]))
        elif t in _MARK_FOR:
            out.extend(_render_inline(child, marks + [_MARK_FOR[t]]))
        elif t == "link":
            href = child.attrs.get("href", "")
            out.extend(_render_inline(child, marks + [nodes.link_mark(href)]))
        elif t in ("softbreak", "hardbreak"):
            out.append(nodes.text(" ", marks))
        elif t == "footnote_ref":
            # Footnote ids in the token meta are 0-based; anchors are 1-based.
            out.append(nodes.footnote_anchor(child.meta["id"] + 1))
        elif t == "image":
            # Inline images are rare in this schema; fall back to alt text.
            # The alt label is itself parsed inline content, so flatten the
            # whole subtree: joining only direct ``text`` children would lose
            # anything wrapped in emphasis, code spans, etc.
            alt = child.attrs.get("alt") or _plain_text(child)
            if alt:
                out.append(nodes.text(alt, marks))
    return _coalesce(out)
| 79 | + |
| 80 | + |
| 81 | +def _only_image(inline: SyntaxTreeNode) -> Optional[SyntaxTreeNode]: |
| 82 | + """If an inline node is just an image (optionally wrapped in a link), return it.""" |
| 83 | + kids = [c for c in inline.children if c.type != "softbreak"] |
| 84 | + if len(kids) == 1 and kids[0].type == "image": |
| 85 | + return kids[0] |
| 86 | + if len(kids) == 1 and kids[0].type == "link": |
| 87 | + inner = [c for c in kids[0].children if c.type != "softbreak"] |
| 88 | + if len(inner) == 1 and inner[0].type == "image": |
| 89 | + img = inner[0] |
| 90 | + img._link_href = kids[0].attrs.get("href") # type: ignore[attr-defined] |
| 91 | + return img |
| 92 | + return None |
| 93 | + |
| 94 | + |
def _captioned_image(img: SyntaxTreeNode, api) -> Dict:
    """Build a captioned-image node from a markdown image node.

    Args:
        img: Image syntax-tree node. Its ``src`` attr supplies the source, its
            content (or ``alt`` attr) the alt text, and an optional
            ``_link_href`` attribute the link target.
        api: Optional client exposing ``get_image(path)``. When given, any
            source that does not start with ``http`` is passed to it and the
            ``"url"`` of the response replaces the source.

    Returns:
        The node produced by ``nodes.captioned_image``.

    The upload is best effort: if ``get_image`` raises, or its response has
    no usable ``"url"``, the original source is kept (a raised exception is
    logged as a warning) instead of becoming ``None``.
    """
    import logging

    src = img.attrs.get("src", "")
    if src.startswith("/"):
        src = src[1:]
    if api is not None and not src.startswith("http"):
        try:
            uploaded_url = api.get_image(src).get("url")
        except Exception:
            # Deliberately non-fatal, but leave a trace so a broken upload
            # does not go unnoticed.
            logging.getLogger(__name__).warning(
                "Image upload failed for %r; keeping original source",
                src,
                exc_info=True,
            )
        else:
            # Only replace the source when the response actually carries one.
            if uploaded_url:
                src = uploaded_url
    # markdown-it stores the image alt text as the node's content, not in attrs.
    alt = img.content or img.attrs.get("alt") or None
    return nodes.captioned_image(
        src,
        alt=alt,
        href=getattr(img, "_link_href", None),
    )
| 111 | + |
| 112 | + |
def _render_block(node: SyntaxTreeNode, api) -> List[Dict]:
    """Render a block-level node into zero or more Substack nodes.

    Args:
        node: Block-level syntax-tree node.
        api: Optional client forwarded to ``_captioned_image``.

    Returns:
        A list of rendered nodes. Most block types yield exactly one node; a
        ``footnote_block`` yields one node per footnote, and block types that
        are not handled here yield an empty list.
    """
    t = node.type

    if t == "paragraph":
        inline = node.children[0]
        # A paragraph that is nothing but an image becomes a block-level
        # captioned image rather than a paragraph of text.
        img = _only_image(inline)
        if img is not None:
            return [_captioned_image(img, api)]
        return [nodes.paragraph(_render_inline(inline, []))]

    if t == "heading":
        # The tag is "h1".."h6"; its second character is the level.
        level = int(node.tag[1])
        return [nodes.heading(_render_inline(node.children[0], []), level=level)]

    if t == "hr":
        return [nodes.horizontal_rule()]

    if t in ("fence", "code_block"):
        # Only the first word of a fence's info string names the language;
        # anything after it (e.g. ``python title="x"``) is extra metadata.
        # Indented code blocks have an empty info string.
        info_words = node.info.split()
        language = info_words[0] if info_words else None
        return [nodes.code_block(node.content.rstrip("\n"), language=language)]

    if t == "blockquote":
        quoted: List[Dict] = []
        for child in node.children:
            quoted.extend(_render_block(child, api))
        return [nodes.blockquote(quoted)]

    if t == "bullet_list":
        return [nodes.bullet_list(_render_list_items(node, api))]

    if t == "ordered_list":
        return [nodes.ordered_list(_render_list_items(node, api))]

    if t == "footnote_block":
        out = []
        for fn in node.children:
            # Footnote ids in the token meta are 0-based; numbers are 1-based.
            number = fn.meta["id"] + 1
            # Only paragraph children of a footnote are rendered.
            paras = [
                nodes.paragraph(_render_inline(child.children[0], []))
                for child in fn.children
                if child.type == "paragraph"
            ]
            out.append(nodes.footnote(number, paras))
        return out

    return []
| 163 | + |
| 164 | + |
def _render_list_items(list_node: SyntaxTreeNode, api) -> List[Dict]:
    """Render each child of a list node into a list-item node.

    The item dict is assembled here rather than through ``nodes.list_item``,
    which wraps inline content in a single paragraph: an item's children are
    block nodes already, so each is rendered with ``_render_block`` and the
    results are concatenated into the item's ``content``.
    """
    return [
        {
            "type": NodeType.LIST_ITEM,
            "content": [
                rendered
                for block in item.children
                for rendered in _render_block(block, api)
            ],
        }
        for item in list_node.children
    ]
| 175 | + |
| 176 | + |
def markdown_to_doc(markdown_content: str, api=None) -> List[Dict]:
    """Convert Markdown into a list of Substack ProseMirror block nodes.

    The text is parsed with a freshly built parser, wrapped in a syntax tree,
    and every top-level block is rendered in order. ``api`` is forwarded
    unchanged to the block renderer.
    """
    tokens = _make_parser().parse(markdown_content)
    document = SyntaxTreeNode(tokens)
    return [
        rendered
        for block in document.children
        for rendered in _render_block(block, api)
    ]
0 commit comments