Skip to content

Commit f257c26

Browse files
committed
Render from_markdown via markdown-it-py
Replace the hand-rolled Markdown parser in from_markdown() with markdown-it-py plus the standard footnote plugin, and a small renderer (mdrender.py) that maps the syntax tree to Substack's node schema. Node construction is centralised in a new nodes.py module so the schema lives in one place. Footnotes (including multi-paragraph definitions) come from the footnote plugin. Adds end-to-end from_markdown feature tests covering every documented feature. Two intentional, CommonMark-correct behaviour changes vs the old parser: consecutive '>' lines are one paragraph (blank '>' lines split them), and unreferenced footnote definitions are dropped rather than appended.
1 parent 21a8e9c commit f257c26

8 files changed

Lines changed: 808 additions & 309 deletions

File tree

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,17 @@ This is a paragraph with **bold** and *italic* text.
156156
"""
157157
post.from_markdown(markdown_content, api=api)
158158

159+
# Markdown footnotes are supported. References become inline anchors and
160+
# definitions (which may span multiple paragraphs) become footnote blocks,
161+
# numbered by order of first reference. Labels can be numbers or names.
162+
footnote_markdown = """
163+
A claim that needs support.[^1] And another.[^source]
164+
165+
[^1]: The supporting detail, with a [link](https://example.com).
166+
[^source]: Author, *Title* (2025).
167+
"""
168+
post.from_markdown(footnote_markdown, api=api)
169+
159170
draft = api.post_draft(post.get_draft())
160171

161172
# set section (can only be done after first posting the draft)

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ python = "<4.0,>=3.10"
2121
requests = "^2.32.0"
2222
python-dotenv = "^1.2.1"
2323
PyYAML = "^6.0"
24+
markdown-it-py = "^3.0"
25+
mdit-py-plugins = "^0.4"
2426

2527
[tool.poetry.group.dev.dependencies]
2628

substack/mdrender.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
"""PROTOTYPE: Markdown -> Substack ProseMirror via markdown-it-py.
2+
3+
This replaces the hand-rolled parser in Post.from_markdown() with a real
4+
CommonMark parser (markdown-it-py) plus the standard footnote plugin, and a
5+
small renderer that walks the syntax tree into Substack's node schema.
6+
7+
Node construction goes through ``substack.nodes`` so the (undocumented) schema
8+
lives in exactly one place.
9+
10+
Not wired for production; imported by Post.from_markdown() when available so the
11+
existing test-suite can be run against it for evaluation.
12+
"""
13+
14+
from __future__ import annotations
15+
16+
from typing import Dict, List, Optional
17+
18+
from markdown_it import MarkdownIt
19+
from markdown_it.tree import SyntaxTreeNode
20+
from mdit_py_plugins.footnote import footnote_plugin
21+
22+
from substack import nodes
23+
from substack.nodes import MarkType, NodeType
24+
25+
# Inline markdown-it node types that translate one-to-one into a Substack mark
# carrying no attributes. Keys are markdown-it node types, values are the mark
# dicts appended to the active mark list while rendering the node's children.
_MARK_FOR = {
    tag: {"type": mark_type}
    for tag, mark_type in (
        ("strong", MarkType.STRONG),
        ("em", MarkType.EM),
        ("s", MarkType.STRIKETHROUGH),
    )
}
30+
31+
32+
def _make_parser() -> MarkdownIt:
    """Build the Markdown parser used by this module.

    Starts from the ``"commonmark"`` preset, registers the footnote plugin and
    switches on the ``strikethrough`` rule.

    Returns:
        A configured ``MarkdownIt`` instance.
    """
    parser = MarkdownIt("commonmark")
    parser.use(footnote_plugin)
    parser.enable("strikethrough")
    return parser
34+
35+
36+
def _coalesce(out_nodes: List[Dict]) -> List[Dict]:
    """Fold runs of neighbouring text nodes with equal marks into one node.

    Two nodes are merged when both have type ``NodeType.TEXT`` and their
    ``"marks"`` values compare equal (a missing key on both sides counts as
    equal). Any other node is passed through untouched and breaks the run.

    Note that merging appends to the ``"text"`` of the earlier node in place,
    so dicts from ``out_nodes`` may be modified.

    Args:
        out_nodes: Inline nodes in document order.

    Returns:
        A new list with adjacent compatible text nodes combined.
    """
    result: List[Dict] = []
    for current in out_nodes:
        previous = result[-1] if result else None
        mergeable = (
            previous is not None
            and current.get("type") == NodeType.TEXT
            and previous.get("type") == NodeType.TEXT
            and current.get("marks") == previous.get("marks")
        )
        if not mergeable:
            result.append(current)
            continue
        previous["text"] += current["text"]
    return result
50+
51+
52+
def _render_inline(node: SyntaxTreeNode, marks: List[Dict]) -> List[Dict]:
    """Render an inline subtree into a flat list of text / anchor nodes.

    Walks ``node.children`` in order and recurses into mark-carrying children
    (``strong``, ``em``, ``s``, ``link``) with the extended mark list. Inline
    node types not listed below produce no output.

    Args:
        node: Inline (or inline-container) syntax-tree node whose children are
            rendered.
        marks: Marks already active for this subtree; never mutated, a new list
            is built whenever a mark is added.

    Returns:
        Substack inline nodes, with adjacent same-mark text nodes merged.
    """
    out: List[Dict] = []
    for child in node.children:
        t = child.type
        if t == "text":
            # Skip empty strings so no empty text node is emitted.
            if child.content:
                out.append(nodes.text(child.content, marks))
        elif t == "code_inline":
            out.append(nodes.text(child.content, marks + [nodes.code_mark()]))
        elif t in _MARK_FOR:
            out.extend(_render_inline(child, marks + [_MARK_FOR[t]]))
        elif t == "link":
            href = child.attrs.get("href", "")
            out.extend(_render_inline(child, marks + [nodes.link_mark(href)]))
        elif t in ("softbreak", "hardbreak"):
            # Both kinds of line break collapse to a single space.
            out.append(nodes.text(" ", marks))
        elif t == "footnote_ref":
            # meta["id"] is presumably zero-based; anchors are numbered from 1.
            out.append(nodes.footnote_anchor(child.meta["id"] + 1))
        elif t == "image":
            # Inline images are rare in this schema; fall back to alt text.
            # markdown-it keeps the alt text in the node's content rather than
            # in attrs, so prefer it (as _captioned_image does) and only then
            # fall back to attrs / direct text children.
            alt = (
                child.content
                or child.attrs.get("alt")
                or "".join(c.content for c in child.children if c.type == "text")
            )
            if alt:
                out.append(nodes.text(alt, marks))
    return _coalesce(out)
79+
80+
81+
def _only_image(inline: SyntaxTreeNode) -> Optional[SyntaxTreeNode]:
    """Return the image node if ``inline`` holds nothing but a single image.

    Softbreak children are ignored. The image may be bare or be the sole child
    of a single link; in the link case the link's ``href`` attribute is stashed
    on the returned node as ``_link_href``.

    Args:
        inline: The inline container of a paragraph.

    Returns:
        The image node, or ``None`` when the inline content is anything else.
    """
    significant = [c for c in inline.children if c.type != "softbreak"]
    if len(significant) != 1:
        return None

    sole = significant[0]
    if sole.type == "image":
        return sole
    if sole.type != "link":
        return None

    wrapped = [c for c in sole.children if c.type != "softbreak"]
    if len(wrapped) != 1 or wrapped[0].type != "image":
        return None

    image = wrapped[0]
    # Remember the wrapping link target on the node itself for later lookup.
    image._link_href = sole.attrs.get("href")  # type: ignore[attr-defined]
    return image
93+
94+
95+
def _captioned_image(img: SyntaxTreeNode, api) -> Dict:
    """Build a captioned-image node from a markdown-it image node.

    A single leading ``/`` is stripped from the source. When ``api`` is given
    and the source does not start with ``http``, the source is passed to
    ``api.get_image`` and replaced by the ``"url"`` of the result. That lookup
    is best-effort: on any failure, or when no URL comes back, the original
    source is kept.

    Args:
        img: Image syntax-tree node; may carry a ``_link_href`` attribute.
        api: Object exposing ``get_image(src)``, or ``None`` to skip the lookup.

    Returns:
        The node produced by ``nodes.captioned_image``.
    """
    import logging

    src = img.attrs.get("src", "")
    if src.startswith("/"):
        src = src[1:]
    if api is not None and not src.startswith("http"):
        try:
            uploaded_url = api.get_image(src).get("url")
        except Exception:
            # Deliberately broad: a failed lookup must not abort the whole
            # conversion, but it should be visible rather than silent.
            logging.getLogger(__name__).warning(
                "Could not resolve image %r via the API; keeping original src",
                src,
                exc_info=True,
            )
            uploaded_url = None
        # Never replace a usable source with a missing/empty URL.
        if uploaded_url:
            src = uploaded_url
    # markdown-it stores the image alt text as the node's content, not in attrs.
    alt = img.content or img.attrs.get("alt") or None
    return nodes.captioned_image(
        src,
        alt=alt,
        href=getattr(img, "_link_href", None),
    )
111+
112+
113+
def _render_block(node: SyntaxTreeNode, api) -> List[Dict]:
    """Render a block-level node into zero or more Substack nodes.

    Handles paragraphs (an image-only paragraph becomes a captioned image),
    headings, horizontal rules, fenced / indented code, blockquotes, bullet and
    ordered lists, and the footnote block. Any other block type yields an
    empty list.

    Args:
        node: Block-level syntax-tree node.
        api: Passed through to image rendering; may be ``None``.

    Returns:
        The Substack nodes for this block, in document order.
    """
    t = node.type

    if t == "paragraph":
        inline = node.children[0]
        img = _only_image(inline)
        if img is not None:
            return [_captioned_image(img, api)]
        return [nodes.paragraph(_render_inline(inline, []))]

    if t == "heading":
        # The tag is "h1".."h6"; its second character is the level.
        level = int(node.tag[1])
        return [nodes.heading(_render_inline(node.children[0], []), level=level)]

    if t == "hr":
        return [nodes.horizontal_rule()]

    if t in ("fence", "code_block"):
        # Only the first word of a fence's info string names the language;
        # anything after it (e.g. ``title="x"``) is not part of it.
        info_words = node.info.split()
        language = info_words[0] if info_words else None
        return [nodes.code_block(node.content.rstrip("\n"), language=language)]

    if t == "blockquote":
        paras: List[Dict] = []
        for child in node.children:
            paras.extend(_render_block(child, api))
        return [nodes.blockquote(paras)]

    if t == "bullet_list":
        return [nodes.bullet_list(_render_list_items(node, api))]

    if t == "ordered_list":
        return [nodes.ordered_list(_render_list_items(node, api))]

    if t == "footnote_block":
        out = []
        for fn in node.children:
            # meta["id"] is presumably zero-based; footnotes are numbered from 1.
            number = fn.meta["id"] + 1
            # Only paragraph children of a definition are rendered.
            paras = [
                nodes.paragraph(_render_inline(child.children[0], []))
                for child in fn.children
                if child.type == "paragraph"
            ]
            out.append(nodes.footnote(number, paras))
        return out

    return []
163+
164+
165+
def _render_list_items(list_node: SyntaxTreeNode, api) -> List[Dict]:
    """Render every item of a list node into a Substack ``list_item`` dict.

    The item dict is assembled here rather than through ``nodes.list_item``
    because that helper wraps its argument in a paragraph, whereas the children
    rendered here are already block nodes.

    Args:
        list_node: A bullet or ordered list syntax-tree node.
        api: Passed through to block rendering; may be ``None``.

    Returns:
        One ``list_item`` dict per child of ``list_node``.
    """
    rendered: List[Dict] = []
    for item in list_node.children:
        blocks: List[Dict] = [
            block
            for child in item.children
            for block in _render_block(child, api)
        ]
        rendered.append({"type": NodeType.LIST_ITEM, "content": blocks})
    return rendered
175+
176+
177+
def markdown_to_doc(markdown_content: str, api=None) -> List[Dict]:
    """Convert Markdown into a list of Substack ProseMirror block nodes.

    Args:
        markdown_content: Markdown source text.
        api: Optional object handed to block rendering (used for images).

    Returns:
        The rendered block nodes for every top-level node of the parsed
        document, in order.
    """
    tokens = _make_parser().parse(markdown_content)
    root = SyntaxTreeNode(tokens)
    return [
        block
        for top_level in root.children
        for block in _render_block(top_level, api)
    ]

substack/nodes.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
"""ProseMirror node builders for Substack documents.
2+
3+
PROTOTYPE: this module centralises the (undocumented) Substack ProseMirror
4+
schema in one place. Today the node-type strings ("paragraph", "footnoteAnchor",
5+
"image2", ...) and their shapes are scattered across post.py as inline dict
6+
literals. Pulling them here gives:
7+
8+
* one source of truth for node shapes (so a schema change is a one-line fix),
9+
* discoverable, typed constructors instead of bare dict literals,
10+
* a natural seam for validation.
11+
12+
The builders intentionally return plain dicts so they stay 100% compatible with
13+
the existing draft_body structure.
14+
"""
15+
16+
from __future__ import annotations
17+
18+
from typing import Dict, List, Optional
19+
20+
21+
class NodeType:
    """String identifiers used as the ``"type"`` of document nodes."""

    # Document root and inline leaf.
    DOC = "doc"
    TEXT = "text"

    # Plain block nodes.
    PARAGRAPH = "paragraph"
    HEADING = "heading"
    BLOCKQUOTE = "blockquote"
    CODE_BLOCK = "codeBlock"
    HORIZONTAL_RULE = "horizontal_rule"

    # Lists.
    BULLET_LIST = "bullet_list"
    ORDERED_LIST = "ordered_list"
    LIST_ITEM = "list_item"

    # Footnotes: the block holding the note and the inline reference to it.
    FOOTNOTE = "footnote"
    FOOTNOTE_ANCHOR = "footnoteAnchor"

    # Images.
    CAPTIONED_IMAGE = "captionedImage"
35+
36+
37+
class MarkType:
    """String identifiers used as the ``"type"`` of inline marks."""

    # Emphasis-style marks.
    STRONG = "strong"
    EM = "em"
    STRIKETHROUGH = "strikethrough"

    # Inline code and hyperlinks.
    CODE = "code"
    LINK = "link"
43+
44+
45+
def code_mark() -> Dict:
    """Return a new inline-code mark dict."""
    mark: Dict = {"type": MarkType.CODE}
    return mark
47+
48+
49+
def text(value: str, marks: Optional[List[Dict]] = None) -> Dict:
    """Build a text node, optionally carrying marks.

    Args:
        value: The text content.
        marks: Marks to attach. ``None`` or an empty list leaves the
            ``"marks"`` key out entirely.

    Returns:
        A ``{"type": "text", "text": value}`` dict, plus ``"marks"`` when any
        were supplied.
    """
    node: Dict = {"type": NodeType.TEXT, "text": value}
    if marks:
        # Copy so sibling nodes built from one mark list do not share (and
        # cannot accidentally co-mutate) the same list object.
        node["marks"] = list(marks)
    return node
54+
55+
56+
def link_mark(href: str) -> Dict:
    """Return a link mark whose ``attrs`` hold the given ``href``."""
    attrs = {"href": href}
    return {"type": MarkType.LINK, "attrs": attrs}
58+
59+
60+
def paragraph(content: Optional[List[Dict]] = None) -> Dict:
    """Return a paragraph node; missing or empty content becomes ``[]``."""
    children = content if content else []
    return {"type": NodeType.PARAGRAPH, "content": children}
62+
63+
64+
def heading(content: List[Dict], level: int = 1) -> Dict:
    """Return a heading node with the given inline content and level."""
    node: Dict = {"type": NodeType.HEADING}
    node["content"] = content
    node["attrs"] = {"level": level}
    return node
66+
67+
68+
def horizontal_rule() -> Dict:
    """Return a horizontal-rule node (it has no content or attributes)."""
    rule: Dict = {"type": NodeType.HORIZONTAL_RULE}
    return rule
70+
71+
72+
def blockquote(paragraphs: List[Dict]) -> Dict:
    """Return a blockquote node wrapping the given block nodes.

    When ``paragraphs`` is empty the ``"content"`` key is left out.
    """
    if not paragraphs:
        return {"type": NodeType.BLOCKQUOTE}
    return {"type": NodeType.BLOCKQUOTE, "content": paragraphs}
77+
78+
79+
def list_item(content_nodes: List[Dict]) -> Dict:
    """Return a list item whose inline content is wrapped in one paragraph."""
    wrapped = paragraph(content_nodes)
    return {"type": NodeType.LIST_ITEM, "content": [wrapped]}
84+
85+
86+
def bullet_list(items: List[Dict]) -> Dict:
    """Return a bullet-list node containing the given list items."""
    node: Dict = {"type": NodeType.BULLET_LIST}
    node["content"] = items
    return node
88+
89+
90+
def ordered_list(items: List[Dict]) -> Dict:
    """Return an ordered-list node containing the given list items."""
    node: Dict = {"type": NodeType.ORDERED_LIST}
    node["content"] = items
    return node
92+
93+
94+
def code_block(code: str, language: Optional[str] = None) -> Dict:
    """Build a code-block node.

    Args:
        code: The code text. An empty string yields an empty ``"content"``
            list rather than an empty text node, which ProseMirror rejects.
        language: Optional language name; when falsy no ``"attrs"`` are set.

    Returns:
        A ``codeBlock`` node dict.
    """
    node: Dict = {
        "type": NodeType.CODE_BLOCK,
        "content": [text(code)] if code else [],
    }
    if language:
        node["attrs"] = {"language": language}
    return node
99+
100+
101+
def captioned_image(
    src: str, alt: Optional[str] = None, href: Optional[str] = None
) -> Dict:
    """Return a captioned-image node.

    Args:
        src: Image source, stored under ``"src"``.
        alt: Alternative text; stored under ``"alt"`` only when truthy.
        href: Link target; stored under ``"href"`` only when truthy.
    """
    node: Dict = {"type": NodeType.CAPTIONED_IMAGE, "src": src}
    optional_fields = (("alt", alt), ("href", href))
    node.update({key: value for key, value in optional_fields if value})
    return node
110+
111+
112+
def footnote_anchor(number: int) -> Dict:
    """Return an inline footnote-anchor node carrying the footnote number."""
    attrs = {"number": number}
    return {"type": NodeType.FOOTNOTE_ANCHOR, "attrs": attrs}
114+
115+
116+
def footnote(number: int, paragraphs: List[Dict]) -> Dict:
    """Return a footnote block node.

    Args:
        number: The footnote number, stored in ``attrs``.
        paragraphs: Body of the footnote; when empty, a single empty paragraph
            is used instead.
    """
    body = paragraphs if paragraphs else [paragraph()]
    node: Dict = {"type": NodeType.FOOTNOTE}
    node["attrs"] = {"number": number}
    node["content"] = body
    return node

0 commit comments

Comments
 (0)