Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,17 @@ This is a paragraph with **bold** and *italic* text.
"""
post.from_markdown(markdown_content, api=api)

# Markdown footnotes are supported. References become inline anchors and
# definitions (which may span multiple paragraphs) become footnote blocks,
# numbered by order of first reference. Labels can be numbers or names.
footnote_markdown = """
A claim that needs support.[^1] And another.[^source]

[^1]: The supporting detail, with a [link](https://example.com).
[^source]: Author, *Title* (2025).
"""
post.from_markdown(footnote_markdown, api=api)

draft = api.post_draft(post.get_draft())

# set section (can only be done after first posting the draft)
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ python = "<4.0,>=3.10"
requests = "^2.32.0"
python-dotenv = "^1.2.1"
PyYAML = "^6.0"
markdown-it-py = "^3.0"
mdit-py-plugins = "^0.4"

[tool.poetry.group.dev.dependencies]

Expand Down
183 changes: 183 additions & 0 deletions substack/mdrender.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""PROTOTYPE: Markdown -> Substack ProseMirror via markdown-it-py.

This replaces the hand-rolled parser in Post.from_markdown() with a real
CommonMark parser (markdown-it-py) plus the standard footnote plugin, and a
small renderer that walks the syntax tree into Substack's node schema.

Node construction goes through ``substack.nodes`` so the (undocumented) schema
lives in exactly one place.

Not wired for production; imported by Post.from_markdown() when available so the
existing test-suite can be run against it for evaluation.
"""

from __future__ import annotations

from typing import Dict, List, Optional

from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
from mdit_py_plugins.footnote import footnote_plugin

from substack import nodes
from substack.nodes import MarkType, NodeType

# Maps the type of a markdown-it inline container node to the mark that
# _render_inline adds to everything rendered inside that container.
_MARK_FOR = {
    tag: {"type": mark_type}
    for tag, mark_type in (
        ("strong", MarkType.STRONG),
        ("em", MarkType.EM),
        ("s", MarkType.STRIKETHROUGH),
    )
}


def _make_parser() -> MarkdownIt:
    """Build the Markdown parser used by this module.

    Starts from the "commonmark" preset, registers the footnote plugin and
    switches on the "strikethrough" rule.
    """
    parser = MarkdownIt("commonmark")
    parser = parser.use(footnote_plugin)
    return parser.enable("strikethrough")


def _coalesce(out_nodes: List[Dict]) -> List[Dict]:
    """Fold runs of neighbouring text nodes with equal marks into one node.

    A text node is appended onto the previous output node when both are text
    nodes and their ``marks`` entries compare equal (two nodes without marks
    also match). Every other node is passed through in order. The surviving
    node of a merged run is updated in place.
    """
    result: List[Dict] = []
    for current in out_nodes:
        previous = result[-1] if result else None
        can_merge = (
            previous is not None
            and current.get("type") == NodeType.TEXT
            and previous.get("type") == NodeType.TEXT
            and current.get("marks") == previous.get("marks")
        )
        if not can_merge:
            result.append(current)
            continue
        previous["text"] += current["text"]
    return result


def _render_inline(node: SyntaxTreeNode, marks: List[Dict]) -> List[Dict]:
    """Flatten the children of an inline node into text / anchor nodes.

    Args:
        node: The node whose children are rendered.
        marks: Marks inherited from enclosing containers; they are attached
            to every text node produced at this level.

    Returns:
        The rendered nodes after ``_coalesce`` has merged adjacent text nodes
        with equal marks. Child types without a branch below are dropped.
    """
    rendered: List[Dict] = []
    for item in node.children:
        kind = item.type

        if kind == "text":
            # Skip zero-length text so no empty text node is emitted.
            if item.content:
                rendered.append(nodes.text(item.content, marks))
            continue

        if kind == "code_inline":
            rendered.append(nodes.text(item.content, marks + [nodes.code_mark()]))
            continue

        if kind in _MARK_FOR:
            # Emphasis-like container: recurse with the extra mark applied.
            rendered += _render_inline(item, marks + [_MARK_FOR[kind]])
            continue

        if kind == "link":
            target = item.attrs.get("href", "")
            rendered += _render_inline(item, marks + [nodes.link_mark(target)])
            continue

        if kind == "softbreak" or kind == "hardbreak":
            # Both kinds of line break collapse to a single space.
            rendered.append(nodes.text(" ", marks))
            continue

        if kind == "footnote_ref":
            # meta["id"] is zero-based; anchors are numbered from 1.
            rendered.append(nodes.footnote_anchor(item.meta["id"] + 1))
            continue

        if kind == "image":
            # Inline images are rare in this schema; fall back to alt text.
            alt = item.attrs.get("alt")
            if not alt:
                alt = "".join(c.content for c in item.children if c.type == "text")
            if alt:
                rendered.append(nodes.text(alt, marks))

    return _coalesce(rendered)


def _only_image(inline: SyntaxTreeNode) -> Optional[SyntaxTreeNode]:
    """Return the image node when *inline* holds nothing but one image.

    Softbreak children are ignored at both levels. Two shapes qualify:

    * a single ``image`` child, returned as is;
    * a single ``link`` child whose only child is an ``image``; the link's
      ``href`` is then stashed on the image node as ``_link_href`` before it
      is returned.

    Anything else yields ``None``.
    """
    visible = [c for c in inline.children if c.type != "softbreak"]
    if len(visible) != 1:
        return None

    sole = visible[0]
    if sole.type == "image":
        return sole
    if sole.type != "link":
        return None

    wrapped = [c for c in sole.children if c.type != "softbreak"]
    if len(wrapped) != 1 or wrapped[0].type != "image":
        return None

    image = wrapped[0]
    # Carry the link target along on the node so the image builder can use it.
    image._link_href = sole.attrs.get("href")  # type: ignore[attr-defined]
    return image


def _captioned_image(img: SyntaxTreeNode, api) -> Dict:
    """Build a captioned-image node from a markdown-it image node.

    Args:
        img: The image node. A ``_link_href`` attribute, when present (set by
            ``_only_image`` for link-wrapped images), becomes the image's
            ``href``.
        api: Object exposing ``get_image(path)``; used to resolve sources
            that do not start with "http". May be ``None`` to skip that step.

    Returns:
        The node produced by ``nodes.captioned_image``.
    """
    import logging

    src = img.attrs.get("src", "")
    # A single leading slash is dropped before the source is looked up.
    # NOTE(review): presumably so the path resolves relative to the working
    # directory - confirm against Post.from_markdown.
    if src.startswith("/"):
        src = src[1:]
    if api is not None and not src.startswith("http"):
        try:
            url = api.get_image(src).get("url")
        except Exception:
            # Best effort: a failed lookup must not abort the conversion, but
            # it should be visible because the post keeps the local path.
            logging.getLogger(__name__).warning(
                "Could not resolve image %r; keeping original source",
                src,
                exc_info=True,
            )
            url = None
        # Only replace the source when a URL actually came back, so a
        # response without "url" cannot turn src into None.
        if url:
            src = url
    # markdown-it stores the image alt text as the node's content, not in attrs.
    alt = img.content or img.attrs.get("alt") or None
    return nodes.captioned_image(
        src,
        alt=alt,
        href=getattr(img, "_link_href", None),
    )


def _render_block(node: SyntaxTreeNode, api) -> List[Dict]:
    """Render a block-level node into zero or more Substack nodes.

    Args:
        node: A block-level markdown-it syntax-tree node.
        api: Handed to ``_captioned_image`` for image-only paragraphs and to
            recursive calls; may be ``None``.

    Returns:
        The rendered nodes. Block types without a branch below yield an empty
        list, i.e. they are dropped from the output.
    """
    t = node.type

    if t == "paragraph":
        inline = node.children[0]
        img = _only_image(inline)
        if img is not None:
            # A paragraph holding nothing but an image becomes an image block.
            return [_captioned_image(img, api)]
        return [nodes.paragraph(_render_inline(inline, []))]

    if t == "heading":
        # The tag is "h<level>"; its second character is the level digit.
        level = int(node.tag[1])
        return [nodes.heading(_render_inline(node.children[0], []), level=level)]

    if t == "hr":
        return [nodes.horizontal_rule()]

    if t in ("fence", "code_block"):
        # Only the first word of a fence's info string names the language;
        # anything after it is free-form metadata. An empty info string
        # (always the case for indented code blocks) means no language.
        info_words = node.info.split()
        language = info_words[0] if info_words else None
        return [nodes.code_block(node.content.rstrip("\n"), language=language)]

    if t == "blockquote":
        paras: List[Dict] = []
        for child in node.children:
            paras.extend(_render_block(child, api))
        return [nodes.blockquote(paras)]

    if t == "bullet_list":
        return [nodes.bullet_list(_render_list_items(node, api))]

    if t == "ordered_list":
        return [nodes.ordered_list(_render_list_items(node, api))]

    if t == "footnote_block":
        out = []
        for fn in node.children:
            # meta["id"] is zero-based; footnotes are numbered from 1, matching
            # the anchors emitted by _render_inline.
            number = fn.meta["id"] + 1
            # Only paragraph children of a definition are rendered; other
            # block content inside a footnote is skipped.
            paras = [
                nodes.paragraph(_render_inline(child.children[0], []))
                for child in fn.children
                if child.type == "paragraph"
            ]
            out.append(nodes.footnote(number, paras))
        return out

    return []


def _render_list_items(list_node: SyntaxTreeNode, api) -> List[Dict]:
    """Render each child of a list node into a ``list_item`` node.

    The item dict is assembled here rather than via ``nodes.list_item``:
    that builder wraps inline content in a single paragraph, whereas the
    children rendered here may already be block nodes.
    """
    rendered_items: List[Dict] = []
    for item in list_node.children:
        blocks: List[Dict] = [
            rendered
            for child in item.children
            for rendered in _render_block(child, api)
        ]
        rendered_items.append({"type": NodeType.LIST_ITEM, "content": blocks})
    return rendered_items


def markdown_to_doc(markdown_content: str, api=None) -> List[Dict]:
    """Convert Markdown into a list of Substack ProseMirror block nodes.

    The text is parsed with the parser from ``_make_parser``, wrapped in a
    syntax tree, and every top-level node is rendered with ``_render_block``.
    *api* is forwarded unchanged to the block renderer.
    """
    tokens = _make_parser().parse(markdown_content)
    root = SyntaxTreeNode(tokens)
    return [
        rendered
        for top_level in root.children
        for rendered in _render_block(top_level, api)
    ]
121 changes: 121 additions & 0 deletions substack/nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""ProseMirror node builders for Substack documents.

PROTOTYPE: this module centralises the (undocumented) Substack ProseMirror
schema in one place. Today the node-type strings ("paragraph", "footnoteAnchor",
"image2", ...) and their shapes are scattered across post.py as inline dict
literals. Pulling them here gives:

* one source of truth for node shapes (so a schema change is a one-line fix),
* discoverable, typed constructors instead of bare dict literals,
* a natural seam for validation.

The builders intentionally return plain dicts so they stay 100% compatible with
the existing draft_body structure.
"""

from __future__ import annotations

from typing import Dict, List, Optional


class NodeType:
    """String identifiers used as the ``type`` of the nodes built here.

    The values mix camelCase and snake_case.
    NOTE(review): presumably they mirror the remote schema verbatim - confirm
    before normalising any spelling.
    """

    # Document root and text leaf.
    DOC = "doc"
    TEXT = "text"

    # Plain block nodes.
    PARAGRAPH = "paragraph"
    HEADING = "heading"
    BLOCKQUOTE = "blockquote"
    CODE_BLOCK = "codeBlock"
    HORIZONTAL_RULE = "horizontal_rule"
    CAPTIONED_IMAGE = "captionedImage"

    # Lists.
    BULLET_LIST = "bullet_list"
    ORDERED_LIST = "ordered_list"
    LIST_ITEM = "list_item"

    # Footnotes: the inline anchor and the definition block.
    FOOTNOTE_ANCHOR = "footnoteAnchor"
    FOOTNOTE = "footnote"


class MarkType:
    """String identifiers used as the ``type`` of the marks built here."""

    # Emphasis-style marks.
    EM = "em"
    STRONG = "strong"
    STRIKETHROUGH = "strikethrough"

    # Inline code and hyperlinks.
    CODE = "code"
    LINK = "link"


def code_mark() -> Dict:
    """Return a new inline-code mark."""
    mark: Dict = {"type": MarkType.CODE}
    return mark


def text(value: str, marks: Optional[List[Dict]] = None) -> Dict:
    """Return a text node for *value*.

    The ``marks`` key is only present when *marks* is a non-empty list; the
    list is stored as given, not copied.
    """
    if not marks:
        return {"type": NodeType.TEXT, "text": value}
    return {"type": NodeType.TEXT, "text": value, "marks": marks}


def link_mark(href: str) -> Dict:
    """Return a link mark pointing at *href*."""
    attrs = {"href": href}
    return {"type": MarkType.LINK, "attrs": attrs}


def paragraph(content: Optional[List[Dict]] = None) -> Dict:
    """Return a paragraph node; missing or empty *content* becomes ``[]``."""
    children = content if content else []
    return {"type": NodeType.PARAGRAPH, "content": children}


def heading(content: List[Dict], level: int = 1) -> Dict:
    """Return a heading node holding *content* at the given *level*."""
    node: Dict = {"type": NodeType.HEADING, "content": content}
    node["attrs"] = {"level": level}
    return node


def horizontal_rule() -> Dict:
    """Return a horizontal-rule node (it carries no content or attrs)."""
    rule: Dict = {"type": NodeType.HORIZONTAL_RULE}
    return rule


def blockquote(paragraphs: List[Dict]) -> Dict:
    """Return a blockquote node wrapping *paragraphs*.

    The ``content`` key is left out entirely when *paragraphs* is empty.
    """
    if not paragraphs:
        return {"type": NodeType.BLOCKQUOTE}
    return {"type": NodeType.BLOCKQUOTE, "content": paragraphs}


def list_item(content_nodes: List[Dict]) -> Dict:
    """Return a list item whose inline *content_nodes* sit in one paragraph."""
    wrapper = paragraph(content_nodes)
    return {"type": NodeType.LIST_ITEM, "content": [wrapper]}


def bullet_list(items: List[Dict]) -> Dict:
    """Return a bullet-list node containing the given list *items*."""
    node: Dict = {"type": NodeType.BULLET_LIST}
    node["content"] = items
    return node


def ordered_list(items: List[Dict]) -> Dict:
    """Return an ordered-list node containing the given list *items*."""
    node: Dict = {"type": NodeType.ORDERED_LIST}
    node["content"] = items
    return node


def code_block(code: str, language: Optional[str] = None) -> Dict:
    """Return a code-block node for *code*.

    Args:
        code: The literal source text of the block.
        language: Optional language name; stored under ``attrs`` only when
            non-empty.

    Returns:
        The node. ``content`` holds one unmarked text node, or is an empty
        list when *code* is empty.
    """
    # ProseMirror does not allow zero-length text nodes, so an empty block
    # must have no children rather than a text node with "".
    children: List[Dict] = [text(code)] if code else []
    node: Dict = {"type": NodeType.CODE_BLOCK, "content": children}
    if language:
        node["attrs"] = {"language": language}
    return node


def captioned_image(
    src: str, alt: Optional[str] = None, href: Optional[str] = None
) -> Dict:
    """Return a captioned-image node for *src*.

    ``alt`` and ``href`` are stored as top-level keys next to ``src``, and
    only when they are non-empty.
    """
    node: Dict = {"type": NodeType.CAPTIONED_IMAGE, "src": src}
    for key, value in (("alt", alt), ("href", href)):
        if value:
            node[key] = value
    return node


def footnote_anchor(number: int) -> Dict:
    """Return the inline anchor node that refers to footnote *number*."""
    attrs = {"number": number}
    return {"type": NodeType.FOOTNOTE_ANCHOR, "attrs": attrs}


def footnote(number: int, paragraphs: List[Dict]) -> Dict:
    """Return the footnote block for *number*.

    When *paragraphs* is empty the footnote gets a single empty paragraph as
    its content instead.
    """
    body = paragraphs if paragraphs else [paragraph()]
    node: Dict = {"type": NodeType.FOOTNOTE, "attrs": {"number": number}}
    node["content"] = body
    return node
Loading