diff --git a/README.md b/README.md
index c4f57fa..dbf9f10 100644
--- a/README.md
+++ b/README.md
@@ -156,6 +156,17 @@ This is a paragraph with **bold** and *italic* text.
 """
 post.from_markdown(markdown_content, api=api)
 
+# Markdown footnotes are supported. References become inline anchors and
+# definitions (which may span multiple paragraphs) become footnote blocks,
+# numbered by order of first reference. Labels can be numbers or names.
+footnote_markdown = """
+A claim that needs support.[^1] And another.[^source]
+
+[^1]: The supporting detail, with a [link](https://example.com).
+[^source]: Author, *Title* (2025).
+"""
+post.from_markdown(footnote_markdown, api=api)
+
 draft = api.post_draft(post.get_draft())
 
 # set section (can only be done after first posting the draft)
diff --git a/pyproject.toml b/pyproject.toml
index 9a96686..8c9ded8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,8 @@ python = "<4.0,>=3.10"
 requests = "^2.32.0"
 python-dotenv = "^1.2.1"
 PyYAML = "^6.0"
+markdown-it-py = "^3.0"
+mdit-py-plugins = "^0.4"
 
 
 [tool.poetry.group.dev.dependencies]
diff --git a/substack/mdrender.py b/substack/mdrender.py
new file mode 100644
index 0000000..a3f5515
--- /dev/null
+++ b/substack/mdrender.py
@@ -0,0 +1,273 @@
+"""PROTOTYPE: Markdown -> Substack ProseMirror via markdown-it-py.
+
+This replaces the hand-rolled parser in Post.from_markdown() with a real
+CommonMark parser (markdown-it-py) plus the standard footnote plugin, and a
+small renderer that walks the syntax tree into Substack's node schema.
+
+Node construction goes through ``substack.nodes`` so the (undocumented) schema
+lives in exactly one place.
+
+Post.from_markdown() imports this module unconditionally and delegates to
+markdown_to_doc(), so markdown-it-py and mdit-py-plugins are hard runtime
+dependencies of the package.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Dict, List, Optional
+
+from markdown_it import MarkdownIt
+from markdown_it.tree import SyntaxTreeNode
+from mdit_py_plugins.footnote import footnote_plugin
+
+from substack import nodes
+from substack.nodes import MarkType, NodeType
+
+logger = logging.getLogger(__name__)
+
+_MARK_FOR = {
+    "strong": {"type": MarkType.STRONG},
+    "em": {"type": MarkType.EM},
+    "s": {"type": MarkType.STRIKETHROUGH},
+}
+
+# Image sources with these prefixes are already remote and are never uploaded.
+_REMOTE_PREFIXES = ("http://", "https://")
+
+
+def _make_parser() -> MarkdownIt:
+    """Build a CommonMark parser with footnotes and ~~strikethrough~~ enabled."""
+    return MarkdownIt("commonmark").use(footnote_plugin).enable("strikethrough")
+
+
+def _coalesce(out_nodes: List[Dict]) -> List[Dict]:
+    """Merge adjacent text nodes that carry identical marks (e.g. softbreaks).
+
+    The first node of each run is extended in place; any other node is passed
+    through unchanged and ends the run.
+    """
+    merged: List[Dict] = []
+    for node in out_nodes:
+        previous = merged[-1] if merged else None
+        if (
+            previous is not None
+            and node.get("type") == NodeType.TEXT
+            and previous.get("type") == NodeType.TEXT
+            and node.get("marks") == previous.get("marks")
+        ):
+            previous["text"] += node["text"]
+        else:
+            merged.append(node)
+    return merged
+
+
+def _image_alt(img: SyntaxTreeNode) -> str:
+    """Return the alt text of an image node, or "" when it has none.
+
+    markdown-it stores the alt text as the node's content; ``attrs["alt"]`` and
+    the node's direct text children are only consulted as fallbacks.
+    """
+    return (
+        img.content
+        or img.attrs.get("alt")
+        or "".join(c.content for c in img.children if c.type == "text")
+    )
+
+
+def _render_inline(node: SyntaxTreeNode, marks: List[Dict]) -> List[Dict]:
+    """Render an inline subtree into a flat list of text / anchor nodes.
+
+    Args:
+        node: The node whose children are rendered.
+        marks: Marks inherited from enclosing nodes; extended by copy, never
+            mutated.
+
+    Returns:
+        Text and footnote-anchor nodes, with adjacent same-mark text merged.
+    """
+    out: List[Dict] = []
+    for child in node.children:
+        t = child.type
+        if t == "text":
+            if child.content:
+                out.append(nodes.text(child.content, marks))
+        elif t == "code_inline":
+            out.append(nodes.text(child.content, marks + [nodes.code_mark()]))
+        elif t in _MARK_FOR:
+            out.extend(_render_inline(child, marks + [_MARK_FOR[t]]))
+        elif t == "link":
+            href = child.attrs.get("href", "")
+            out.extend(_render_inline(child, marks + [nodes.link_mark(href)]))
+        elif t in ("softbreak", "hardbreak"):
+            # Both kinds of line break are flattened to a single space.
+            out.append(nodes.text(" ", marks))
+        elif t == "footnote_ref":
+            # meta["id"] counts from 0; footnote numbers count from 1.
+            out.append(nodes.footnote_anchor(child.meta["id"] + 1))
+        elif t == "image":
+            # Inline images are rare in this schema; fall back to alt text.
+            alt = _image_alt(child)
+            if alt:
+                out.append(nodes.text(alt, marks))
+        elif child.content:
+            # Unhandled inline token (e.g. html_inline): keep its source text
+            # rather than silently dropping part of the author's sentence.
+            out.append(nodes.text(child.content, marks))
+    return _coalesce(out)
+
+
+def _only_image(inline: SyntaxTreeNode) -> Optional[SyntaxTreeNode]:
+    """If an inline node is just an image (optionally wrapped in a link), return it.
+
+    The tree is left untouched; for a linked image the wrapping link is the
+    returned node's ``parent``.
+    """
+    kids = [c for c in inline.children if c.type != "softbreak"]
+    if len(kids) != 1:
+        return None
+    only = kids[0]
+    if only.type == "image":
+        return only
+    if only.type == "link":
+        inner = [c for c in only.children if c.type != "softbreak"]
+        if len(inner) == 1 and inner[0].type == "image":
+            return inner[0]
+    return None
+
+
+def _captioned_image(img: SyntaxTreeNode, api) -> Dict:
+    """Build a captioned-image node for a standalone Markdown image.
+
+    Args:
+        img: The image node. If its parent is a link node, the link target
+            becomes the image's href.
+        api: Optional client. When given, a source that is not an http(s) URL
+            is passed to ``api.get_image()`` and replaced by the returned URL.
+
+    Returns:
+        The node built by ``nodes.captioned_image``.
+
+    NOTE(review): the parser this replaces passed captionedImage dicts through
+    Post.add(), whereas from_markdown() now appends this node to the draft
+    body as-is -- confirm the flat {"type", "src", ...} shape is accepted.
+    """
+    src = img.attrs.get("src", "")
+    if src.startswith("/"):
+        src = src[1:]
+    if api is not None and not src.startswith(_REMOTE_PREFIXES):
+        try:
+            # Keep the original source when the response has no usable URL.
+            src = api.get_image(src).get("url") or src
+        except Exception:
+            # Best effort: a failed upload must not abort the conversion, but
+            # it should leave a trace.
+            logger.warning(
+                "Image upload failed for %r; keeping the original source",
+                src,
+                exc_info=True,
+            )
+    link = img.parent
+    href = None
+    if link is not None and link.type == "link":
+        href = link.attrs.get("href")
+    return nodes.captioned_image(src, alt=_image_alt(img) or None, href=href)
+
+
+def _render_block(node: SyntaxTreeNode, api) -> List[Dict]:
+    """Render a block-level node into zero or more Substack nodes.
+
+    Args:
+        node: A block node of the markdown-it syntax tree.
+        api: Optional client, forwarded to ``_captioned_image``.
+
+    Returns:
+        The rendered nodes; an empty list for blocks with nothing to render.
+    """
+    t = node.type
+
+    if t == "paragraph":
+        inline = node.children[0]
+        img = _only_image(inline)
+        if img is not None:
+            return [_captioned_image(img, api)]
+        return [nodes.paragraph(_render_inline(inline, []))]
+
+    if t == "heading":
+        level = int(node.tag[1])  # tag is "h1" .. "h6"
+        return [nodes.heading(_render_inline(node.children[0], []), level=level)]
+
+    if t == "hr":
+        return [nodes.horizontal_rule()]
+
+    if t in ("fence", "code_block"):
+        # Only the first word of the info string names the language, so
+        # "python title=demo" yields "python"; an empty one yields None.
+        info = node.info.split()
+        language = info[0] if info else None
+        return [nodes.code_block(node.content.rstrip("\n"), language=language)]
+
+    if t == "blockquote":
+        quoted: List[Dict] = []
+        for child in node.children:
+            quoted.extend(_render_block(child, api))
+        return [nodes.blockquote(quoted)]
+
+    if t == "bullet_list":
+        return [nodes.bullet_list(_render_list_items(node, api))]
+
+    if t == "ordered_list":
+        return [nodes.ordered_list(_render_list_items(node, api))]
+
+    if t == "footnote_block":
+        footnotes: List[Dict] = []
+        for fn in node.children:
+            # meta["id"] counts from 0; footnote numbers count from 1.
+            number = fn.meta["id"] + 1
+            # NOTE: only paragraph children are rendered; any other block
+            # inside a footnote definition is skipped.
+            paras = [
+                nodes.paragraph(_render_inline(child.children[0], []))
+                for child in fn.children
+                if child.type == "paragraph"
+            ]
+            footnotes.append(nodes.footnote(number, paras))
+        return footnotes
+
+    if node.content:
+        # Unhandled leaf block (e.g. html_block): keep its source text as a
+        # plain paragraph instead of dropping it.
+        return [nodes.paragraph([nodes.text(node.content.rstrip("\n"))])]
+    return []
+
+
+def _render_list_items(list_node: SyntaxTreeNode, api) -> List[Dict]:
+    """Render every item of a list node into a ``list_item`` node.
+
+    ``nodes.list_item`` wraps inline content in a single paragraph; items here
+    may already contain block nodes, so the dict is built directly.
+    """
+    items: List[Dict] = []
+    for li in list_node.children:
+        content: List[Dict] = []
+        for child in li.children:
+            content.extend(_render_block(child, api))
+        items.append({"type": NodeType.LIST_ITEM, "content": content})
+    return items
+
+
+def markdown_to_doc(markdown_content: str, api=None) -> List[Dict]:
+    """Convert Markdown into a list of Substack ProseMirror block nodes.
+
+    Args:
+        markdown_content: The Markdown source text.
+        api: Optional client, forwarded to image handling for uploads.
+
+    Returns:
+        The rendered top-level block nodes, in tree order.
+    """
+    tree = SyntaxTreeNode(_make_parser().parse(markdown_content))
+    out: List[Dict] = []
+    for node in tree.children:
+        out.extend(_render_block(node, api))
+    return out
diff --git a/substack/nodes.py b/substack/nodes.py
new file mode 100644
index 0000000..a0e2cfe
--- /dev/null
+++ b/substack/nodes.py
@@ -0,0 +1,121 @@
+"""ProseMirror node builders for Substack documents.
+
+PROTOTYPE: this module centralises the (undocumented) Substack ProseMirror
+schema in one place. Today the node-type strings ("paragraph", "footnoteAnchor",
+"image2", ...) and their shapes are scattered across post.py as inline dict
+literals. Pulling them here gives:
+
+  * one source of truth for node shapes (so a schema change is a one-line fix),
+  * discoverable, typed constructors instead of bare dict literals,
+  * a natural seam for validation.
+
+The builders intentionally return plain dicts so they stay 100% compatible with
+the existing draft_body structure.
+"""
+
+from __future__ import annotations
+
+from typing import Dict, List, Optional
+
+
+class NodeType:
+    DOC = "doc"
+    PARAGRAPH = "paragraph"
+    HEADING = "heading"
+    TEXT = "text"
+    BLOCKQUOTE = "blockquote"
+    CODE_BLOCK = "codeBlock"
+    HORIZONTAL_RULE = "horizontal_rule"
+    BULLET_LIST = "bullet_list"
+    ORDERED_LIST = "ordered_list"
+    LIST_ITEM = "list_item"
+    FOOTNOTE = "footnote"
+    FOOTNOTE_ANCHOR = "footnoteAnchor"
+    CAPTIONED_IMAGE = "captionedImage"
+
+
+class MarkType:
+    STRONG = "strong"
+    EM = "em"
+    CODE = "code"
+    STRIKETHROUGH = "strikethrough"
+    LINK = "link"
+
+
+def code_mark() -> Dict:
+    return {"type": MarkType.CODE}
+
+
+def text(value: str, marks: Optional[List[Dict]] = None) -> Dict:
+    node: Dict = {"type": NodeType.TEXT, "text": value}
+    if marks:
+        node["marks"] = list(marks)  # copy: callers pass one list for several nodes
+    return node
+
+
+def link_mark(href: str) -> Dict:
+    return {"type": MarkType.LINK, "attrs": {"href": href}}
+
+
+def paragraph(content: Optional[List[Dict]] = None) -> Dict:
+    return {"type": NodeType.PARAGRAPH, "content": content or []}
+
+
+def heading(content: List[Dict], level: int = 1) -> Dict:
+    return {"type": NodeType.HEADING, "content": content, "attrs": {"level": level}}
+
+
+def horizontal_rule() -> Dict:
+    return {"type": NodeType.HORIZONTAL_RULE}
+
+
+def blockquote(paragraphs: List[Dict]) -> Dict:
+    node: Dict = {"type": NodeType.BLOCKQUOTE}
+    if paragraphs:
+        node["content"] = paragraphs
+    return node
+
+
+def list_item(content_nodes: List[Dict]) -> Dict:
+    return {
+        "type": NodeType.LIST_ITEM,
+        "content": [paragraph(content_nodes)],
+    }
+
+
+def bullet_list(items: List[Dict]) -> Dict:
+    return {"type": NodeType.BULLET_LIST, "content": items}
+
+
+def ordered_list(items: List[Dict]) -> Dict:
+    return {"type": NodeType.ORDERED_LIST, "content": items}
+
+
+def code_block(code: str, language: Optional[str] = None) -> Dict:
+    node: Dict = {"type": NodeType.CODE_BLOCK, "content": [text(code)]}
+    if language:
+        node["attrs"] = {"language": language}
+    return node
+
+
+def captioned_image(
+    src: str, alt: Optional[str] = None, href: Optional[str] = None
+) -> Dict:
+    node: Dict = {"type": NodeType.CAPTIONED_IMAGE, "src": src}
+    if alt:
+        node["alt"] = alt
+    if href:
+        node["href"] = href
+    return node
+
+
+def footnote_anchor(number: int) -> Dict:
+    return {"type": NodeType.FOOTNOTE_ANCHOR, "attrs": {"number": number}}
+
+
+def footnote(number: int, paragraphs: List[Dict]) -> Dict:
+    return {
+        "type": NodeType.FOOTNOTE,
+        "attrs": {"number": number},
+        "content": paragraphs or [paragraph()],
+    }
diff --git a/substack/post.py b/substack/post.py
index 8a9d55d..0f2e50d 100644
--- a/substack/post.py
+++ b/substack/post.py
@@ -11,6 +11,7 @@
 __all__ = ["Post", "parse_inline", "tokens_to_text_nodes"]
 
 from substack.exceptions import SectionNotExistsException
+from substack import nodes
 
 
 def tokens_to_text_nodes(tokens: List[Dict]) -> List[Dict]:
@@ -543,6 +544,58 @@ def code_block(self, content, attrs=None):
 
         return self
 
+    def footnote_anchor(self, number: int):
+        """
+        Add an inline footnote reference (the superscript marker) to the last block.
+
+        Args:
+            number: The footnote number this anchor points to.
+
+        Returns:
+            Self for method chaining.
+
+        Raises:
+            ValueError: If the post has no block yet to attach the anchor to.
+        """
+        blocks = self.draft_body.get("content")
+        if not blocks:
+            raise ValueError("footnote_anchor() requires an existing block")
+        blocks[-1].setdefault("content", []).append(nodes.footnote_anchor(number))
+        return self
+
+    def footnote(self, number: int, content=None):
+        """
+        Append a footnote block (the note shown at the foot of the post).
+
+        Args:
+            number: The footnote number, matching a footnote_anchor.
+            content: Text string or list of inline token dicts. A plain string is
+                parsed for inline Markdown and may contain blank-line-separated
+                paragraphs; a parse_inline() token list or a list of ready text
+                nodes is also accepted (single paragraph).
+
+        Returns:
+            Self for method chaining.
+        """
+        paragraphs: List[Dict] = []
+        if isinstance(content, str):
+            # Blank lines separate paragraphs within the footnote.
+            for chunk in re.split(r"\n\s*\n", content):
+                chunk = chunk.strip()
+                if chunk:
+                    paragraphs.append(nodes.paragraph(tokens_to_text_nodes(parse_inline(chunk))))
+        elif isinstance(content, list):
+            # Accept either parse_inline tokens ({"content": ...}) or text nodes.
+            if content and content[0].get("type") == "text":
+                text_nodes = content
+            else:
+                text_nodes = tokens_to_text_nodes(content)
+            paragraphs.append(nodes.paragraph(text_nodes))
+
+        node: Dict = nodes.footnote(number, paragraphs)
+        self.draft_body["content"] = self.draft_body.get("content", []) + [node]
+        return self
+
     def from_markdown(self, markdown_content: str, api=None):
         """
         Parse Markdown content and add it to the post.
@@ -559,6 +612,7 @@ def from_markdown(self, markdown_content: str, api=None):
         - Ordered lists: Lines starting with '1.', '2.', etc.
         - Horizontal rules: Lines with ---, ***, or ___
         - Inline formatting: **bold**, *italic*, ***bold+italic***, `code`, ~~strikethrough~~
+        - Footnotes: ``text.[^label]`` references and ``[^label]: definition`` lines
 
         Args:
             markdown_content: Markdown string to parse and add to the post.
@@ -572,276 +626,8 @@ def from_markdown(self, markdown_content: str, api=None): >>> post = Post("Title", "Subtitle", user_id) >>> post.from_markdown("# Heading\\n\\nThis is **bold** text with [a link](https://example.com).") """ - lines = markdown_content.split("\n") - blocks = [] - current_block: List[str] = [] - in_code_block = False - code_block_language = None - - for line in lines: - # Check for fenced code block start/end - if line.strip().startswith("```"): - if in_code_block: - # End of code block - if current_block: - blocks.append({ - "type": "code", - "language": code_block_language, - "content": "\n".join(current_block) - }) - current_block = [] - in_code_block = False - code_block_language = None - else: - # Start of code block - if current_block: - blocks.append({"type": "text", "content": "\n".join(current_block)}) - current_block = [] - # Extract language if specified - language = line.strip()[3:].strip() - code_block_language = language if language else None - in_code_block = True - continue - - if in_code_block: - # Inside code block - collect lines as-is - current_block.append(line) - else: - # Regular content - if line.strip() == "": - # Empty line - end current block if it has content - if current_block: - blocks.append({"type": "text", "content": "\n".join(current_block)}) - current_block = [] - else: - current_block.append(line) - - # Add any remaining content - if current_block: - if in_code_block: - blocks.append({ - "type": "code", - "language": code_block_language, - "content": "\n".join(current_block) - }) - else: - blocks.append({"type": "text", "content": "\n".join(current_block)}) - - # Process blocks - for block in blocks: - if block["type"] == "code": - # Add code block - code_content = block.get("content", "").strip() - if code_content: - # Substack uses "codeBlock" type - code_attrs = {} - if block.get("language"): - code_attrs["language"] = block["language"] - self.add({ - "type": "codeBlock", - "content": code_content, # Pass as 
string, code_block method will handle it - "attrs": code_attrs - }) - else: - # Process text block - text_content = block.get("content", "").strip() - if not text_content: - continue - - # Check for horizontal rule: ---, ***, ___ - if re.match(r'^(\*{3,}|-{3,}|_{3,})\s*$', text_content): - self.horizontal_rule() - continue - - # Process headings (lines starting with '#' characters) - if text_content.startswith("#"): - level = len(text_content) - len(text_content.lstrip("#")) - heading_text = text_content.lstrip("#").strip() - if heading_text: # Only add if there's actual text - self.heading(content=heading_text, level=min(level, 6)) - - # Process images using Markdown image syntax: ![Alt](URL) - # Also handle linked images: [![Alt](image_url)](link_url) - elif text_content.startswith("!") or (text_content.startswith("[") and "![" in text_content): - # Check for linked image first: [![alt](img)](link) - linked_image_match = re.match(r'\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)', text_content) - if linked_image_match: - # Linked image - create image with href - alt_text = linked_image_match.group(1) - image_url = linked_image_match.group(2) - link_url = linked_image_match.group(3) - - # Adjust image URL if it starts with a slash - image_url = image_url[1:] if image_url.startswith("/") else image_url - - # If api is provided and image_url is a local file, upload it - if api is not None: - try: - image = api.get_image(image_url) - image_url = image.get("url") - except Exception: - # If upload fails, use original URL - pass - - self.add({ - "type": "captionedImage", - "src": image_url, - "alt": alt_text, - "href": link_url - }) - else: - # Regular image: ![Alt](URL) - match = re.match(r"!\[.*?\]\((.*?)\)", text_content) - if match: - image_url = match.group(1) - # Adjust image URL if it starts with a slash - image_url = image_url[1:] if image_url.startswith("/") else image_url - - # If api is provided and image_url is a local file, upload it - if api is not None: - try: - 
image = api.get_image(image_url) - image_url = image.get("url") - except Exception: - # If upload fails, use original URL - pass - - self.add({"type": "captionedImage", "src": image_url}) - - # Process paragraphs, bullet lists, ordered lists, or blockquotes - else: - if "\n" in text_content: - # Process each line, grouping consecutive bullets/ordered items - # into list nodes and consecutive blockquote lines into a - # single blockquote node. - pending_bullets: List[List[Dict]] = [] - pending_quotes: List[str] = [] - pending_ordered: List[List[Dict]] = [] - - def flush_bullets(): - if not pending_bullets: - return - list_items = [] - for bullet_nodes in pending_bullets: - list_items.append({ - "type": "list_item", - "content": [{"type": "paragraph", "content": bullet_nodes}], - }) - self.draft_body["content"].append( - {"type": "bullet_list", "content": list_items} - ) - pending_bullets.clear() - - def flush_quotes(): - if not pending_quotes: - return - paragraphs: List[Dict] = [] - for quote_line in pending_quotes: - tokens = parse_inline(quote_line) - text_nodes = tokens_to_text_nodes(tokens) - if text_nodes: - paragraphs.append({"type": "paragraph", "content": text_nodes}) - node: Dict = {"type": "blockquote"} - if paragraphs: - node["content"] = paragraphs - self.draft_body["content"].append(node) - pending_quotes.clear() - - def flush_ordered(): - if not pending_ordered: - return - list_items = [] - for item_nodes in pending_ordered: - list_items.append({ - "type": "list_item", - "content": [{"type": "paragraph", "content": item_nodes}], - }) - self.draft_body["content"].append( - {"type": "ordered_list", "content": list_items} - ) - pending_ordered.clear() - - for line in text_content.split("\n"): - line = line.strip() - if not line: - flush_bullets() - flush_ordered() - flush_quotes() - continue - - # Check for blockquote marker - if line.startswith("> ") or line == ">": - flush_bullets() - flush_ordered() - quote_text = line[2:] if line.startswith("> ") 
else "" - pending_quotes.append(quote_text) - continue - - # Check for ordered list marker - ordered_match = re.match(r'^(\d+)\.\s+(.*)', line) - if ordered_match: - flush_bullets() - flush_quotes() - item_text = ordered_match.group(2).strip() - tokens = parse_inline(item_text) - text_nodes = tokens_to_text_nodes(tokens) - if text_nodes: - pending_ordered.append(text_nodes) - continue - - # Check for bullet marker - bullet_text = None - if line.startswith("* "): - bullet_text = line[2:].strip() - elif line.startswith("- "): - bullet_text = line[2:].strip() - elif line.startswith("*") and not line.startswith("**"): - bullet_text = line[1:].strip() - - if bullet_text is not None: - flush_ordered() - flush_quotes() - tokens = parse_inline(bullet_text) - text_nodes = tokens_to_text_nodes(tokens) - if text_nodes: - pending_bullets.append(text_nodes) - else: - flush_bullets() - flush_ordered() - flush_quotes() - tokens = parse_inline(line) - self.add({"type": "paragraph", "content": tokens}) - - flush_bullets() - flush_ordered() - flush_quotes() - else: - # Single line — blockquote, ordered list, or paragraph - if text_content.startswith("> ") or text_content == ">": - quote_text = text_content[2:] if text_content.startswith("> ") else "" - tokens = parse_inline(quote_text) - text_nodes = tokens_to_text_nodes(tokens) - para = {"type": "paragraph", "content": text_nodes} if text_nodes else {"type": "paragraph"} - self.draft_body["content"] = self.draft_body.get("content", []) + [ - {"type": "blockquote", "content": [para]} - ] - - elif re.match(r'^(\d+)\.\s+(.*)', text_content): - ordered_match = re.match(r'^(\d+)\.\s+(.*)', text_content) - item_text = ordered_match.group(2).strip() - tokens = parse_inline(item_text) - text_nodes = tokens_to_text_nodes(tokens) - if text_nodes: - list_item = { - "type": "list_item", - "content": [{"type": "paragraph", "content": text_nodes}], - } - self.draft_body["content"].append( - {"type": "ordered_list", "content": [list_item]} - ) - 
- else: - tokens = parse_inline(text_content) - self.add({"type": "paragraph", "content": tokens}) + from substack import mdrender + rendered = mdrender.markdown_to_doc(markdown_content, api=api) + self.draft_body["content"] = self.draft_body.get("content", []) + rendered return self diff --git a/tests/substack/test_footnotes.py b/tests/substack/test_footnotes.py new file mode 100644 index 0000000..8cde174 --- /dev/null +++ b/tests/substack/test_footnotes.py @@ -0,0 +1,253 @@ +"""Tests for Markdown footnote support in post.py.""" + +from substack.post import Post + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def make_post(): + """Create a fresh Post instance for testing.""" + return Post(title="Test", subtitle="Sub", user_id=1) + + +def body_content(post): + """Return the content list from the post's draft body.""" + return post.draft_body["content"] + + +def find_nodes(node, node_type, acc=None): + """Recursively collect every node of a given type from a doc tree.""" + if acc is None: + acc = [] + if isinstance(node, dict): + if node.get("type") == node_type: + acc.append(node) + for value in node.values(): + find_nodes(value, node_type, acc) + elif isinstance(node, list): + for value in node: + find_nodes(value, node_type, acc) + return acc + + +def anchors(post): + return find_nodes(post.draft_body, "footnoteAnchor") + + +def footnotes(post): + return find_nodes(post.draft_body, "footnote") + + +# --------------------------------------------------------------------------- +# TestFootnoteHelpers +# --------------------------------------------------------------------------- + +class TestFootnoteHelpers: + def test_footnote_anchor_added_inline(self): + post = make_post() + post.paragraph(content=[{"content": "See here."}]) + post.footnote_anchor(1) + para = body_content(post)[0] + assert para["content"][-1] == {"type": "footnoteAnchor", 
"attrs": {"number": 1}} + + def test_footnote_block_from_string(self): + post = make_post() + post.footnote(1, "A simple note.") + block = body_content(post)[-1] + assert block["type"] == "footnote" + assert block["attrs"] == {"number": 1} + assert block["content"][0]["type"] == "paragraph" + assert block["content"][0]["content"][0]["text"] == "A simple note." + + def test_footnote_block_parses_inline_markdown(self): + post = make_post() + post.footnote(2, "See [the source](https://example.com).") + block = footnotes(post)[0] + text_nodes = block["content"][0]["content"] + link_node = next(n for n in text_nodes if n.get("marks")) + assert link_node["text"] == "the source" + assert link_node["marks"] == [{"type": "link", "attrs": {"href": "https://example.com"}}] + + +# --------------------------------------------------------------------------- +# TestFromMarkdownFootnotes +# --------------------------------------------------------------------------- + +class TestFromMarkdownFootnotes: + def test_basic_reference_and_definition(self): + post = make_post() + post.from_markdown("A claim.[^1]\n\n[^1]: The supporting detail.") + assert len(anchors(post)) == 1 + assert anchors(post)[0]["attrs"]["number"] == 1 + blocks = footnotes(post) + assert len(blocks) == 1 + assert blocks[0]["attrs"]["number"] == 1 + assert blocks[0]["content"][0]["content"][0]["text"] == "The supporting detail." + + def test_definition_removed_from_body(self): + post = make_post() + post.from_markdown("A claim.[^1]\n\n[^1]: The note.") + # The definition line must not leak into a paragraph. 
+ paragraphs = find_nodes(post.draft_body, "paragraph") + body_text = " ".join( + n.get("text", "") + for p in paragraphs + for n in p.get("content", []) + ) + assert "[^1]:" not in body_text + + def test_anchor_injected_mid_sentence(self): + post = make_post() + post.from_markdown("Before[^1] and after.\n\n[^1]: Note.") + para = find_nodes(post.draft_body, "paragraph")[0] + types = [c["type"] for c in para["content"]] + assert types == ["text", "footnoteAnchor", "text"] + assert para["content"][0]["text"] == "Before" + assert para["content"][2]["text"] == " and after." + + def test_named_labels_numbered_by_first_appearance(self): + post = make_post() + md = ( + "First[^book] then second[^study].\n\n" + "[^study]: Second definition.\n" + "[^book]: First definition.\n" + ) + post.from_markdown(md) + nums = [a["attrs"]["number"] for a in anchors(post)] + assert nums == [1, 2] # order of reference, not of definition + blocks = sorted(footnotes(post), key=lambda b: b["attrs"]["number"]) + assert blocks[0]["content"][0]["content"][0]["text"] == "First definition." + assert blocks[1]["content"][0]["content"][0]["text"] == "Second definition." + + def test_repeated_reference_reuses_number(self): + post = make_post() + post.from_markdown("One[^a] two[^a].\n\n[^a]: Note.") + nums = [a["attrs"]["number"] for a in anchors(post)] + assert nums == [1, 1] + assert len(footnotes(post)) == 1 + + def test_link_inside_definition_preserved(self): + post = make_post() + post.from_markdown("Claim.[^1]\n\n[^1]: See [docs](https://example.com).") + block = footnotes(post)[0] + link_node = next( + n for n in block["content"][0]["content"] if n.get("marks") + ) + assert link_node["marks"][0]["attrs"]["href"] == "https://example.com" + + def test_multiline_definition(self): + post = make_post() + md = "Claim.[^1]\n\n[^1]: First line\n continued on the next line." 
+ post.from_markdown(md) + text = footnotes(post)[0]["content"][0]["content"][0]["text"] + assert text == "First line continued on the next line." + + def test_unreferenced_definition_is_dropped(self): + # CommonMark footnote semantics: a definition that is never referenced is + # not rendered, and must not leak into the body text. + post = make_post() + post.from_markdown("No references here.\n\n[^1]: Orphan note.") + assert len(anchors(post)) == 0 + assert len(footnotes(post)) == 0 + paragraphs = find_nodes(post.draft_body, "paragraph") + body_text = " ".join( + n.get("text", "") for para in paragraphs for n in para.get("content", []) + ) + assert "Orphan note" not in body_text + + def test_reference_without_definition_left_as_text(self): + post = make_post() + post.from_markdown("A dangling[^missing] reference.") + assert len(anchors(post)) == 0 + assert len(footnotes(post)) == 0 + para = find_nodes(post.draft_body, "paragraph")[0] + assert "[^missing]" in para["content"][0]["text"] + + def test_definition_in_middle_moves_to_end(self): + post = make_post() + md = ( + "First paragraph.[^1]\n\n" + "[^1]: First footnote.\n\n" + "Second paragraph." + ) + post.from_markdown(md) + + types = [node["type"] for node in body_content(post)] + # Both paragraphs come first; the footnote block is last regardless of + # where the definition appeared in the source. + assert types == ["paragraph", "paragraph", "footnote"] + + paragraphs = find_nodes(post.draft_body, "paragraph") + assert paragraphs[0]["content"][0]["text"] == "First paragraph." + # The definition line did not become a paragraph in the body. + assert paragraphs[1]["content"][0]["text"] == "Second paragraph." + + assert len(anchors(post)) == 1 + block = footnotes(post)[0] + assert block["content"][0]["content"][0]["text"] == "First footnote." 
+ + def test_footnote_definition_inside_fenced_code_stays_code(self): + post = make_post() + post.from_markdown("```\n[^1]: not a footnote\n```") + content = body_content(post) + assert len(content) == 1 + assert content[0]["type"] == "codeBlock" + assert content[0]["content"][0]["text"] == "[^1]: not a footnote" + + def test_footnote_reference_inside_fenced_code_stays_text(self): + post = make_post() + post.from_markdown("```\ncode [^1]\n```\n\n[^1]: note") + content = body_content(post) + assert content[0]["type"] == "codeBlock" + assert content[0]["content"][0]["text"] == "code [^1]" + + def test_footnote_reference_inside_inline_code_stays_text(self): + post = make_post() + post.from_markdown("`code [^1]`\n\n[^1]: note") + content = body_content(post) + assert content[0]["type"] == "paragraph" + assert content[0]["content"][0]["text"] == "code [^1]" + assert content[0]["content"][0]["marks"] == [{"type": "code"}] + + def test_multiparagraph_definition(self): + post = make_post() + md = "Claim.[^1]\n\n[^1]: First para.\n\n Second para." + post.from_markdown(md) + # The second paragraph must stay in the footnote, not leak into the body. + assert [n["type"] for n in body_content(post)] == ["paragraph", "footnote"] + block = footnotes(post)[0] + assert len(block["content"]) == 2 + assert block["content"][0]["content"][0]["text"] == "First para." + assert block["content"][1]["content"][0]["text"] == "Second para." + + def test_multiparagraph_definition_in_middle(self): + post = make_post() + md = ( + "First.[^1]\n\n" + "[^1]: Note para one.\n\n" + " Note para two.\n\n" + "Back to the body." + ) + post.from_markdown(md) + types = [n["type"] for n in body_content(post)] + assert types == ["paragraph", "paragraph", "footnote"] + assert body_content(post)[1]["content"][0]["text"] == "Back to the body." 
+ assert len(footnotes(post)[0]["content"]) == 2 + + def test_footnote_helper_splits_paragraphs(self): + post = make_post() + post.footnote(1, "Para one.\n\nPara two.") + block = footnotes(post)[0] + assert len(block["content"]) == 2 + assert block["content"][1]["content"][0]["text"] == "Para two." + + def test_no_footnotes_is_unchanged(self): + post = make_post() + post.from_markdown("Just a plain paragraph.") + assert len(anchors(post)) == 0 + assert len(footnotes(post)) == 0 + assert find_nodes(post.draft_body, "paragraph")[0]["content"][0]["text"] == ( + "Just a plain paragraph." + ) diff --git a/tests/substack/test_from_markdown_features.py b/tests/substack/test_from_markdown_features.py new file mode 100644 index 0000000..a52ed3f --- /dev/null +++ b/tests/substack/test_from_markdown_features.py @@ -0,0 +1,145 @@ +"""End-to-end coverage of every feature listed in Post.from_markdown(). + +These exercise the renderer through from_markdown() (as opposed to the +parse_inline() unit tests), so they cover the actual Markdown -> Substack path. 
+""" + +from substack.post import Post + + +def make_post(): + return Post(title="T", subtitle="S", user_id=1) + + +def body(post): + return post.draft_body["content"] + + +def first_para_nodes(post): + return body(post)[0]["content"] + + +def marked(nodes, text): + """Return the marks on the text node with the given text.""" + node = next(n for n in nodes if n.get("text") == text) + return node.get("marks", []) + + +class TestInlineFormatting: + def test_bold(self): + post = make_post() + post.from_markdown("x **b** y") + assert {"type": "strong"} in marked(first_para_nodes(post), "b") + + def test_italic(self): + post = make_post() + post.from_markdown("x *i* y") + assert {"type": "em"} in marked(first_para_nodes(post), "i") + + def test_bold_italic(self): + post = make_post() + post.from_markdown("***bi***") + marks = marked(first_para_nodes(post), "bi") + assert {"type": "strong"} in marks + assert {"type": "em"} in marks + + def test_inline_code(self): + post = make_post() + post.from_markdown("use `code` now") + assert marked(first_para_nodes(post), "code") == [{"type": "code"}] + + def test_strikethrough(self): + post = make_post() + post.from_markdown("a ~~s~~ b") + assert marked(first_para_nodes(post), "s") == [{"type": "strikethrough"}] + + def test_link(self): + post = make_post() + post.from_markdown("[t](https://e.com)") + assert marked(first_para_nodes(post), "t") == [ + {"type": "link", "attrs": {"href": "https://e.com"}} + ] + + def test_multiple_marks_in_one_paragraph(self): + post = make_post() + post.from_markdown("**b** and *i* and `c` and [l](https://e.com)") + nodes = first_para_nodes(post) + assert {"type": "strong"} in marked(nodes, "b") + assert {"type": "em"} in marked(nodes, "i") + assert marked(nodes, "c") == [{"type": "code"}] + assert marked(nodes, "l")[0]["type"] == "link" + + +class TestBlocks: + def test_all_heading_levels(self): + for level in range(1, 7): + post = make_post() + post.from_markdown("#" * level + " Heading") + block 
= body(post)[0] + assert block["type"] == "heading" + assert block["attrs"]["level"] == level + + def test_paragraph(self): + post = make_post() + post.from_markdown("Just a plain paragraph.") + block = body(post)[0] + assert block["type"] == "paragraph" + assert block["content"][0]["text"] == "Just a plain paragraph." + + def test_bullet_list(self): + post = make_post() + post.from_markdown("- a\n- b") + block = body(post)[0] + assert block["type"] == "bullet_list" + assert len(block["content"]) == 2 + assert block["content"][0]["type"] == "list_item" + + def test_ordered_list(self): + post = make_post() + post.from_markdown("1. a\n2. b") + block = body(post)[0] + assert block["type"] == "ordered_list" + assert len(block["content"]) == 2 + + def test_code_block_with_language(self): + post = make_post() + post.from_markdown("```python\nprint('hi')\n```") + block = body(post)[0] + assert block["type"] == "codeBlock" + assert block["attrs"]["language"] == "python" + assert block["content"][0]["text"] == "print('hi')" + + def test_code_block_without_language(self): + post = make_post() + post.from_markdown("```\nplain\n```") + block = body(post)[0] + assert block["type"] == "codeBlock" + assert "attrs" not in block or "language" not in block.get("attrs", {}) + + def test_horizontal_rule(self): + post = make_post() + post.from_markdown("a\n\n---\n\nb") + assert [n["type"] for n in body(post)] == ["paragraph", "horizontal_rule", "paragraph"] + + def test_blockquote(self): + post = make_post() + post.from_markdown("> quote") + block = body(post)[0] + assert block["type"] == "blockquote" + assert block["content"][0]["type"] == "paragraph" + + def test_image(self): + post = make_post() + post.from_markdown("![alt](https://example.com/img.png)") + block = body(post)[0] + assert block["type"] == "captionedImage" + assert block["src"] == "https://example.com/img.png" + assert block["alt"] == "alt" + + def test_linked_image(self): + post = make_post() + 
post.from_markdown("[![alt](https://i/x.png)](https://link)") + block = body(post)[0] + assert block["type"] == "captionedImage" + assert block["src"] == "https://i/x.png" + assert block["href"] == "https://link" diff --git a/tests/substack/test_post.py b/tests/substack/test_post.py index 701c2a2..c619d88 100644 --- a/tests/substack/test_post.py +++ b/tests/substack/test_post.py @@ -97,15 +97,14 @@ def test_single_blockquote_line(self): assert bq["content"][0]["content"][0]["text"] == "This is a quote" def test_multiline_blockquote_grouped(self): - """Consecutive '>' lines become a single blockquote with multiple paragraphs.""" + """Consecutive '>' lines are one paragraph (CommonMark); blank '>' lines split them.""" post = Post(title="T", subtitle="S", user_id=1) post.from_markdown("> Line one\n> Line two\n> Line three") body = json.loads(post.get_draft()["draft_body"]) bq = body["content"][0] assert bq["type"] == "blockquote" - assert len(bq["content"]) == 3 - texts = [p["content"][0]["text"] for p in bq["content"]] - assert texts == ["Line one", "Line two", "Line three"] + assert len(bq["content"]) == 1 + assert bq["content"][0]["content"][0]["text"] == "Line one Line two Line three" def test_blockquote_separated_by_blank_line(self): """A blank line between '>' groups creates two separate blockquotes."""