From 881808eb73c749a82fe278b96b4fcb96e7f60abf Mon Sep 17 00:00:00 2001 From: Matthias Splieth Date: Fri, 27 Mar 2026 13:15:57 +0100 Subject: [PATCH 1/2] Include draw.io diagrams in exports --- src/confluence2md/cli.py | 8 +- src/confluence2md/client.py | 40 +++++++++ src/confluence2md/renderer.py | 124 +++++++++++++++++++++++-- tests/test_renderer.py | 165 +++++++++++++++++++++++++++++++++- uv.lock | 2 +- 5 files changed, 326 insertions(+), 13 deletions(-) diff --git a/src/confluence2md/cli.py b/src/confluence2md/cli.py index 6b41400..e2d30e0 100644 --- a/src/confluence2md/cli.py +++ b/src/confluence2md/cli.py @@ -80,14 +80,14 @@ def _handle_export(args: argparse.Namespace, config: Config) -> None: if args.page_id: page = fetch_page(confluence, args.page_id) - path = export_page(page, config.output) + path = export_page(page, config.output, confluence) print(f"Exported: {path}") include_children = args.include_children or config.output.include_children if include_children: children = fetch_child_pages(confluence, args.page_id) if children: - child_paths = export_pages(children, config.output) + child_paths = export_pages(children, config.output, confluence) for p in child_paths: print(f"Exported: {p}") total = 1 + len(child_paths) @@ -99,7 +99,7 @@ def _handle_export(args: argparse.Namespace, config: Config) -> None: if not pages: print("No pages found.") return - paths = export_pages(pages, config.output) + paths = export_pages(pages, config.output, confluence) for p in paths: print(f"Exported: {p}") print(f"\n{len(paths)} page(s) exported to {config.output.directory}") @@ -108,7 +108,7 @@ def _handle_export(args: argparse.Namespace, config: Config) -> None: if not pages: print("No pages found.") return - paths = export_pages(pages, config.output) + paths = export_pages(pages, config.output, confluence) for p in paths: print(f"Exported: {p}") print(f"\n{len(paths)} page(s) exported to {config.output.directory}") diff --git a/src/confluence2md/client.py b/src/confluence2md/client.py index 2fa6b8b..6226bb7 100644 --- a/src/confluence2md/client.py +++ b/src/confluence2md/client.py @@ -1,4 +1,5 @@ from dataclasses import dataclass, field +from pathlib import Path from typing import Any from atlassian import Confluence @@ -18,6 +19,14 @@ class Page: parent_title: str = "" +@dataclass +class Attachment: + id: str + title: str + media_type: str + download_url: str + + def connect(config: ConfluenceConfig) -> Confluence: """Create an authenticated Confluence client.""" if config.username: @@ -151,3 +160,34 @@ def _build_page_url(raw: dict[str, Any], confluence: Confluence) -> str: base = links.get("base", confluence.url.rstrip("/")) return f"{base}{webui}" return "" + + +def fetch_attachments(confluence: Confluence, page_id: str) -> list[Attachment]: + """Fetch all attachments for a page.""" + results = confluence.get_attachments_from_content(page_id) # type: ignore[no-untyped-call] + attachments = [] + for item in results.get("results", []): + download_url = item.get("_links", {}).get("download", "") + attachments.append( + Attachment( + id=str(item.get("id", "")), + title=item.get("title", ""), + media_type=item.get("metadata", {}).get("mediaType", ""), + download_url=download_url, + ) + ) + return attachments + + +def download_attachment( + confluence: Confluence, attachment: Attachment, dest: Path +) -> Path: + """Download an attachment to the given directory. Returns the file path.""" + dest.mkdir(parents=True, exist_ok=True) + filepath = dest / attachment.title + response = confluence.request( + method="GET", + path=attachment.download_url, + ) + filepath.write_bytes(response.content) + return filepath diff --git a/src/confluence2md/renderer.py b/src/confluence2md/renderer.py index 3870cbb..dd65f60 100644 --- a/src/confluence2md/renderer.py +++ b/src/confluence2md/renderer.py @@ -1,9 +1,16 @@ import re from pathlib import Path +from typing import Optional +from atlassian import Confluence from markdownify import markdownify -from .client import Page +from .client import ( + Attachment, + Page, + download_attachment, + fetch_attachments, +) from .config import OutputConfig @@ -42,20 +49,46 @@ def render_page(page: Page, config: OutputConfig) -> str: return "\n".join(lines) -def export_page(page: Page, config: OutputConfig) -> Path: +def export_page( + page: Page, + config: OutputConfig, + confluence: Optional[Confluence] = None, +) -> Path: """Export a single page to a Markdown file. Returns the file path.""" - content = render_page(page, config) - filename = _safe_filename(config.filename_pattern.format(title=page.title)) + ".md" output_dir = Path(config.directory) output_dir.mkdir(parents=True, exist_ok=True) + + body = page.body + if confluence: + body = _process_drawio_macros(body, page, output_dir, confluence) + + content = render_page( + Page( + id=page.id, + title=page.title, + space_key=page.space_key, + body=body, + labels=page.labels, + url=page.url, + version=page.version, + parent_title=page.parent_title, + ), + config, + ) + + filename = _safe_filename(config.filename_pattern.format(title=page.title)) + ".md" filepath = output_dir / filename filepath.write_text(content, encoding="utf-8") return filepath -def export_pages(pages: list[Page], config: OutputConfig) -> list[Path]: +def export_pages( + pages: list[Page], + config: OutputConfig, + confluence: Optional[Confluence] = None, +) -> list[Path]: """Export multiple pages to Markdown files.""" - return [export_page(page, config) for page in pages] + return [export_page(page, config, confluence) for page in pages] def _convert_body(html: str) -> str: @@ -63,6 +96,85 @@ def _convert_body(html: str) -> str: return markdownify(html, heading_style="ATX", strip=["style"]) +def _extract_drawio_diagram_names(html: str) -> list[str]: + """Extract draw.io diagram names from Confluence storage format HTML.""" + names: list[str] = [] + # Confluence macros use ac: namespace prefixes which aren't valid XML + # without namespace declarations, so we use regex to extract them. + macro_pattern = re.compile( + r']*ac:name=["\']drawio["\'][^>]*>' + r"(.*?)", + re.DOTALL, + ) + param_pattern = re.compile( + r']*ac:name=["\']diagramName["\'][^>]*>' + r"(.*?)", + re.DOTALL, + ) + for macro_match in macro_pattern.finditer(html): + macro_body = macro_match.group(1) + param_match = param_pattern.search(macro_body) + if param_match: + names.append(param_match.group(1).strip()) + return names + + +def _find_drawio_png( + diagram_name: str, attachments: list[Attachment] +) -> Optional[Attachment]: + """Find the PNG attachment for a draw.io diagram.""" + # draw.io stores previews with various naming conventions + candidates = [ + f"{diagram_name}.png", + f"{diagram_name}.drawio.png", + ] + for att in attachments: + if att.title in candidates: + return att + return None + + +def _process_drawio_macros( + html: str, + page: Page, + output_dir: Path, + confluence: Confluence, +) -> str: + """Replace draw.io macros with image references and download PNGs.""" + diagram_names = _extract_drawio_diagram_names(html) + if not diagram_names: + return html + + attachments = fetch_attachments(confluence, page.id) + + macro_pattern = re.compile( + r']*ac:name=["\']drawio["\'][^>]*>' + r"(.*?)", + re.DOTALL, + ) + param_pattern = re.compile( + r']*ac:name=["\']diagramName["\'][^>]*>' + r"(.*?)", + re.DOTALL, + ) + + def _replace_macro(match: re.Match[str]) -> str: + macro_body = match.group(1) + param_match = param_pattern.search(macro_body) + if not param_match: + return match.group(0) + + diagram_name = param_match.group(1).strip() + png_attachment = _find_drawio_png(diagram_name, attachments) + if not png_attachment: + return match.group(0) + + download_attachment(confluence, png_attachment, output_dir) + return f'{diagram_name}' + + return macro_pattern.sub(_replace_macro, html) + + def _safe_filename(name: str) -> str: """Sanitize a string for use as a filename.""" # Replace characters that are problematic in filenames diff --git a/tests/test_renderer.py b/tests/test_renderer.py index 9223a15..efa0b3a 100644 --- a/tests/test_renderer.py +++ b/tests/test_renderer.py @@ -1,6 +1,15 @@ -from confluence2md.client import Page +from unittest.mock import MagicMock, patch + +from confluence2md.client import Attachment, Page from confluence2md.config import OutputConfig -from confluence2md.renderer import _safe_filename, render_page +from confluence2md.renderer import ( + _extract_drawio_diagram_names, + _find_drawio_png, + _process_drawio_macros, + _safe_filename, + export_page, + render_page, +) def test_render_basic_page(): @@ -148,3 +157,155 @@ def test_render_page_labels_hidden(): # then assert "| Labels |" not in md assert "| Labels | draft |" not in md + + +# --- draw.io support --- + +DRAWIO_MACRO = ( + '' + 'Architecture' + '800' + "" +) + +DRAWIO_MACRO_TWO = ( + '' + 'Flow' + "" +) + + +def test_extract_drawio_diagram_names_single(): + html = f"

Before

{DRAWIO_MACRO}

After

" + assert _extract_drawio_diagram_names(html) == ["Architecture"] + + +def test_extract_drawio_diagram_names_multiple(): + html = f"{DRAWIO_MACRO}

text

{DRAWIO_MACRO_TWO}" + assert _extract_drawio_diagram_names(html) == ["Architecture", "Flow"] + + +def test_extract_drawio_diagram_names_none(): + html = "

No diagrams here

" + assert _extract_drawio_diagram_names(html) == [] + + +def test_find_drawio_png_direct_match(): + attachments = [ + Attachment( + id="1", + title="Architecture.png", + media_type="image/png", + download_url="/download/1", + ), + Attachment( + id="2", + title="other.pdf", + media_type="application/pdf", + download_url="/download/2", + ), + ] + result = _find_drawio_png("Architecture", attachments) + assert result is not None + assert result.title == "Architecture.png" + + +def test_find_drawio_png_drawio_suffix(): + attachments = [ + Attachment( + id="1", + title="Architecture.drawio.png", + media_type="image/png", + download_url="/download/1", + ), + ] + result = _find_drawio_png("Architecture", attachments) + assert result is not None + assert result.title == "Architecture.drawio.png" + + +def test_find_drawio_png_no_match(): + attachments = [ + Attachment( + id="1", + title="unrelated.png", + media_type="image/png", + download_url="/download/1", + ), + ] + result = _find_drawio_png("Architecture", attachments) + assert result is None + + +@patch("confluence2md.renderer.download_attachment") +@patch("confluence2md.renderer.fetch_attachments") +def test_process_drawio_macros(mock_fetch, mock_download): + # given + mock_fetch.return_value = [ + Attachment( + id="1", + title="Architecture.png", + media_type="image/png", + download_url="/download/1", + ), + ] + page = Page(id="100", title="Test", space_key="DEV") + html = f"

Before

{DRAWIO_MACRO}

After

" + confluence = MagicMock() + + # when + result = _process_drawio_macros(html, page, MagicMock(), confluence) + + # then + assert 'Architecture' in result + assert "Before

" in result + assert "

After

" in result + mock_download.assert_called_once() + + +@patch("confluence2md.renderer.download_attachment") +@patch("confluence2md.renderer.fetch_attachments") +def test_process_drawio_macros_no_png_leaves_macro(mock_fetch, mock_download): + # given + mock_fetch.return_value = [] + page = Page(id="100", title="Test", space_key="DEV") + html = f"

Before

{DRAWIO_MACRO}

After

" + confluence = MagicMock() + + # when + result = _process_drawio_macros(html, page, MagicMock(), confluence) + + # then + assert "Intro

{DRAWIO_MACRO}", + ) + config = OutputConfig(directory=str(tmp_path), include_metadata=False) + confluence = MagicMock() + + # when + path = export_page(page, config, confluence) + + # then + content = path.read_text() + assert "![Architecture](Architecture.png)" in content + mock_download.assert_called_once() diff --git a/uv.lock b/uv.lock index 0892441..cfead97 100644 --- a/uv.lock +++ b/uv.lock @@ -126,7 +126,7 @@ wheels = [ [[package]] name = "confluence2md" -version = "0.1.0" +version = "0.2.0" source = { editable = "." } dependencies = [ { name = "atlassian-python-api" }, From a5bda6554c04b6d6223e889ea6a6e1fd850f9e60 Mon Sep 17 00:00:00 2001 From: Matthias Splieth Date: Fri, 27 Mar 2026 13:17:24 +0100 Subject: [PATCH 2/2] Set README for release purposes --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index e0dd5c9..fa91e1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,7 @@ name = "confluence2md" version = "0.2.0" description = "Export Confluence pages to Markdown files" +readme = "README.md" requires-python = ">=3.12" dependencies = [ "atlassian-python-api>=3.41",