Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
name = "confluence2md"
version = "0.2.0"
description = "Export Confluence pages to Markdown files"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"atlassian-python-api>=3.41",
Expand Down
8 changes: 4 additions & 4 deletions src/confluence2md/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,14 @@ def _handle_export(args: argparse.Namespace, config: Config) -> None:

if args.page_id:
page = fetch_page(confluence, args.page_id)
path = export_page(page, config.output)
path = export_page(page, config.output, confluence)
print(f"Exported: {path}")

include_children = args.include_children or config.output.include_children
if include_children:
children = fetch_child_pages(confluence, args.page_id)
if children:
child_paths = export_pages(children, config.output)
child_paths = export_pages(children, config.output, confluence)
for p in child_paths:
print(f"Exported: {p}")
total = 1 + len(child_paths)
Expand All @@ -99,7 +99,7 @@ def _handle_export(args: argparse.Namespace, config: Config) -> None:
if not pages:
print("No pages found.")
return
paths = export_pages(pages, config.output)
paths = export_pages(pages, config.output, confluence)
for p in paths:
print(f"Exported: {p}")
print(f"\n{len(paths)} page(s) exported to {config.output.directory}")
Expand All @@ -108,7 +108,7 @@ def _handle_export(args: argparse.Namespace, config: Config) -> None:
if not pages:
print("No pages found.")
return
paths = export_pages(pages, config.output)
paths = export_pages(pages, config.output, confluence)
for p in paths:
print(f"Exported: {p}")
print(f"\n{len(paths)} page(s) exported to {config.output.directory}")
Expand Down
40 changes: 40 additions & 0 deletions src/confluence2md/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from atlassian import Confluence
Expand All @@ -18,6 +19,14 @@ class Page:
parent_title: str = ""


@dataclass
class Attachment:
    """Metadata describing one file attached to a Confluence page."""

    id: str  # attachment ID as a string ("" when the API item has none)
    title: str  # attachment title; also used as the file name when downloading
    media_type: str  # taken from the item's metadata.mediaType ("" when absent)
    download_url: str  # taken from the item's _links.download ("" when absent)


def connect(config: ConfluenceConfig) -> Confluence:
"""Create an authenticated Confluence client."""
if config.username:
Expand Down Expand Up @@ -151,3 +160,34 @@ def _build_page_url(raw: dict[str, Any], confluence: Confluence) -> str:
base = links.get("base", confluence.url.rstrip("/"))
return f"{base}{webui}"
return ""


def fetch_attachments(confluence: Confluence, page_id: str) -> list[Attachment]:
    """Fetch all attachments for a page.

    The attachment listing is requested in fixed-size batches, advancing the
    ``start`` offset until a short (or empty) batch comes back, so pages with
    more attachments than fit into a single response are fully covered.

    Args:
        confluence: Authenticated Confluence client.
        page_id: ID of the page whose attachments are listed.

    Returns:
        One ``Attachment`` per listed item, in the order the API returned
        them. Fields missing from an item fall back to empty strings.
    """
    batch_size = 50
    attachments: list[Attachment] = []
    start = 0
    while True:
        response = confluence.get_attachments_from_content(  # type: ignore[no-untyped-call]
            page_id, start=start, limit=batch_size
        )
        # Tolerate an empty/None response the same way as an empty listing.
        items = (response or {}).get("results", [])
        attachments.extend(
            Attachment(
                id=str(item.get("id", "")),
                title=item.get("title", ""),
                media_type=item.get("metadata", {}).get("mediaType", ""),
                download_url=item.get("_links", {}).get("download", ""),
            )
            for item in items
        )
        # A batch smaller than requested means there is nothing left to page.
        if len(items) < batch_size:
            break
        start += len(items)
    return attachments


def download_attachment(
    confluence: Confluence, attachment: Attachment, dest: Path
) -> Path:
    """Download an attachment to the given directory. Returns the file path.

    Args:
        confluence: Authenticated Confluence client used for the GET request.
        attachment: Attachment whose ``download_url`` is fetched and whose
            ``title`` determines the file name.
        dest: Target directory; created (with parents) if it does not exist.

    Returns:
        Path of the written file inside ``dest``.

    Raises:
        ValueError: If the attachment title does not yield a usable file name.
    """
    # The title comes from the server. Keep only its last path component so a
    # title such as "../../x.png" or "a/b.png" cannot write outside ``dest``.
    filename = Path(attachment.title.replace("\\", "/")).name
    if filename in ("", ".", ".."):
        raise ValueError(
            f"Attachment title {attachment.title!r} is not a usable file name"
        )

    dest.mkdir(parents=True, exist_ok=True)
    filepath = dest / filename
    response = confluence.request(
        method="GET",
        path=attachment.download_url,
    )
    filepath.write_bytes(response.content)
    return filepath
Comment thread
splieth marked this conversation as resolved.
124 changes: 118 additions & 6 deletions src/confluence2md/renderer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import re
from pathlib import Path
from typing import Optional

from atlassian import Confluence
from markdownify import markdownify

from .client import Page
from .client import (
Attachment,
Page,
download_attachment,
fetch_attachments,
)
from .config import OutputConfig


Expand Down Expand Up @@ -42,27 +49,132 @@ def render_page(page: Page, config: OutputConfig) -> str:
return "\n".join(lines)


def export_page(
    page: Page,
    config: OutputConfig,
    confluence: Optional[Confluence] = None,
) -> Path:
    """Export a single page to a Markdown file. Returns the file path.

    Args:
        page: Page to export.
        config: Output settings; ``directory`` and ``filename_pattern`` decide
            where the file is written.
        confluence: Optional client. When given, draw.io macros in the page
            body are replaced with image references and their PNG
            attachments are downloaded into the output directory.

    Returns:
        Path of the written Markdown file.
    """
    from dataclasses import replace

    output_dir = Path(config.directory)
    output_dir.mkdir(parents=True, exist_ok=True)

    body = page.body
    if confluence is not None:
        body = _process_drawio_macros(body, page, output_dir, confluence)

    # NOTE(review): assumes Page is a dataclass (it is built by keyword and
    # lives next to the Attachment dataclass) - confirm. replace() carries
    # over every field, so nothing is lost if Page gains new attributes.
    content = render_page(replace(page, body=body), config)

    filename = _safe_filename(config.filename_pattern.format(title=page.title)) + ".md"
    filepath = output_dir / filename
    filepath.write_text(content, encoding="utf-8")
    return filepath


def export_pages(
    pages: list[Page],
    config: OutputConfig,
    confluence: Optional[Confluence] = None,
) -> list[Path]:
    """Export each page in turn and collect the written file paths.

    Pages are exported in the order given; ``confluence`` is passed through
    unchanged to ``export_page`` for every page.
    """
    written: list[Path] = []
    for page in pages:
        written.append(export_page(page, config, confluence))
    return written


def _convert_body(html: str) -> str:
    """Turn a Confluence storage-format (HTML) body into Markdown text."""
    # Options handed to markdownify: ATX headings, and the "style" tag in
    # its strip list.
    options = {"heading_style": "ATX", "strip": ["style"]}
    return markdownify(html, **options)


def _extract_drawio_diagram_names(html: str) -> list[str]:
"""Extract draw.io diagram names from Confluence storage format HTML."""
names: list[str] = []
# Confluence macros use ac: namespace prefixes which aren't valid XML
# without namespace declarations, so we use regex to extract them.
macro_pattern = re.compile(
r'<ac:structured-macro[^>]*ac:name=["\']drawio["\'][^>]*>'
r"(.*?)</ac:structured-macro>",
re.DOTALL,
)
param_pattern = re.compile(
r'<ac:parameter[^>]*ac:name=["\']diagramName["\'][^>]*>'
r"(.*?)</ac:parameter>",
re.DOTALL,
)
for macro_match in macro_pattern.finditer(html):
macro_body = macro_match.group(1)
param_match = param_pattern.search(macro_body)
if param_match:
names.append(param_match.group(1).strip())
return names


def _find_drawio_png(
    diagram_name: str, attachments: list[Attachment]
) -> Optional[Attachment]:
    """Return the first attachment that is the PNG preview of a diagram.

    An attachment qualifies when its title is exactly ``<name>.png`` or
    ``<name>.drawio.png``. ``None`` is returned when no attachment matches.
    """
    # The two title spellings accepted for a diagram's preview image.
    accepted_titles = (f"{diagram_name}.png", f"{diagram_name}.drawio.png")
    return next(
        (entry for entry in attachments if entry.title in accepted_titles),
        None,
    )


def _process_drawio_macros(
    html: str,
    page: Page,
    output_dir: Path,
    confluence: Confluence,
) -> str:
    """Replace draw.io macros with image references and download PNGs.

    Args:
        html: Page body in Confluence storage format.
        page: Page the body belongs to; its ``id`` is used to list attachments.
        output_dir: Directory the PNG previews are downloaded into.
        confluence: Authenticated Confluence client.

    Returns:
        The body with every ``drawio`` macro that has a matching PNG
        attachment replaced by an ``<img>`` tag. Macros without a
        ``diagramName`` parameter or without a matching attachment are left
        untouched. When the body contains no draw.io diagram names it is
        returned as-is and no attachment listing is requested.
    """
    # The parameter is named ``html``, so import the functions directly
    # rather than the (shadowed) module.
    from html import escape, unescape

    if not _extract_drawio_diagram_names(html):
        return html

    attachments = fetch_attachments(confluence, page.id)

    macro_pattern = re.compile(
        r'<ac:structured-macro[^>]*ac:name=["\']drawio["\'][^>]*>'
        r"(.*?)</ac:structured-macro>",
        re.DOTALL,
    )
    param_pattern = re.compile(
        r'<ac:parameter[^>]*ac:name=["\']diagramName["\'][^>]*>'
        r"(.*?)</ac:parameter>",
        re.DOTALL,
    )

    # Attachment title -> file name written to disk, so a diagram embedded
    # several times on the same page is downloaded only once.
    saved_names: dict[str, str] = {}

    def _replace_macro(match: re.Match[str]) -> str:
        param_match = param_pattern.search(match.group(1))
        if not param_match:
            return match.group(0)

        # Storage format escapes text content ("A &amp; B"); decode it so the
        # name can match the attachment title.
        diagram_name = unescape(param_match.group(1).strip())
        png_attachment = _find_drawio_png(diagram_name, attachments)
        if not png_attachment:
            return match.group(0)

        title = png_attachment.title
        if title not in saved_names:
            # Reference the file that was actually written, not the title.
            saved_names[title] = download_attachment(
                confluence, png_attachment, output_dir
            ).name

        # Escape both values so quotes or angle brackets in a diagram name
        # cannot break out of the attribute.
        src = escape(saved_names[title], quote=True)
        alt = escape(diagram_name, quote=True)
        return f'<img src="{src}" alt="{alt}" />'

    return macro_pattern.sub(_replace_macro, html)


def _safe_filename(name: str) -> str:
"""Sanitize a string for use as a filename."""
# Replace characters that are problematic in filenames
Expand Down
Loading
Loading