|
4 | 4 | from pathlib import Path |
5 | 5 | from posixpath import basename |
6 | 6 | from textwrap import dedent |
7 | | -from typing import List, Optional, Tuple |
| 7 | +from typing import Dict, List, Optional, Tuple |
8 | 8 | from urllib.parse import urlparse |
9 | 9 |
|
10 | 10 | import mdformat |
11 | 11 | import requests |
| 12 | +import yaml |
12 | 13 | from bs4 import BeautifulSoup |
13 | 14 | from markdownify import markdownify as md |
14 | 15 | from slugify import slugify |
@@ -150,6 +151,40 @@ def fetch_document(url: str) -> BeautifulSoup: |
150 | 151 | return BeautifulSoup(response.content, "html.parser") |
151 | 152 |
|
152 | 153 |
|
def get_document_metadata(document: BeautifulSoup) -> Dict[str, str]:
    """Collect whitelisted ``<meta>`` values from a parsed HTML document.

    Every ``<meta>`` element with a non-empty ``content`` attribute is
    inspected. An element is kept when its ``name`` is listed in
    ``ALLOWED_META_NAMES`` or, failing that, when its ``property`` is listed
    in ``ALLOWED_META_PROPS``.

    Args:
        document: The parsed HTML document to scan.

    Returns:
        A mapping of metadata key to the element's ``content`` value.
        Name-based entries are keyed by the ``name`` value itself;
        property-based entries are keyed as ``"property=<property>"``.
        If several elements produce the same key, the last one wins.
    """
    # These must be real collections: with a bare string such as
    # "description", the `in` test below would be a substring match and
    # would accept names like "script", "ip" or even the empty string.
    ALLOWED_META_NAMES = ("description",)
    ALLOWED_META_PROPS = ()

    result = {}

    for meta_element in document.find_all("meta"):
        # Skip elements with no content attribute or an empty one.
        if not meta_element.has_attr("content") or not meta_element["content"]:
            continue

        if meta_element.has_attr("name") and meta_element["name"] in ALLOWED_META_NAMES:
            result[meta_element["name"]] = meta_element["content"]
        elif (
            meta_element.has_attr("property")
            and meta_element["property"] in ALLOWED_META_PROPS
        ):
            result["property={}".format(meta_element["property"])] = meta_element[
                "content"
            ]

    return result
| 175 | + |
| 176 | + |
def get_metadata_frontmatter(document: BeautifulSoup) -> str:
    """Render the document's metadata as a ``---``-delimited front matter block.

    The metadata returned by ``get_document_metadata`` is nested under
    ``myst -> html_meta`` and dumped as block-style YAML. When there is no
    metadata, the block contains a single empty line between the delimiters.

    Args:
        document: The parsed HTML document whose metadata is rendered.

    Returns:
        The front matter text, starting with ``---`` on its own line and
        ending with ``---`` (no trailing newline).
    """
    html_meta = get_document_metadata(document)

    if not html_meta:
        # No metadata: keep the delimiters separated by a blank line.
        body = "\n"
    else:
        body = yaml.dump(
            {"myst": {"html_meta": html_meta}},
            default_flow_style=False,
        )

    return "---\n{}---".format(body)
| 186 | + |
| 187 | + |
153 | 188 | def convert_document( |
154 | 189 | document: BeautifulSoup, url: str, output_dir: Optional[Path] = None |
155 | 190 | ) -> Tuple[Path, str]: |
@@ -177,7 +212,8 @@ def convert_document( |
177 | 212 |
|
178 | 213 | document_source_comment = f"<!-- source: {url} -->" |
179 | 214 | document_contents = ( |
180 | | - f"{document_source_comment}\n" |
| 215 | + get_metadata_frontmatter(document) + "\n\n" |
| 216 | + f"{document_source_comment}\n\n" |
181 | 217 | f"# {article_heading}\n" |
182 | 218 | f"{article_body_markdown}" |
183 | 219 | ) |
|
0 commit comments