Skip to content

Commit abc8a2e

Browse files
authored
Merge pull request #39 from ByteInternet/downloader_metadata
2 parents f0df65a + c08e87c commit abc8a2e

File tree

2 files changed

+41
-2
lines changed

2 files changed

+41
-2
lines changed

hypernode/downloader/main.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
from pathlib import Path
55
from posixpath import basename
66
from textwrap import dedent
7-
from typing import List, Optional, Tuple
7+
from typing import Dict, List, Optional, Tuple
88
from urllib.parse import urlparse
99

1010
import mdformat
1111
import requests
12+
import yaml
1213
from bs4 import BeautifulSoup
1314
from markdownify import markdownify as md
1415
from slugify import slugify
@@ -150,6 +151,40 @@ def fetch_document(url: str) -> BeautifulSoup:
150151
return BeautifulSoup(response.content, "html.parser")
151152

152153

154+
def get_document_metadata(document: BeautifulSoup) -> Dict[str, str]:
    """Collect the allow-listed ``<meta>`` tags of a parsed HTML document.

    Every ``<meta>`` element with a non-empty ``content`` attribute is
    inspected. Elements whose ``name`` is allow-listed are stored under that
    name; elements whose ``property`` is allow-listed are stored under the
    key ``"property=<property>"``. Everything else is ignored.

    Args:
        document: The parsed HTML document to read ``<meta>`` elements from.

    Returns:
        A mapping of metadata key to the element's ``content`` value. Empty
        when the document has no allow-listed metadata.
    """
    # These MUST be tuples. With a bare string the ``in`` checks below would
    # become substring tests, so e.g. name="desc" or name="" would be
    # accepted because they are substrings of "description".
    ALLOWED_META_NAMES = ("description",)
    ALLOWED_META_PROPS = ()

    result: Dict[str, str] = {}

    for meta_element in document.find_all("meta"):
        # A missing attribute yields None, so one lookup covers both the
        # "absent" and the "present but empty" case.
        content = meta_element.get("content")
        if not content:
            continue

        name = meta_element.get("name")
        prop = meta_element.get("property")

        if name in ALLOWED_META_NAMES:
            result[name] = content
        elif prop in ALLOWED_META_PROPS:
            result["property={}".format(prop)] = content

    return result
175+
176+
177+
def get_metadata_frontmatter(document: BeautifulSoup) -> str:
    """Render the document's metadata as a ``---``-delimited frontmatter block.

    When ``get_document_metadata`` returns any entries, they are dumped as
    block-style YAML nested under ``myst`` -> ``html_meta``. Otherwise the
    block contains a single empty line.

    Args:
        document: The parsed HTML document to extract metadata from.

    Returns:
        The frontmatter text, starting with ``---`` on its own line and
        ending with ``---`` (no trailing newline).
    """
    html_meta = get_document_metadata(document)

    if html_meta:
        body = yaml.dump(
            {"myst": {"html_meta": html_meta}},
            default_flow_style=False,
        )
    else:
        # Keep the delimiters on separate lines when there is nothing to dump.
        body = "\n"

    return "---\n{}---".format(body)
186+
187+
153188
def convert_document(
154189
document: BeautifulSoup, url: str, output_dir: Optional[Path] = None
155190
) -> Tuple[Path, str]:
@@ -177,7 +212,8 @@ def convert_document(
177212

178213
document_source_comment = f"<!-- source: {url} -->"
179214
document_contents = (
180-
f"{document_source_comment}\n"
215+
get_metadata_frontmatter(document) + "\n\n"
216+
f"{document_source_comment}\n\n"
181217
f"# {article_heading}\n"
182218
f"{article_body_markdown}"
183219
)

requirements/base.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# docs/ requirements
12
sphinx==5.3.0
23
sphinx_rtd_theme==1.1.1
34
myst-parser==0.18.1
@@ -7,6 +8,8 @@ mdformat==0.7.16
78
mdformat-myst==0.1.5
89
mdformat-frontmatter==0.4.1
910

11+
# hypernode/ requirements
1012
beautifulsoup4==4.11.1
1113
markdownify==0.11.2
1214
python-slugify==6.1.2
15+
pyyaml==6.0

0 commit comments

Comments
 (0)