Skip to content

Commit fc43b71

Browse files
committed
Synced webpages from sitemap to Document model for search.
1 parent b1a45b7 commit fc43b71

File tree

10 files changed

+369
-158
lines changed

10 files changed

+369
-158
lines changed

blog/admin.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,9 @@ class EntryAdmin(admin.ModelAdmin):
1616
"pub_date",
1717
"is_active",
1818
"is_published",
19-
"is_searchable",
2019
"author",
2120
)
22-
list_filter = ("is_active", "is_searchable")
21+
list_filter = ("is_active",)
2322
exclude = ("summary_html", "body_html")
2423
prepopulated_fields = {"slug": ("headline",)}
2524
raw_id_fields = ["social_media_card"]

blog/models.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,6 @@ def published(self):
3737
def active(self):
3838
return self.filter(is_active=True)
3939

40-
def searchable(self):
41-
return self.filter(is_searchable=True)
42-
4340

4441
class ContentFormat(models.TextChoices):
4542
REST = "reST", "reStructuredText"

blog/tests.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -70,26 +70,6 @@ def test_manager_published(self):
7070
transform=lambda entry: entry.headline,
7171
)
7272

73-
def test_manager_searchable(self):
74-
"""
75-
Make sure that the Entry manager's `searchable` method works
76-
"""
77-
Entry.objects.create(
78-
pub_date=self.yesterday,
79-
is_searchable=False,
80-
headline="not searchable",
81-
slug="a",
82-
)
83-
Entry.objects.create(
84-
pub_date=self.yesterday, is_searchable=True, headline="searchable", slug="b"
85-
)
86-
87-
self.assertQuerySetEqual(
88-
Entry.objects.searchable(),
89-
["searchable"],
90-
transform=lambda entry: entry.headline,
91-
)
92-
9373
def test_docutils_safe(self):
9474
"""
9575
Make sure docutils' file inclusion directives are disabled by default.

djangoproject/scss/_style.scss

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2650,7 +2650,7 @@ search.filters {
26502650
position: relative;
26512651

26522652
a {
2653-
padding: 10px 20px;
2653+
padding: 10px 15px;
26542654
text-decoration: none;
26552655
border-bottom: 3px solid transparent;
26562656
transition: color 0.3s ease, border-bottom 0.3s ease;

docs/management/commands/update_docs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ def build_doc_release(self, release, force=False, interactive=False):
134134
if self.verbosity >= 1:
135135
self.stdout.write(f"Starting update for {release} at {datetime.now()}...")
136136

137+
release.sync_from_sitemap(force=force)
138+
137139
# checkout_dir is shared for all languages.
138140
checkout_dir = settings.DOCS_BUILD_ROOT / "sources" / release.version
139141
parent_build_dir = settings.DOCS_BUILD_ROOT / release.lang / release.version

docs/models.py

Lines changed: 47 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from functools import partial, reduce
66
from pathlib import Path
77

8+
import requests
89
from django.conf import settings
910
from django.contrib.postgres.indexes import GinIndex
1011
from django.contrib.postgres.search import (
@@ -26,19 +27,19 @@
2627
from django.utils.html import strip_tags
2728
from django_hosts.resolvers import reverse
2829

29-
from blog.models import Entry
3030
from releases.models import Release
3131

3232
from . import utils
3333
from .search import (
3434
DEFAULT_TEXT_SEARCH_CONFIG,
35-
SEARCHABLE_VIEWS,
3635
START_SEL,
3736
STOP_SEL,
3837
TSEARCH_CONFIG_LANGUAGES,
3938
DocumentationCategory,
39+
fetch_html,
4040
get_document_search_vector,
4141
)
42+
from .utils import extract_inner_html
4243

4344

4445
def get_search_config(lang):
@@ -185,7 +186,7 @@ def sync_to_db(self, decoded_documents):
185186
the database. Deletes all the release's documents first then
186187
reinserts them as needed.
187188
"""
188-
self.documents.all().delete()
189+
self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete()
189190

190191
# Read excluded paths from robots.docs.txt.
191192
robots_path = settings.BASE_DIR / "djangoproject" / "static" / "robots.docs.txt"
@@ -216,81 +217,63 @@ def sync_to_db(self, decoded_documents):
216217
metadata=document,
217218
config=get_search_config(self.lang),
218219
)
219-
for document in self.documents.all():
220+
for document in self.documents.exclude(
221+
metadata__parents=DocumentationCategory.WEBSITE
222+
):
220223
document.metadata["breadcrumbs"] = list(
221224
Document.objects.breadcrumbs(document).values("title", "path")
222225
)
223226
document.save(update_fields=("metadata",))
224227

225-
self._sync_blog_to_db()
226-
self._sync_views_to_db()
228+
def sync_from_sitemap(self, force=False):
229+
from djangoproject.urls.www import sitemaps
227230

228-
def _sync_blog_to_db(self):
229-
"""
230-
Sync the blog entries into search based on the release documents
231-
support end date.
232-
"""
233-
if self.lang != "en":
234-
return # The blog is only written in English currently
231+
if not self.is_dev:
232+
return
235233

236-
entries = Entry.objects.published().searchable()
237-
Document.objects.bulk_create(
238-
[
239-
Document(
240-
release=self,
241-
path=entry.get_absolute_url(),
242-
title=entry.headline,
243-
metadata={
244-
"body": entry.body_html,
245-
"breadcrumbs": [
246-
{
247-
"path": DocumentationCategory.WEBSITE,
248-
"title": "News",
249-
},
250-
],
251-
"parents": DocumentationCategory.WEBSITE,
252-
"slug": entry.slug,
253-
"title": entry.headline,
254-
"toc": "",
255-
},
256-
config=get_search_config(self.lang),
257-
)
258-
for entry in entries
259-
]
260-
)
234+
if force:
235+
Document.objects.filter(
236+
metadata__parents=DocumentationCategory.WEBSITE
237+
).delete()
261238

262-
def _sync_views_to_db(self):
263-
"""
264-
Sync the specific views into search based on the release documents
265-
support end date.
266-
"""
267-
if self.lang != "en":
268-
return # The searchable views are only written in English currently
239+
doc_urls = set(
240+
Document.objects.filter(
241+
metadata__parents=DocumentationCategory.WEBSITE
242+
).values_list("path", flat=True)
243+
)
269244

270-
Document.objects.bulk_create(
271-
[
272-
Document(
245+
for sitemap in sitemaps.values():
246+
for url in sitemap().get_urls():
247+
path = url["location"]
248+
if path in doc_urls:
249+
continue
250+
try:
251+
page_html = fetch_html(path)
252+
except requests.RequestException:
253+
continue
254+
try:
255+
main_html = extract_inner_html(page_html, tag="main")
256+
title = extract_inner_html(page_html, tag="h1")
257+
except ValueError:
258+
continue
259+
Document.objects.create(
273260
release=self,
274-
path=searchable_view.www_absolute_url,
275-
title=searchable_view.page_title,
261+
path=path,
262+
title=title,
276263
metadata={
277-
"body": searchable_view.html,
264+
"body": main_html,
278265
"breadcrumbs": [
279266
{
280267
"path": DocumentationCategory.WEBSITE,
281268
"title": "Website",
282269
},
283270
],
284271
"parents": DocumentationCategory.WEBSITE,
285-
"slug": searchable_view.url_name,
286-
"title": searchable_view.page_title,
272+
"title": title,
287273
"toc": "",
288274
},
289275
config=get_search_config(self.lang),
290276
)
291-
for searchable_view in SEARCHABLE_VIEWS
292-
]
293-
)
294277

295278

296279
def _clean_document_path(path):
@@ -351,6 +334,14 @@ def search(self, query_text, release, document_category=None):
351334
config=models.F("config"),
352335
)
353336
base_filter = Q(release_id=release.id)
337+
if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev:
338+
dev_release = DocumentRelease.objects.get_by_version_and_lang(
339+
version="dev", lang=settings.DEFAULT_LANGUAGE_CODE
340+
)
341+
base_filter |= Q(
342+
release_id=dev_release.id,
343+
metadata__parents=DocumentationCategory.WEBSITE,
344+
)
354345
if document_category:
355346
base_filter &= Q(metadata__parents__startswith=document_category)
356347
base_qs = (

docs/search.py

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,8 @@
1-
from dataclasses import dataclass
2-
1+
import requests
32
from django.contrib.postgres.search import SearchVector
43
from django.db.models import TextChoices
54
from django.db.models.fields.json import KeyTextTransform
6-
from django.template.loader import get_template
75
from django.utils.translation import gettext_lazy as _
8-
from django_hosts import reverse
96

107
# Imported from
118
# https://github.com/postgres/postgres/blob/REL_14_STABLE/src/bin/initdb/initdb.c#L659
@@ -81,25 +78,31 @@ def parse(cls, value, default=None):
8178
return None
8279

8380

84-
@dataclass
85-
class SearchableView:
86-
page_title: str
87-
url_name: str
88-
template: str
89-
90-
@property
91-
def html(self):
92-
return get_template(self.template).render()
93-
94-
@property
95-
def www_absolute_url(self):
96-
return reverse(self.url_name, host="www")
97-
81+
def fetch_html(url, timeout=10):
82+
"""
83+
Fetch the HTML of a page if status code is 200.
84+
Simulates a human browser and accepts only text/html.
85+
"""
9886

99-
SEARCHABLE_VIEWS = [
100-
SearchableView(
101-
page_title="Django's Ecosystem",
102-
url_name="community-ecosystem",
103-
template="aggregator/ecosystem.html",
104-
),
105-
]
87+
headers = {
88+
"User-Agent": (
89+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
90+
"AppleWebKit/537.36 (KHTML, like Gecko) "
91+
"Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
92+
),
93+
"Accept": "text/html",
94+
"Accept-Language": "en-US,en;q=0.9",
95+
}
96+
97+
response = requests.get(url, headers=headers, timeout=timeout)
98+
99+
if response.status_code == 200:
100+
content_type = response.headers.get("Content-Type", "")
101+
if "text/html" in content_type:
102+
return response.text
103+
else:
104+
raise requests.RequestException(f"Unexpected Content-Type: {content_type}")
105+
else:
106+
raise requests.RequestException(
107+
f"Failed to fetch {url}, status code: {response.status_code}"
108+
)

0 commit comments

Comments
 (0)