Skip to content

Commit 802b7f2

Browse files
committed
Synced webpages from sitemap to Document model for search.
1 parent 7662ced commit 802b7f2

File tree

10 files changed

+407
-159
lines changed

10 files changed

+407
-159
lines changed

djangoproject/scss/_style.scss

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2650,7 +2650,7 @@ search.filters {
26502650
position: relative;
26512651

26522652
a {
2653-
padding: 10px 20px;
2653+
padding: 10px 15px;
26542654
text-decoration: none;
26552655
border-bottom: 3px solid transparent;
26562656
transition: color 0.3s ease, border-bottom 0.3s ease;

djangoproject/templates/includes/header.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
<li{% if 'download' in request.path %} class="active"{% endif %}>
2626
<a href="{% url 'download' %}">Download</a>
2727
</li>
28-
<li{% if request.host.name == 'docs' %} class="active"{% endif %}>
28+
<li{% if request.host.name == 'docs' and 'search' not in request.path %} class="active"{% endif %}>
2929
<a href="{% block doc_url %}{% url 'homepage' host 'docs' %}{% endblock %}">Documentation</a>
3030
</li>
3131
<li{% if 'weblog' in request.path %} class="active"{% endif %}>

docs/management/commands/update_docs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ def build_doc_release(self, release, force=False, interactive=False):
134134
if self.verbosity >= 1:
135135
self.stdout.write(f"Starting update for {release} at {datetime.now()}...")
136136

137+
release.sync_from_sitemap(force=force)
138+
137139
# checkout_dir is shared for all languages.
138140
checkout_dir = settings.DOCS_BUILD_ROOT.joinpath("sources", release.version)
139141
parent_build_dir = settings.DOCS_BUILD_ROOT.joinpath(

docs/models.py

Lines changed: 47 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from functools import partial, reduce
66
from pathlib import Path
77

8+
import requests
89
from django.conf import settings
910
from django.contrib.postgres.indexes import GinIndex
1011
from django.contrib.postgres.search import (
@@ -26,19 +27,19 @@
2627
from django.utils.html import strip_tags
2728
from django_hosts.resolvers import reverse
2829

29-
from blog.models import Entry
3030
from releases.models import Release
3131

3232
from . import utils
3333
from .search import (
3434
DEFAULT_TEXT_SEARCH_CONFIG,
35-
SEARCHABLE_VIEWS,
3635
START_SEL,
3736
STOP_SEL,
3837
TSEARCH_CONFIG_LANGUAGES,
3938
DocumentationCategory,
39+
fetch_html,
4040
get_document_search_vector,
4141
)
42+
from .utils import extract_inner_html
4243

4344

4445
def get_search_config(lang):
@@ -185,7 +186,7 @@ def sync_to_db(self, decoded_documents):
185186
the database. Deletes the release's non-website documents first then
186187
reinserts them as needed.
187188
"""
188-
self.documents.all().delete()
189+
self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete()
189190

190191
# Read excluded paths from robots.docs.txt.
191192
robots_path = settings.BASE_DIR.joinpath(
@@ -218,81 +219,63 @@ def sync_to_db(self, decoded_documents):
218219
metadata=document,
219220
config=get_search_config(self.lang),
220221
)
221-
for document in self.documents.all():
222+
for document in self.documents.exclude(
223+
metadata__parents=DocumentationCategory.WEBSITE
224+
):
222225
document.metadata["breadcrumbs"] = list(
223226
Document.objects.breadcrumbs(document).values("title", "path")
224227
)
225228
document.save(update_fields=("metadata",))
226229

227-
self._sync_blog_to_db()
228-
self._sync_views_to_db()
230+
def sync_from_sitemap(self, force=False):
231+
from djangoproject.urls.www import sitemaps
229232

230-
def _sync_blog_to_db(self):
231-
"""
232-
Sync the blog entries into search based on the release documents
233-
support end date.
234-
"""
235-
if self.lang != "en":
236-
return # The blog is only written in English currently
233+
if not self.is_dev:
234+
return
237235

238-
entries = Entry.objects.published().searchable()
239-
Document.objects.bulk_create(
240-
[
241-
Document(
242-
release=self,
243-
path=entry.get_absolute_url(),
244-
title=entry.headline,
245-
metadata={
246-
"body": entry.body_html,
247-
"breadcrumbs": [
248-
{
249-
"path": DocumentationCategory.WEBSITE,
250-
"title": "News",
251-
},
252-
],
253-
"parents": DocumentationCategory.WEBSITE,
254-
"slug": entry.slug,
255-
"title": entry.headline,
256-
"toc": "",
257-
},
258-
config=get_search_config(self.lang),
259-
)
260-
for entry in entries
261-
]
262-
)
236+
if force:
237+
Document.objects.filter(
238+
metadata__parents=DocumentationCategory.WEBSITE
239+
).delete()
263240

264-
def _sync_views_to_db(self):
265-
"""
266-
Sync the specific views into search based on the release documents
267-
support end date.
268-
"""
269-
if self.lang != "en":
270-
return # The searchable views are only written in English currently
241+
doc_urls = set(
242+
Document.objects.filter(
243+
metadata__parents=DocumentationCategory.WEBSITE
244+
).values_list("path", flat=True)
245+
)
271246

272-
Document.objects.bulk_create(
273-
[
274-
Document(
247+
for sitemap in sitemaps.values():
248+
for url in sitemap().get_urls():
249+
path = url["location"]
250+
if path in doc_urls:
251+
continue
252+
try:
253+
page_html = fetch_html(path)
254+
except requests.RequestException:
255+
continue
256+
try:
257+
main_html = extract_inner_html(page_html, tag="main")
258+
title = extract_inner_html(page_html, tag="h1")
259+
except ValueError:
260+
continue
261+
Document.objects.create(
275262
release=self,
276-
path=searchable_view.www_absolute_url,
277-
title=searchable_view.page_title,
263+
path=path,
264+
title=title,
278265
metadata={
279-
"body": searchable_view.html,
266+
"body": main_html,
280267
"breadcrumbs": [
281268
{
282269
"path": DocumentationCategory.WEBSITE,
283270
"title": "Website",
284271
},
285272
],
286273
"parents": DocumentationCategory.WEBSITE,
287-
"slug": searchable_view.url_name,
288-
"title": searchable_view.page_title,
274+
"title": title,
289275
"toc": "",
290276
},
291277
config=get_search_config(self.lang),
292278
)
293-
for searchable_view in SEARCHABLE_VIEWS
294-
]
295-
)
296279

297280

298281
def _clean_document_path(path):
@@ -353,6 +336,14 @@ def search(self, query_text, release, document_category=None):
353336
config=models.F("config"),
354337
)
355338
base_filter = Q(release_id=release.id)
339+
if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev:
340+
dev_release = DocumentRelease.objects.get_by_version_and_lang(
341+
version="dev", lang=settings.DEFAULT_LANGUAGE_CODE
342+
)
343+
base_filter |= Q(
344+
release_id=dev_release.id,
345+
metadata__parents=DocumentationCategory.WEBSITE,
346+
)
356347
if document_category:
357348
base_filter &= Q(metadata__parents__startswith=document_category)
358349
base_qs = (

docs/search.py

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,8 @@
1-
from dataclasses import dataclass
2-
1+
import requests
32
from django.contrib.postgres.search import SearchVector
43
from django.db.models import TextChoices
54
from django.db.models.fields.json import KeyTextTransform
6-
from django.template.loader import get_template
75
from django.utils.translation import gettext_lazy as _
8-
from django_hosts import reverse
96

107
# Imported from
118
# https://github.com/postgres/postgres/blob/REL_14_STABLE/src/bin/initdb/initdb.c#L659
@@ -81,25 +78,31 @@ def parse(cls, value, default=None):
8178
return None
8279

8380

84-
@dataclass
85-
class SearchableView:
86-
page_title: str
87-
url_name: str
88-
template: str
89-
90-
@property
91-
def html(self):
92-
return get_template(self.template).render()
93-
94-
@property
95-
def www_absolute_url(self):
96-
return reverse(self.url_name, host="www")
97-
81+
def fetch_html(url, timeout=10):
82+
"""
83+
Return the page HTML; raise requests.RequestException if not a 200 HTML response.
84+
Sends browser-like request headers and accepts only text/html responses.
85+
"""
9886

99-
SEARCHABLE_VIEWS = [
100-
SearchableView(
101-
page_title="Django's Ecosystem",
102-
url_name="community-ecosystem",
103-
template="aggregator/ecosystem.html",
104-
),
105-
]
87+
headers = {
88+
"User-Agent": (
89+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
90+
"AppleWebKit/537.36 (KHTML, like Gecko) "
91+
"Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
92+
),
93+
"Accept": "text/html",
94+
"Accept-Language": "en-US,en;q=0.9",
95+
}
96+
97+
response = requests.get(url, headers=headers, timeout=timeout)
98+
99+
if response.status_code == 200:
100+
content_type = response.headers.get("Content-Type", "")
101+
if "text/html" in content_type:
102+
return response.text
103+
else:
104+
raise requests.RequestException(f"Unexpected Content-Type: {content_type}")
105+
else:
106+
raise requests.RequestException(
107+
f"Failed to fetch {url}, status code: {response.status_code}"
108+
)

docs/templates/docs/search_results.html

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
{% extends "docs/doc.html" %}
22
{% load i18n docs %}
33

4-
{% block title %}{% translate "Search | Django documentation" %}{% endblock %}
4+
{% block title %}{% translate "Search" %}{% endblock %}
5+
{% block header %}{% endblock %}
56

67
{% block toc-wrapper %}{% endblock %}
78
{% block breadcrumbs-wrapper %}{% endblock %}
@@ -19,19 +20,11 @@
1920
{% endfor %}
2021
</search>
2122
<h2>
22-
{% if release.is_dev %}
23-
{% blocktranslate count num_results=paginator.count trimmed %}
24-
{{ num_results }} result for <em>{{ query }}</em> in the development version
25-
{% plural %}
26-
{{ num_results }} results for <em>{{ query }}</em> in the development version
27-
{% endblocktranslate %}
28-
{% else %}
29-
{% blocktranslate count num_results=paginator.count trimmed %}
30-
{{ num_results }} result for <em>{{ query }}</em> in version {{ version }}
31-
{% plural %}
32-
{{ num_results }} results for <em>{{ query }}</em> in version {{ version }}
33-
{% endblocktranslate %}
34-
{% endif %}
23+
{% blocktranslate count num_results=paginator.count trimmed %}
24+
{{ num_results }} result for <em>{{ query }}</em>
25+
{% plural %}
26+
{{ num_results }} results for <em>{{ query }}</em>
27+
{% endblocktranslate %}
3528
</h2>
3629
{% else %}
3730
<h2>{% translate "No search query given" %}</h2>
@@ -74,9 +67,12 @@ <h2 class="result-title">
7467
{% empty %}
7568
{% if active_category %}
7669
<dt>
70+
{% if active_category == "website" and lang != "en" %}
71+
<p>{% blocktranslate trimmed %}The website content can only be searched in English.{% endblocktranslate %}</p>
72+
{% endif %}
7773
<p>
7874
{% querystring category=None page=None as all_search %}
79-
{% blocktranslate trimmed %}Please try searching <a href="{{ all_search }}">all documentation results</a>.{% endblocktranslate %}
75+
{% blocktranslate trimmed %}Please try searching <a href="{{ all_search }}">all results</a>.{% endblocktranslate %}
8076
</p>
8177
</dt>
8278
{% endif %}

0 commit comments

Comments
 (0)