|
5 | 5 | from functools import partial, reduce |
6 | 6 | from pathlib import Path |
7 | 7 |
|
| 8 | +import requests |
8 | 9 | from django.conf import settings |
9 | 10 | from django.contrib.postgres.indexes import GinIndex |
10 | 11 | from django.contrib.postgres.search import ( |
|
26 | 27 | from django.utils.html import strip_tags |
27 | 28 | from django_hosts.resolvers import reverse |
28 | 29 |
|
29 | | -from blog.models import Entry |
30 | 30 | from releases.models import Release |
31 | 31 |
|
32 | 32 | from . import utils |
33 | 33 | from .search import ( |
34 | 34 | DEFAULT_TEXT_SEARCH_CONFIG, |
35 | | - SEARCHABLE_VIEWS, |
36 | 35 | START_SEL, |
37 | 36 | STOP_SEL, |
38 | 37 | TSEARCH_CONFIG_LANGUAGES, |
39 | 38 | DocumentationCategory, |
| 39 | + fetch_html, |
40 | 40 | get_document_search_vector, |
41 | 41 | ) |
| 42 | +from .utils import extract_inner_html |
42 | 43 |
|
43 | 44 |
|
44 | 45 | def get_search_config(lang): |
@@ -185,7 +186,7 @@ def sync_to_db(self, decoded_documents): |
185 | 186 | the database. Deletes all the release's documents first then |
186 | 187 | reinserts them as needed. |
187 | 188 | """ |
188 | | - self.documents.all().delete() |
| 189 | + self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete() |
189 | 190 |
|
190 | 191 | # Read excluded paths from robots.docs.txt. |
191 | 192 | robots_path = settings.BASE_DIR.joinpath( |
@@ -218,81 +219,63 @@ def sync_to_db(self, decoded_documents): |
218 | 219 | metadata=document, |
219 | 220 | config=get_search_config(self.lang), |
220 | 221 | ) |
221 | | - for document in self.documents.all(): |
| 222 | + for document in self.documents.exclude( |
| 223 | + metadata__parents=DocumentationCategory.WEBSITE |
| 224 | + ): |
222 | 225 | document.metadata["breadcrumbs"] = list( |
223 | 226 | Document.objects.breadcrumbs(document).values("title", "path") |
224 | 227 | ) |
225 | 228 | document.save(update_fields=("metadata",)) |
226 | 229 |
|
227 | | - self._sync_blog_to_db() |
228 | | - self._sync_views_to_db() |
| 230 | + def sync_from_sitemap(self, force=False): |
| 231 | + from djangoproject.urls.www import sitemaps |
229 | 232 |
|
230 | | - def _sync_blog_to_db(self): |
231 | | - """ |
232 | | - Sync the blog entries into search based on the release documents |
233 | | - support end date. |
234 | | - """ |
235 | | - if self.lang != "en": |
236 | | - return # The blog is only written in English currently |
| 233 | + if not self.is_dev: |
| 234 | + return |
237 | 235 |
|
238 | | - entries = Entry.objects.published().searchable() |
239 | | - Document.objects.bulk_create( |
240 | | - [ |
241 | | - Document( |
242 | | - release=self, |
243 | | - path=entry.get_absolute_url(), |
244 | | - title=entry.headline, |
245 | | - metadata={ |
246 | | - "body": entry.body_html, |
247 | | - "breadcrumbs": [ |
248 | | - { |
249 | | - "path": DocumentationCategory.WEBSITE, |
250 | | - "title": "News", |
251 | | - }, |
252 | | - ], |
253 | | - "parents": DocumentationCategory.WEBSITE, |
254 | | - "slug": entry.slug, |
255 | | - "title": entry.headline, |
256 | | - "toc": "", |
257 | | - }, |
258 | | - config=get_search_config(self.lang), |
259 | | - ) |
260 | | - for entry in entries |
261 | | - ] |
262 | | - ) |
| 236 | + if force: |
| 237 | + Document.objects.filter( |
| 238 | + metadata__parents=DocumentationCategory.WEBSITE |
| 239 | + ).delete() |
263 | 240 |
|
264 | | - def _sync_views_to_db(self): |
265 | | - """ |
266 | | - Sync the specific views into search based on the release documents |
267 | | - support end date. |
268 | | - """ |
269 | | - if self.lang != "en": |
270 | | - return # The searchable views are only written in English currently |
| 241 | + doc_urls = set( |
| 242 | + Document.objects.filter( |
| 243 | + metadata__parents=DocumentationCategory.WEBSITE |
| 244 | + ).values_list("path", flat=True) |
| 245 | + ) |
271 | 246 |
|
272 | | - Document.objects.bulk_create( |
273 | | - [ |
274 | | - Document( |
| 247 | + for sitemap in sitemaps.values(): |
| 248 | + for url in sitemap().get_urls(): |
| 249 | + path = url["location"] |
| 250 | + if path in doc_urls: |
| 251 | + continue |
| 252 | + try: |
| 253 | + page_html = fetch_html(path) |
| 254 | + except requests.RequestException: |
| 255 | + continue |
| 256 | + try: |
| 257 | + main_html = extract_inner_html(page_html, tag="main") |
| 258 | + title = extract_inner_html(page_html, tag="h1") |
| 259 | + except ValueError: |
| 260 | + continue |
| 261 | + Document.objects.create( |
275 | 262 | release=self, |
276 | | - path=searchable_view.www_absolute_url, |
277 | | - title=searchable_view.page_title, |
| 263 | + path=path, |
| 264 | + title=title, |
278 | 265 | metadata={ |
279 | | - "body": searchable_view.html, |
| 266 | + "body": main_html, |
280 | 267 | "breadcrumbs": [ |
281 | 268 | { |
282 | 269 | "path": DocumentationCategory.WEBSITE, |
283 | 270 | "title": "Website", |
284 | 271 | }, |
285 | 272 | ], |
286 | 273 | "parents": DocumentationCategory.WEBSITE, |
287 | | - "slug": searchable_view.url_name, |
288 | | - "title": searchable_view.page_title, |
| 274 | + "title": title, |
289 | 275 | "toc": "", |
290 | 276 | }, |
291 | 277 | config=get_search_config(self.lang), |
292 | 278 | ) |
293 | | - for searchable_view in SEARCHABLE_VIEWS |
294 | | - ] |
295 | | - ) |
296 | 279 |
|
297 | 280 |
|
298 | 281 | def _clean_document_path(path): |
@@ -353,6 +336,14 @@ def search(self, query_text, release, document_category=None): |
353 | 336 | config=models.F("config"), |
354 | 337 | ) |
355 | 338 | base_filter = Q(release_id=release.id) |
| 339 | + if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev: |
| 340 | + dev_release = DocumentRelease.objects.get_by_version_and_lang( |
| 341 | + version="dev", lang=settings.DEFAULT_LANGUAGE_CODE |
| 342 | + ) |
| 343 | + base_filter |= Q( |
| 344 | + release_id=dev_release.id, |
| 345 | + metadata__parents=DocumentationCategory.WEBSITE, |
| 346 | + ) |
356 | 347 | if document_category: |
357 | 348 | base_filter &= Q(metadata__parents__startswith=document_category) |
358 | 349 | base_qs = ( |
|
0 commit comments