|
5 | 5 | from functools import partial, reduce |
6 | 6 | from pathlib import Path |
7 | 7 |
|
| 8 | +import requests |
8 | 9 | from django.conf import settings |
9 | 10 | from django.contrib.postgres.indexes import GinIndex |
10 | 11 | from django.contrib.postgres.search import ( |
|
26 | 27 | from django.utils.html import strip_tags |
27 | 28 | from django_hosts.resolvers import reverse |
28 | 29 |
|
29 | | -from blog.models import Entry |
30 | 30 | from releases.models import Release |
31 | 31 |
|
32 | 32 | from . import utils |
33 | 33 | from .search import ( |
34 | 34 | DEFAULT_TEXT_SEARCH_CONFIG, |
35 | | - SEARCHABLE_VIEWS, |
36 | 35 | START_SEL, |
37 | 36 | STOP_SEL, |
38 | 37 | TSEARCH_CONFIG_LANGUAGES, |
39 | 38 | DocumentationCategory, |
| 39 | + fetch_html, |
40 | 40 | get_document_search_vector, |
41 | 41 | ) |
| 42 | +from .utils import extract_inner_html |
42 | 43 |
|
43 | 44 |
|
44 | 45 | def get_search_config(lang): |
@@ -185,7 +186,7 @@ def sync_to_db(self, decoded_documents): |
185 | 186 | the database. Deletes all the release's documents first (except those |
186 | 187 | in the website category, which are kept) then reinserts them as needed. |
187 | 188 | """ |
188 | | - self.documents.all().delete() |
| 189 | + self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete() |
189 | 190 |
|
190 | 191 | # Read excluded paths from robots.docs.txt. |
191 | 192 | robots_path = settings.BASE_DIR / "djangoproject" / "static" / "robots.docs.txt" |
@@ -216,81 +217,63 @@ def sync_to_db(self, decoded_documents): |
216 | 217 | metadata=document, |
217 | 218 | config=get_search_config(self.lang), |
218 | 219 | ) |
219 | | - for document in self.documents.all(): |
| 220 | + for document in self.documents.exclude( |
| 221 | + metadata__parents=DocumentationCategory.WEBSITE |
| 222 | + ): |
220 | 223 | document.metadata["breadcrumbs"] = list( |
221 | 224 | Document.objects.breadcrumbs(document).values("title", "path") |
222 | 225 | ) |
223 | 226 | document.save(update_fields=("metadata",)) |
224 | 227 |
|
225 | | - self._sync_blog_to_db() |
226 | | - self._sync_views_to_db() |
| 228 | + def sync_from_sitemap(self, force=False): |
| 229 | + from djangoproject.urls.www import sitemaps |
227 | 230 |
|
228 | | - def _sync_blog_to_db(self): |
229 | | - """ |
230 | | - Sync the blog entries into search based on the release documents |
231 | | - support end date. |
232 | | - """ |
233 | | - if self.lang != "en": |
234 | | - return # The blog is only written in English currently |
| 231 | + if not self.is_dev: |
| 232 | + return |
235 | 233 |
|
236 | | - entries = Entry.objects.published().searchable() |
237 | | - Document.objects.bulk_create( |
238 | | - [ |
239 | | - Document( |
240 | | - release=self, |
241 | | - path=entry.get_absolute_url(), |
242 | | - title=entry.headline, |
243 | | - metadata={ |
244 | | - "body": entry.body_html, |
245 | | - "breadcrumbs": [ |
246 | | - { |
247 | | - "path": DocumentationCategory.WEBSITE, |
248 | | - "title": "News", |
249 | | - }, |
250 | | - ], |
251 | | - "parents": DocumentationCategory.WEBSITE, |
252 | | - "slug": entry.slug, |
253 | | - "title": entry.headline, |
254 | | - "toc": "", |
255 | | - }, |
256 | | - config=get_search_config(self.lang), |
257 | | - ) |
258 | | - for entry in entries |
259 | | - ] |
260 | | - ) |
| 234 | + if force: |
| 235 | + Document.objects.filter( |
| 236 | + metadata__parents=DocumentationCategory.WEBSITE |
| 237 | + ).delete() |
261 | 238 |
|
262 | | - def _sync_views_to_db(self): |
263 | | - """ |
264 | | - Sync the specific views into search based on the release documents |
265 | | - support end date. |
266 | | - """ |
267 | | - if self.lang != "en": |
268 | | - return # The searchable views are only written in English currently |
| 239 | + doc_urls = set( |
| 240 | + Document.objects.filter( |
| 241 | + metadata__parents=DocumentationCategory.WEBSITE |
| 242 | + ).values_list("path", flat=True) |
| 243 | + ) |
269 | 244 |
|
270 | | - Document.objects.bulk_create( |
271 | | - [ |
272 | | - Document( |
| 245 | + for sitemap in sitemaps.values(): |
| 246 | + for url in sitemap().get_urls(): |
| 247 | + path = url["location"] |
| 248 | + if path in doc_urls: |
| 249 | + continue |
| 250 | + try: |
| 251 | + page_html = fetch_html(path) |
| 252 | + except requests.RequestException: |
| 253 | + continue |
| 254 | + try: |
| 255 | + main_html = extract_inner_html(page_html, tag="main") |
| 256 | + title = extract_inner_html(page_html, tag="h1") |
| 257 | + except ValueError: |
| 258 | + continue |
| 259 | + Document.objects.create( |
273 | 260 | release=self, |
274 | | - path=searchable_view.www_absolute_url, |
275 | | - title=searchable_view.page_title, |
| 261 | + path=path, |
| 262 | + title=title, |
276 | 263 | metadata={ |
277 | | - "body": searchable_view.html, |
| 264 | + "body": main_html, |
278 | 265 | "breadcrumbs": [ |
279 | 266 | { |
280 | 267 | "path": DocumentationCategory.WEBSITE, |
281 | 268 | "title": "Website", |
282 | 269 | }, |
283 | 270 | ], |
284 | 271 | "parents": DocumentationCategory.WEBSITE, |
285 | | - "slug": searchable_view.url_name, |
286 | | - "title": searchable_view.page_title, |
| 272 | + "title": title, |
287 | 273 | "toc": "", |
288 | 274 | }, |
289 | 275 | config=get_search_config(self.lang), |
290 | 276 | ) |
291 | | - for searchable_view in SEARCHABLE_VIEWS |
292 | | - ] |
293 | | - ) |
294 | 277 |
|
295 | 278 |
|
296 | 279 | def _clean_document_path(path): |
@@ -351,6 +334,14 @@ def search(self, query_text, release, document_category=None): |
351 | 334 | config=models.F("config"), |
352 | 335 | ) |
353 | 336 | base_filter = Q(release_id=release.id) |
| 337 | + if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev: |
| 338 | + dev_release = DocumentRelease.objects.get_by_version_and_lang( |
| 339 | + version="dev", lang=settings.DEFAULT_LANGUAGE_CODE |
| 340 | + ) |
| 341 | + base_filter |= Q( |
| 342 | + release_id=dev_release.id, |
| 343 | + metadata__parents=DocumentationCategory.WEBSITE, |
| 344 | + ) |
354 | 345 | if document_category: |
355 | 346 | base_filter &= Q(metadata__parents__startswith=document_category) |
356 | 347 | base_qs = ( |
|
0 commit comments