From fc43b710a6f2f137ea8384868da6c83c2bf2cc9b Mon Sep 17 00:00:00 2001 From: Sarah Boyce <42296566+sarahboyce@users.noreply.github.com> Date: Tue, 21 Oct 2025 19:09:28 +0200 Subject: [PATCH 1/2] Synced webpages from sitemap to Document model for search. --- blog/admin.py | 3 +- blog/models.py | 3 - blog/tests.py | 20 -- djangoproject/scss/_style.scss | 2 +- docs/management/commands/update_docs.py | 2 + docs/models.py | 103 +++++----- docs/search.py | 53 ++--- docs/tests/test_models.py | 246 +++++++++++++++++++----- docs/tests/test_utils.py | 38 +++- docs/utils.py | 57 ++++++ 10 files changed, 369 insertions(+), 158 deletions(-) diff --git a/blog/admin.py b/blog/admin.py index ceabd0ff71..f4c3b764a3 100644 --- a/blog/admin.py +++ b/blog/admin.py @@ -16,10 +16,9 @@ class EntryAdmin(admin.ModelAdmin): "pub_date", "is_active", "is_published", - "is_searchable", "author", ) - list_filter = ("is_active", "is_searchable") + list_filter = ("is_active",) exclude = ("summary_html", "body_html") prepopulated_fields = {"slug": ("headline",)} raw_id_fields = ["social_media_card"] diff --git a/blog/models.py b/blog/models.py index f27cffd6cd..6eec356ed6 100644 --- a/blog/models.py +++ b/blog/models.py @@ -37,9 +37,6 @@ def published(self): def active(self): return self.filter(is_active=True) - def searchable(self): - return self.filter(is_searchable=True) - class ContentFormat(models.TextChoices): REST = "reST", "reStructuredText" diff --git a/blog/tests.py b/blog/tests.py index f67bc7ea16..1e91e13e94 100644 --- a/blog/tests.py +++ b/blog/tests.py @@ -70,26 +70,6 @@ def test_manager_published(self): transform=lambda entry: entry.headline, ) - def test_manager_searchable(self): - """ - Make sure that the Entry manager's `searchable` method works - """ - Entry.objects.create( - pub_date=self.yesterday, - is_searchable=False, - headline="not searchable", - slug="a", - ) - Entry.objects.create( - pub_date=self.yesterday, is_searchable=True, headline="searchable", slug="b" - ) - - self.assertQuerySetEqual( - Entry.objects.searchable(), - ["searchable"], - transform=lambda entry: entry.headline, - ) - def test_docutils_safe(self): """ Make sure docutils' file inclusion directives are disabled by default. diff --git a/djangoproject/scss/_style.scss b/djangoproject/scss/_style.scss index 08a9a834fd..82d5f4131c 100644 --- a/djangoproject/scss/_style.scss +++ b/djangoproject/scss/_style.scss @@ -2650,7 +2650,7 @@ search.filters { position: relative; a { - padding: 10px 20px; + padding: 10px 15px; text-decoration: none; border-bottom: 3px solid transparent; transition: color 0.3s ease, border-bottom 0.3s ease; diff --git a/docs/management/commands/update_docs.py b/docs/management/commands/update_docs.py index acc0069b61..b5b01131f1 100644 --- a/docs/management/commands/update_docs.py +++ b/docs/management/commands/update_docs.py @@ -134,6 +134,8 @@ def build_doc_release(self, release, force=False, interactive=False): if self.verbosity >= 1: self.stdout.write(f"Starting update for {release} at {datetime.now()}...") + release.sync_from_sitemap(force=force) + # checkout_dir is shared for all languages. checkout_dir = settings.DOCS_BUILD_ROOT / "sources" / release.version parent_build_dir = settings.DOCS_BUILD_ROOT / release.lang / release.version diff --git a/docs/models.py b/docs/models.py index b5ad287009..1e6c664e3d 100644 --- a/docs/models.py +++ b/docs/models.py @@ -5,6 +5,7 @@ from functools import partial, reduce from pathlib import Path +import requests from django.conf import settings from django.contrib.postgres.indexes import GinIndex from django.contrib.postgres.search import ( @@ -26,19 +27,19 @@ from django.utils.html import strip_tags from django_hosts.resolvers import reverse -from blog.models import Entry from releases.models import Release from . import utils from .search import ( DEFAULT_TEXT_SEARCH_CONFIG, - SEARCHABLE_VIEWS, START_SEL, STOP_SEL, TSEARCH_CONFIG_LANGUAGES, DocumentationCategory, + fetch_html, get_document_search_vector, ) +from .utils import extract_inner_html def get_search_config(lang): @@ -185,7 +186,7 @@ def sync_to_db(self, decoded_documents): the database. Deletes all the release's documents first then reinserts them as needed. """ - self.documents.all().delete() + self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete() # Read excluded paths from robots.docs.txt. robots_path = settings.BASE_DIR / "djangoproject" / "static" / "robots.docs.txt" @@ -216,65 +217,51 @@ def sync_to_db(self, decoded_documents): metadata=document, config=get_search_config(self.lang), ) - for document in self.documents.all(): + for document in self.documents.exclude( + metadata__parents=DocumentationCategory.WEBSITE + ): document.metadata["breadcrumbs"] = list( Document.objects.breadcrumbs(document).values("title", "path") ) document.save(update_fields=("metadata",)) - self._sync_blog_to_db() - self._sync_views_to_db() + def sync_from_sitemap(self, force=False): + from djangoproject.urls.www import sitemaps - def _sync_blog_to_db(self): - """ - Sync the blog entries into search based on the release documents - support end date. - """ - if self.lang != "en": - return # The blog is only written in English currently + if not self.is_dev: + return - entries = Entry.objects.published().searchable() - Document.objects.bulk_create( - [ - Document( - release=self, - path=entry.get_absolute_url(), - title=entry.headline, - metadata={ - "body": entry.body_html, - "breadcrumbs": [ - { - "path": DocumentationCategory.WEBSITE, - "title": "News", - }, - ], - "parents": DocumentationCategory.WEBSITE, - "slug": entry.slug, - "title": entry.headline, - "toc": "", - }, - config=get_search_config(self.lang), - ) - for entry in entries - ] - ) + if force: + Document.objects.filter( + metadata__parents=DocumentationCategory.WEBSITE + ).delete() - def _sync_views_to_db(self): - """ - Sync the specific views into search based on the release documents - support end date. - """ - if self.lang != "en": - return # The searchable views are only written in English currently + doc_urls = set( + Document.objects.filter( + metadata__parents=DocumentationCategory.WEBSITE + ).values_list("path", flat=True) + ) - Document.objects.bulk_create( - [ - Document( + for sitemap in sitemaps.values(): + for url in sitemap().get_urls(): + path = url["location"] + if path in doc_urls: + continue + try: + page_html = fetch_html(path) + except requests.RequestException: + continue + try: + main_html = extract_inner_html(page_html, tag="main") + title = extract_inner_html(page_html, tag="h1") + except ValueError: + continue + Document.objects.create( release=self, - path=searchable_view.www_absolute_url, - title=searchable_view.page_title, + path=path, + title=title, metadata={ - "body": searchable_view.html, + "body": main_html, "breadcrumbs": [ { "path": DocumentationCategory.WEBSITE, @@ -282,15 +269,11 @@ def _sync_views_to_db(self): }, ], "parents": DocumentationCategory.WEBSITE, - "slug": searchable_view.url_name, - "title": searchable_view.page_title, + "title": title, "toc": "", }, config=get_search_config(self.lang), ) - for searchable_view in SEARCHABLE_VIEWS - ] - ) def _clean_document_path(path): @@ -351,6 +334,14 @@ def search(self, query_text, release, document_category=None): config=models.F("config"), ) base_filter = Q(release_id=release.id) + if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev: + dev_release = DocumentRelease.objects.get_by_version_and_lang( + version="dev", lang=settings.DEFAULT_LANGUAGE_CODE + ) + base_filter |= Q( + release_id=dev_release.id, + metadata__parents=DocumentationCategory.WEBSITE, + ) if document_category: base_filter &= Q(metadata__parents__startswith=document_category) base_qs = ( diff --git a/docs/search.py b/docs/search.py index 0b7eaef1d3..d6d1ec9959 100644 --- a/docs/search.py +++ b/docs/search.py @@ -1,11 +1,8 @@ -from dataclasses import dataclass - +import requests from django.contrib.postgres.search import SearchVector from django.db.models import TextChoices from django.db.models.fields.json import KeyTextTransform -from django.template.loader import get_template from django.utils.translation import gettext_lazy as _ -from django_hosts import reverse # Imported from # https://github.com/postgres/postgres/blob/REL_14_STABLE/src/bin/initdb/initdb.c#L659 @@ -81,25 +78,31 @@ def parse(cls, value, default=None): return None -@dataclass -class SearchableView: - page_title: str - url_name: str - template: str - - @property - def html(self): - return get_template(self.template).render() - - @property - def www_absolute_url(self): - return reverse(self.url_name, host="www") - +def fetch_html(url, timeout=10): + """ + Fetch the HTML of a page if status code is 200. + Simulates a human browser and accepts only text/html. + """ -SEARCHABLE_VIEWS = [ - SearchableView( - page_title="Django's Ecosystem", - url_name="community-ecosystem", - template="aggregator/ecosystem.html", - ), -] + headers = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0" + ), + "Accept": "text/html", + "Accept-Language": "en-US,en;q=0.9", + } + + response = requests.get(url, headers=headers, timeout=timeout) + + if response.status_code == 200: + content_type = response.headers.get("Content-Type", "") + if "text/html" in content_type: + return response.text + else: + raise requests.RequestException(f"Unexpected Content-Type: {content_type}") + else: + raise requests.RequestException( + f"Failed to fetch {url}, status code: {response.status_code}" + ) diff --git a/docs/tests/test_models.py b/docs/tests/test_models.py index a5e22065b8..205d1e1177 100644 --- a/docs/tests/test_models.py +++ b/docs/tests/test_models.py @@ -1,13 +1,14 @@ import datetime from operator import attrgetter +from unittest import mock +import requests_mock from django.conf import settings from django.db import connection from django.test import TestCase from django.utils import timezone -from django_hosts import reverse -from blog.models import Entry +from blog.models import ContentFormat, Entry from releases.models import Release from ..models import Document, DocumentRelease @@ -184,6 +185,9 @@ def test_get_available_languages_by_version(self): class DocumentManagerTest(TestCase): @classmethod def setUpTestData(cls): + cls.dev_release = DocumentRelease.objects.create( + lang=settings.DEFAULT_LANGUAGE_CODE + ) cls.release = DocumentRelease.objects.create( release=Release.objects.create(version="1.2.3"), ) @@ -358,6 +362,20 @@ def setUpTestData(cls): "release": cls.release_fr, "title": "Notes de publication de Django 1.9.4", }, + { + "metadata": { + "body": "Main 1", + "breadcrumbs": [ + {"path": DocumentationCategory.WEBSITE, "title": "Website"} + ], + "parents": DocumentationCategory.WEBSITE, + "title": "Title 1", + "toc": "", + }, + "path": "example", + "release": cls.dev_release, + "title": "Blog post", + }, ] Document.objects.bulk_create(Document(**doc) for doc in documents) @@ -457,28 +475,21 @@ def test_search_title(self): ), ) + def test_website_document_items_included_english(self): + self.assertQuerySetEqual( + Document.objects.search("Main", self.release), + ["Blog post"], + transform=attrgetter("title"), + ) + + def test_website_document_items_excluded_non_english(self): + self.assertEqual(Document.objects.search("Main", self.release_fr).count(), 0) + class UpdateDocTests(TestCase): @classmethod def setUpTestData(cls): - now = timezone.now() - cls.release = DocumentRelease.objects.create( - release=Release.objects.create( - version="1.0.0", - eol_date=now + datetime.timedelta(days=1), - ) - ) - cls.entry = Entry.objects.create( - pub_date=now, - is_active=True, - is_searchable=True, - headline="Searchable post", - slug="a", - body_html="
Hello
Hello
"), + ( + 'Bye
Bye
"), + ]: + with self.subTest(html=html): + self.assertEqual(extract_inner_html(html, tag="main"), expected_output) + + def test_extract_inner_html_multiple_same_tags_raises(self): + with self.assertRaisesMessage( + ValueError, "Test
", tag="main") diff --git a/docs/utils.py b/docs/utils.py index bbbf579008..973f684b9b 100644 --- a/docs/utils.py +++ b/docs/utils.py @@ -1,5 +1,6 @@ import re import unicodedata +from html.parser import HTMLParser from pathlib import Path from django.conf import settings @@ -93,3 +94,59 @@ def get_module_path(name, full_path): if full_path.endswith(name_suffix): return full_path.removesuffix(name_suffix) return None + + +class SingleTagInnerHTMLExtractor(HTMLParser): + def __init__(self, target_tag): + super().__init__() + self.target_tag = target_tag.lower() + self.capturing = False + self.inner_html = [] + self.tag_count = 0 + + def handle_starttag(self, tag, attrs): + tag = tag.lower() + if tag == self.target_tag: + self.tag_count += 1 + if self.capturing: + # Nested target tag not allowed. + raise ValueError(f"Nested <{self.target_tag}> tags are not allowed.") + self.capturing = True + elif self.capturing: + self.inner_html.append(self.get_starttag_text()) + + def handle_endtag(self, tag): + tag = tag.lower() + if self.capturing: + if tag == self.target_tag: + self.capturing = False + else: + self.inner_html.append(f"{tag}>") + + def handle_data(self, data): + if self.capturing: + self.inner_html.append(data) + + def handle_entityref(self, name): + if self.capturing: + self.inner_html.append(f"&{name};") + + def handle_charref(self, name): + if self.capturing: + self.inner_html.append(f"{name};") + + +def extract_inner_html(html, tag): + """ + Extracts the inner HTML of a tag that appears exactly once. + """ + parser = SingleTagInnerHTMLExtractor(tag) + parser.feed(html) + parser.close() + + if parser.tag_count == 0: + raise ValueError(f"<{tag}> not found in HTML.") + if parser.tag_count > 1: + raise ValueError(f"<{tag}> occurs more than once in HTML.") + + return "".join(parser.inner_html) From 1f1fc2da007c950d3b345210105a66dbe45a804d Mon Sep 17 00:00:00 2001 From: Sarah Boyce <42296566+sarahboyce@users.noreply.github.com> Date: Sun, 19 Oct 2025 14:07:52 +0200 Subject: [PATCH 2/2] Added static views to sitemap. --- djangoproject/sitemaps.py | 76 +++++++++++++++++++++++++++++++++++++++ djangoproject/tests.py | 7 ++++ djangoproject/urls/www.py | 3 ++ docs/sitemaps.py | 23 ++++-------- docs/tests/test_models.py | 31 ++++++++++------ 5 files changed, 112 insertions(+), 28 deletions(-) create mode 100644 djangoproject/sitemaps.py diff --git a/djangoproject/sitemaps.py b/djangoproject/sitemaps.py new file mode 100644 index 0000000000..3c7becbf3a --- /dev/null +++ b/djangoproject/sitemaps.py @@ -0,0 +1,76 @@ +from dataclasses import dataclass + +from django.contrib import sitemaps +from django_hosts.resolvers import reverse + + +@dataclass +class URLObject: + name: str + host: str = "www" + + +class LocationAbsoluteUrlMixin: + def get_urls(self, site=None, **kwargs): + """ + Prevent the Django sitemap framework from prefixing the domain. + Use the absolute URL returned by location(). + """ + urls = [] + for item in self.items(): + loc = self.location(item) + urls.append( + { + "location": loc, + "lastmod": None, + "changefreq": self.changefreq, + "priority": self.priority, + } + ) + return urls + + +class StaticViewSitemap(LocationAbsoluteUrlMixin, sitemaps.Sitemap): + priority = 0.5 + changefreq = "monthly" + + def items(self): + return [ + # accounts + URLObject("registration_register"), + # aggregator + URLObject("community-index"), + URLObject("community-ecosystem"), + URLObject("local-django-communities"), + # contact + URLObject("contact_foundation"), + # dashboard + URLObject("dashboard-index", host="dashboard"), + URLObject("metric-list", host="dashboard"), + # djangoproject + URLObject("homepage"), + URLObject("overview"), + URLObject("start"), + URLObject("code_of_conduct"), + URLObject("conduct_faq"), + URLObject("conduct_reporting"), + URLObject("conduct_enforcement"), + URLObject("conduct_changes"), + URLObject("diversity"), + URLObject("diversity_changes"), + # foundation + URLObject("foundation_meeting_archive_index"), + # fundraising + URLObject("fundraising:index"), + # members + URLObject("members:individual-members"), + URLObject("members:corporate-members"), + URLObject("members:corporate-members-join"), + URLObject("members:corporate-members-badges"), + URLObject("members:teams"), + # releases + URLObject("download"), + ] + + def location(self, item): + return reverse(item.name, host=item.host) diff --git a/djangoproject/tests.py b/djangoproject/tests.py index 2c67432110..5e8355601b 100644 --- a/djangoproject/tests.py +++ b/djangoproject/tests.py @@ -202,6 +202,7 @@ def test_single_h1_per_page(self): "styleguide/", # Has multiple