From 81ec9ee8250e6e0082eb98fb0d32b0571a64d99a Mon Sep 17 00:00:00 2001 From: Marek Honzal Date: Thu, 28 May 2026 17:39:39 +0200 Subject: [PATCH 1/7] feat: add header with navigation links to all .md pages --- package.json | 2 +- scripts/addNavHeaders.mjs | 300 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 scripts/addNavHeaders.mjs diff --git a/package.json b/package.json index f84be4a952..4ca0086043 100644 --- a/package.json +++ b/package.json @@ -47,7 +47,7 @@ "test:academy": "bats --print-output-on-failure -r .", "test:llms-size": "node ./scripts/checkLlmsSize.mjs", "postinstall": "patch-package", - "postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs" + "postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs && node ./scripts/addNavHeaders.mjs" }, "devDependencies": { "@apify/oxlint-config": "^0.2.5", diff --git a/scripts/addNavHeaders.mjs b/scripts/addNavHeaders.mjs new file mode 100644 index 0000000000..76dfdcd22c --- /dev/null +++ b/scripts/addNavHeaders.mjs @@ -0,0 +1,300 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; + +// Prepend a Vercel-style nav header (frontmatter) to every per-page `.md` file +// generated by `@signalwire/docusaurus-plugin-llms-txt`. See +// `/Users/marekhonzal/.claude/plans/let-s-create-a-proper-nifty-summit.md` +// and issue #2557. +// +// Why post-build (not a remark/rehype plugin): the llms-txt plugin writes +// `llms-full.txt` from the SAME per-page content during its own `postBuild`. +// Editing the per-page `.md` files AFTER that — in npm `postbuild` — keeps the +// header out of `llms-full.txt` for free, as required by the issue. +// +// Data sources (both generated by `docusaurus build`): +// - `.docusaurus/docusaurus-plugin-content-docs//p/*.json` +// The real left-menu tree (`version.docsSidebars`) — labels, order, +// nesting, hrefs. One file per docs instance (platform, academy, legal, +// openapi). Academy has TWO sidebars (`courses`, `tutorials`). +// - `.docusaurus/docusaurus-plugin-llms-txt/cache.json` +// Route → title fallback for pages absent from every sidebar +// (`/`, `/api`, `/open-source`, `/sdk`) and for section-landing titles +// where the sidebar label is "Home" but the page title is more descriptive. + +const BUILD_DIR = path.resolve('build'); +const DOCUSAURUS_DIR = path.resolve('.docusaurus'); +const CONTENT_DOCS_DIR = path.join(DOCUSAURUS_DIR, 'docusaurus-plugin-content-docs'); +const LLMS_CACHE_FILE = path.join(DOCUSAURUS_DIR, 'docusaurus-plugin-llms-txt', 'cache.json'); + +// Aligned with scripts/joinLlmsFiles.mjs + scripts/indentLlmsFile.mjs so +// preview/PR builds (which set APIFY_DOCS_ABSOLUTE_URL) get correct URLs. +const SITE_URL = process.env.APIFY_DOCS_ABSOLUTE_URL || 'https://docs.apify.com'; + +// --- URL / label helpers --------------------------------------------------- + +function normalizeRoute(href) { + if (typeof href !== 'string' || href.length === 0) return null; + return href.length > 1 && href.endsWith('/') ? href.slice(0, -1) : href; +} + +function routeToUrl(route) { + // Defensive: at least one source page (academy cloudflare-challenge) has a + // `slug:` that already ends in `.md`. Don't double up the suffix. + const stripped = route.endsWith('.md') ? route.slice(0, -3) : route; + return `${SITE_URL}${stripped}.md`; +} + +// Escape characters that would break a `[Label](url)` markdown link parse. +// Labels are almost always clean; this is a safety net. +function escapeLinkLabel(label) { + return label.replace(/\\/g, '\\\\').replace(/\]/g, '\\]'); +} + +function mdLink(label, route) { + return `[${escapeLinkLabel(label)}](${routeToUrl(route)})`; +} + +// Quote the `title:` value if it contains characters that would derail a +// downstream YAML-ish parser reading the title line. We intentionally don't +// commit to strict YAML for the body (the `- [Label](url)` list items aren't +// valid YAML scalars), but the title line itself stays parseable. +const TITLE_NEEDS_QUOTE = /[:#&*!|>'"%@`[\]{},\n]|^\s|\s$|^[\d?-]/; +function yamlScalar(value) { + if (TITLE_NEEDS_QUOTE.test(value)) { + return `"${value.replace(/\\/g, '\\\\').replace(/"/g, '\\"')}"`; + } + return value; +} + +// --- Sidebar walking ------------------------------------------------------- + +/** + * Walk one sidebar tree, populating: + * - `indexMap`: normalizedRoute → { label, parents[], children[] } + * - `flat`: flat DFS list of {route, label} for prev/next within this sidebar + * + * Rules (per approved plan): + * - Skip `unlisted: true` nodes entirely (not in any list, no own entry). + * - Parents = sidebar category ancestors that have an `href`. Href-less + * categories ("Programming", API's "Actors") pass through but aren't + * recorded as parents — they're nav grouping labels, not real pages. + * - Children = the node's direct `items[]`. For href-less category children, + * we recurse ONE level to lift their href-bearing children up, so direct + * grouping is preserved even though the empty label itself is dropped. + * - First occurrence of a route wins (a route in two sidebars only ends up in + * the first-walked one's prev/next sequence). + */ +function walk(node, ancestors, indexMap, flat) { + const isUnlisted = node.unlisted === true; + const hasHref = typeof node.href === 'string' && node.href.length > 0; + const route = hasHref ? normalizeRoute(node.href) : null; + + if (route && !isUnlisted && !indexMap.has(route)) { + const parents = ancestors + .filter((a) => a.route && !a.unlisted) + .map((a) => ({ label: a.label, route: a.route })); + + const children = []; + if (Array.isArray(node.items)) { + for (const child of node.items) { + if (child.unlisted === true) continue; + const childRoute = normalizeRoute(child.href); + if (childRoute) { + children.push({ label: child.label, route: childRoute }); + } else if (child.type === 'category' && Array.isArray(child.items)) { + // Lift one level through an href-less grouping category. + for (const grand of child.items) { + if (grand.unlisted === true) continue; + const grandRoute = normalizeRoute(grand.href); + if (grandRoute) { + children.push({ label: grand.label, route: grandRoute }); + } + } + } + } + } + + indexMap.set(route, { label: node.label, parents, children }); + flat.push({ route, label: node.label }); + } + + if (Array.isArray(node.items)) { + const next = [...ancestors, { label: node.label, route, unlisted: isUnlisted }]; + for (const child of node.items) walk(child, next, indexMap, flat); + } +} + +async function loadSidebars() { + const indexMap = new Map(); + const flatLists = []; // each is an array of {route, label} for one sidebar + + let instances; + try { + instances = await fs.readdir(CONTENT_DOCS_DIR, { withFileTypes: true }); + } catch { + console.warn(`addNavHeaders: ${CONTENT_DOCS_DIR} missing — no sidebars to load.`); + return { indexMap, flatLists }; + } + + for (const dirent of instances) { + if (!dirent.isDirectory()) continue; + const pDir = path.join(CONTENT_DOCS_DIR, dirent.name, 'p'); + let propFiles; + try { + propFiles = await fs.readdir(pDir); + } catch { + continue; + } + for (const file of propFiles) { + if (!file.endsWith('.json')) continue; + const data = JSON.parse(await fs.readFile(path.join(pDir, file), 'utf8')); + const sidebars = data?.version?.docsSidebars; + if (!sidebars) continue; + for (const items of Object.values(sidebars)) { + if (!Array.isArray(items)) continue; + const flat = []; + for (const item of items) walk(item, [], indexMap, flat); + flatLists.push(flat); + } + } + } + + return { indexMap, flatLists }; +} + +// --- Cache (title fallback) ------------------------------------------------ + +async function loadCacheTitles() { + try { + const data = JSON.parse(await fs.readFile(LLMS_CACHE_FILE, 'utf8')); + const map = new Map(); + for (const r of data.routes || []) { + if (r.path && r.title) map.set(normalizeRoute(r.path), r.title); + } + return map; + } catch (err) { + console.warn('addNavHeaders: could not read llms-txt cache:', err.message); + return new Map(); + } +} + +// --- File processing ------------------------------------------------------- + +async function* walkMd(dir) { + const entries = await fs.readdir(dir, { withFileTypes: true }); + for (const entry of entries) { + const full = path.join(dir, entry.name); + if (entry.isDirectory()) { + yield* walkMd(full); + } else if (entry.isFile() && entry.name.endsWith('.md')) { + yield full; + } + } +} + +function fileToRoute(filePath) { + const rel = path.relative(BUILD_DIR, filePath); + const noExt = rel.replace(/\.md$/, ''); + // Defensive: `/index.md` would map to `` (the signalwire plugin + // doesn't emit those — pages live at `.md` — but handle it anyway). + const noIndex = noExt === 'index' + ? '' + : noExt.endsWith('/index') + ? noExt.slice(0, -'/index'.length) + : noExt; + return `/${noIndex}`; +} + +function titleFromContent(content) { + const match = content.match(/^#\s+(.+?)\s*$/m); + return match ? match[1].trim() : null; +} + +function renderHeader({ title, url, parents, children, previous, next }) { + const lines = ['---']; + lines.push(`title: ${yamlScalar(title)}`); + lines.push(`url: ${url}`); + if (parents && parents.length > 0) { + lines.push('parents:'); + for (const p of parents) lines.push(` - ${mdLink(p.label, p.route)}`); + } + if (children && children.length > 0) { + lines.push('children:'); + for (const c of children) lines.push(` - ${mdLink(c.label, c.route)}`); + } + if (previous) lines.push(`previous: ${mdLink(previous.label, previous.route)}`); + if (next) lines.push(`next: ${mdLink(next.label, next.route)}`); + lines.push('---'); + lines.push(''); + return `${lines.join('\n')}\n`; +} + +async function main() { + try { + await fs.access(BUILD_DIR); + } catch { + console.warn(`addNavHeaders: ${BUILD_DIR} not found — nothing to do.`); + return; + } + + const [{ indexMap, flatLists }, cacheTitles] = await Promise.all([ + loadSidebars(), + loadCacheTitles(), + ]); + + // Build prev/next from each sidebar's flat order independently. Doesn't + // cross sidebars (e.g. academy's `courses` vs `tutorials`). + const prevNext = new Map(); + for (const flat of flatLists) { + for (let i = 0; i < flat.length; i++) { + const pn = {}; + if (i > 0) pn.previous = flat[i - 1]; + if (i < flat.length - 1) pn.next = flat[i + 1]; + if (!prevNext.has(flat[i].route)) prevNext.set(flat[i].route, pn); + } + } + + let processed = 0; + let skipped = 0; + let unknown = 0; + + for await (const filePath of walkMd(BUILD_DIR)) { + const content = await fs.readFile(filePath, 'utf8'); + if (content.startsWith('---\n')) { + skipped++; + continue; + } + + const route = fileToRoute(filePath); + const sidebarEntry = indexMap.get(route); + const pn = prevNext.get(route) || {}; + + // Title preference: cache (true page title, e.g. "Apify platform + // documentation") > sidebar label (sometimes shorter, e.g. "Home") > + // first H1 in content > generic fallback. + const title = cacheTitles.get(route) + || sidebarEntry?.label + || titleFromContent(content) + || 'Untitled'; + + if (!sidebarEntry && !cacheTitles.has(route)) unknown++; + + const header = renderHeader({ + title, + url: routeToUrl(route), + parents: sidebarEntry?.parents || [], + children: sidebarEntry?.children || [], + previous: pn.previous, + next: pn.next, + }); + + await fs.writeFile(filePath, header + content, 'utf8'); + processed++; + } + + console.log( + `addNavHeaders: wrote headers to ${processed} files (${skipped} already had frontmatter, ${unknown} without sidebar or cache title)`, + ); +} + +await main(); From 95b2b56a1831a8ac2ac5eff2db5f536835111840 Mon Sep 17 00:00:00 2001 From: Marek Honzal Date: Thu, 28 May 2026 17:43:53 +0200 Subject: [PATCH 2/7] refactor: post review cleanup --- scripts/addNavHeaders.mjs | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/scripts/addNavHeaders.mjs b/scripts/addNavHeaders.mjs index 76dfdcd22c..503f0edca8 100644 --- a/scripts/addNavHeaders.mjs +++ b/scripts/addNavHeaders.mjs @@ -2,9 +2,7 @@ import fs from 'node:fs/promises'; import path from 'node:path'; // Prepend a Vercel-style nav header (frontmatter) to every per-page `.md` file -// generated by `@signalwire/docusaurus-plugin-llms-txt`. See -// `/Users/marekhonzal/.claude/plans/let-s-create-a-proper-nifty-summit.md` -// and issue #2557. +// generated by `@signalwire/docusaurus-plugin-llms-txt`. See issue #2557. // // Why post-build (not a remark/rehype plugin): the llms-txt plugin writes // `llms-full.txt` from the SAME per-page content during its own `postBuild`. @@ -47,7 +45,7 @@ function routeToUrl(route) { // Escape characters that would break a `[Label](url)` markdown link parse. // Labels are almost always clean; this is a safety net. function escapeLinkLabel(label) { - return label.replace(/\\/g, '\\\\').replace(/\]/g, '\\]'); + return label.replace(/\\/g, '\\\\').replace(/([[\]])/g, '\\$1'); } function mdLink(label, route) { @@ -59,7 +57,7 @@ function mdLink(label, route) { // commit to strict YAML for the body (the `- [Label](url)` list items aren't // valid YAML scalars), but the title line itself stays parseable. const TITLE_NEEDS_QUOTE = /[:#&*!|>'"%@`[\]{},\n]|^\s|\s$|^[\d?-]/; -function yamlScalar(value) { +function quoteTitle(value) { if (TITLE_NEEDS_QUOTE.test(value)) { return `"${value.replace(/\\/g, '\\\\').replace(/"/g, '\\"')}"`; } @@ -194,15 +192,7 @@ async function* walkMd(dir) { function fileToRoute(filePath) { const rel = path.relative(BUILD_DIR, filePath); - const noExt = rel.replace(/\.md$/, ''); - // Defensive: `/index.md` would map to `` (the signalwire plugin - // doesn't emit those — pages live at `.md` — but handle it anyway). - const noIndex = noExt === 'index' - ? '' - : noExt.endsWith('/index') - ? noExt.slice(0, -'/index'.length) - : noExt; - return `/${noIndex}`; + return `/${rel.replace(/\.md$/, '')}`; } function titleFromContent(content) { @@ -212,7 +202,7 @@ function titleFromContent(content) { function renderHeader({ title, url, parents, children, previous, next }) { const lines = ['---']; - lines.push(`title: ${yamlScalar(title)}`); + lines.push(`title: ${quoteTitle(title)}`); lines.push(`url: ${url}`); if (parents && parents.length > 0) { lines.push('parents:'); @@ -256,7 +246,7 @@ async function main() { let processed = 0; let skipped = 0; - let unknown = 0; + const unknownRoutes = []; for await (const filePath of walkMd(BUILD_DIR)) { const content = await fs.readFile(filePath, 'utf8'); @@ -277,7 +267,7 @@ async function main() { || titleFromContent(content) || 'Untitled'; - if (!sidebarEntry && !cacheTitles.has(route)) unknown++; + if (!sidebarEntry && !cacheTitles.has(route)) unknownRoutes.push(route); const header = renderHeader({ title, @@ -293,8 +283,11 @@ async function main() { } console.log( - `addNavHeaders: wrote headers to ${processed} files (${skipped} already had frontmatter, ${unknown} without sidebar or cache title)`, + `addNavHeaders: wrote headers to ${processed} files (${skipped} already had frontmatter, ${unknownRoutes.length} without sidebar or cache title)`, ); + if (unknownRoutes.length > 0) { + console.log(`addNavHeaders: routes without sidebar or cache title:\n ${unknownRoutes.sort().join('\n ')}`); + } } await main(); From df958c0e0cddde488b8955dcda3f0aba9cc725ba Mon Sep 17 00:00:00 2001 From: Marek Honzal Date: Thu, 28 May 2026 17:51:38 +0200 Subject: [PATCH 3/7] chore: rename `pn` to `nav`, format --- scripts/addNavHeaders.mjs | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/scripts/addNavHeaders.mjs b/scripts/addNavHeaders.mjs index 503f0edca8..786fbc039f 100644 --- a/scripts/addNavHeaders.mjs +++ b/scripts/addNavHeaders.mjs @@ -227,20 +227,17 @@ async function main() { return; } - const [{ indexMap, flatLists }, cacheTitles] = await Promise.all([ - loadSidebars(), - loadCacheTitles(), - ]); + const [{ indexMap, flatLists }, cacheTitles] = await Promise.all([loadSidebars(), loadCacheTitles()]); // Build prev/next from each sidebar's flat order independently. Doesn't // cross sidebars (e.g. academy's `courses` vs `tutorials`). const prevNext = new Map(); for (const flat of flatLists) { for (let i = 0; i < flat.length; i++) { - const pn = {}; - if (i > 0) pn.previous = flat[i - 1]; - if (i < flat.length - 1) pn.next = flat[i + 1]; - if (!prevNext.has(flat[i].route)) prevNext.set(flat[i].route, pn); + const nav = {}; + if (i > 0) nav.previous = flat[i - 1]; + if (i < flat.length - 1) nav.next = flat[i + 1]; + if (!prevNext.has(flat[i].route)) prevNext.set(flat[i].route, nav); } } @@ -257,15 +254,12 @@ async function main() { const route = fileToRoute(filePath); const sidebarEntry = indexMap.get(route); - const pn = prevNext.get(route) || {}; + const nav = prevNext.get(route) || {}; // Title preference: cache (true page title, e.g. "Apify platform // documentation") > sidebar label (sometimes shorter, e.g. "Home") > // first H1 in content > generic fallback. - const title = cacheTitles.get(route) - || sidebarEntry?.label - || titleFromContent(content) - || 'Untitled'; + const title = cacheTitles.get(route) || sidebarEntry?.label || titleFromContent(content) || 'Untitled'; if (!sidebarEntry && !cacheTitles.has(route)) unknownRoutes.push(route); @@ -274,8 +268,8 @@ async function main() { url: routeToUrl(route), parents: sidebarEntry?.parents || [], children: sidebarEntry?.children || [], - previous: pn.previous, - next: pn.next, + previous: nav.previous, + next: nav.next, }); await fs.writeFile(filePath, header + content, 'utf8'); From 48fa6c82e144628909aefd57b8030358d5a5b356 Mon Sep 17 00:00:00 2001 From: Marek Honzal Date: Fri, 29 May 2026 13:22:12 +0200 Subject: [PATCH 4/7] refactor: abstract duplicated walk --- scripts/addNavHeaders.mjs | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/scripts/addNavHeaders.mjs b/scripts/addNavHeaders.mjs index 786fbc039f..b885d42288 100644 --- a/scripts/addNavHeaders.mjs +++ b/scripts/addNavHeaders.mjs @@ -66,6 +66,16 @@ function quoteTitle(value) { // --- Sidebar walking ------------------------------------------------------- +// Push `{ label, route }` onto `list` if `node` is listed and has a real route. +// Returns whether a route was found, so the caller can decide whether to lift +// children up through an href-less grouping category. +function pushRoutedChild(list, node) { + if (node.unlisted === true) return false; + const route = normalizeRoute(node.href); + if (route) list.push({ label: node.label, route }); + return route !== null; +} + /** * Walk one sidebar tree, populating: * - `indexMap`: normalizedRoute → { label, parents[], children[] } @@ -96,18 +106,10 @@ function walk(node, ancestors, indexMap, flat) { if (Array.isArray(node.items)) { for (const child of node.items) { if (child.unlisted === true) continue; - const childRoute = normalizeRoute(child.href); - if (childRoute) { - children.push({ label: child.label, route: childRoute }); - } else if (child.type === 'category' && Array.isArray(child.items)) { - // Lift one level through an href-less grouping category. - for (const grand of child.items) { - if (grand.unlisted === true) continue; - const grandRoute = normalizeRoute(grand.href); - if (grandRoute) { - children.push({ label: grand.label, route: grandRoute }); - } - } + // Lift one level through an href-less grouping category so its + // routed children stay grouped under this node. + if (!pushRoutedChild(children, child) && child.type === 'category' && Array.isArray(child.items)) { + for (const grand of child.items) pushRoutedChild(children, grand); } } } From 0d019c6d065f77fda447b32a736e1afc3ac09a0b Mon Sep 17 00:00:00 2001 From: Marek Honzal Date: Fri, 29 May 2026 13:41:46 +0200 Subject: [PATCH 5/7] feat: add upward parents all the way up --- scripts/addNavHeaders.mjs | 41 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/scripts/addNavHeaders.mjs b/scripts/addNavHeaders.mjs index b885d42288..32a55afccd 100644 --- a/scripts/addNavHeaders.mjs +++ b/scripts/addNavHeaders.mjs @@ -52,6 +52,39 @@ function mdLink(label, route) { return `[${escapeLinkLabel(label)}](${routeToUrl(route)})`; } +// Synthetic root parent prepended to every page — the agent's link back to the +// full index. It points at `llms.txt` (not a `.md`), so it carries an explicit +// `url` rather than a route. See issue #2557. +const ROOT_PARENT = { label: 'Apify documentation', url: `${SITE_URL}/llms.txt` }; + +// A nav item is either {label, route} (a real per-page `.md`) or {label, url} +// (a literal link, e.g. the llms.txt root). Render whichever it carries. +function navLink(item) { + return item.url ? `[${escapeLinkLabel(item.label)}](${item.url})` : mdLink(item.label, item.route); +} + +// The section landing (`/platform`, `/academy`, `/api`, ...) is modeled in the +// sidebar as a sibling *link* ("Home"), not an ancestor category, so `walk()` +// never records it as a parent. Derive it from the route's first path segment. +function sectionLandingRoute(route) { + const seg = route.split('/').find(Boolean); + return seg ? `/${seg}` : null; +} + +// Full upward chain for a page: llms.txt root, then the section landing, then +// the sidebar-category ancestors from `walk()`. Bounded by site depth (~5), so +// unlike `children` there's no fan-out/size risk. See issue #2557. +function buildParents(route, sidebarParents, titleFor) { + const parents = [ROOT_PARENT]; + const landing = sectionLandingRoute(route); + if (landing && landing !== route && !sidebarParents.some((p) => p.route === landing)) { + const label = titleFor(landing); + if (label) parents.push({ label, route: landing }); + } + parents.push(...sidebarParents); + return parents; +} + // Quote the `title:` value if it contains characters that would derail a // downstream YAML-ish parser reading the title line. We intentionally don't // commit to strict YAML for the body (the `- [Label](url)` list items aren't @@ -208,7 +241,7 @@ function renderHeader({ title, url, parents, children, previous, next }) { lines.push(`url: ${url}`); if (parents && parents.length > 0) { lines.push('parents:'); - for (const p of parents) lines.push(` - ${mdLink(p.label, p.route)}`); + for (const p of parents) lines.push(` - ${navLink(p)}`); } if (children && children.length > 0) { lines.push('children:'); @@ -243,6 +276,10 @@ async function main() { } } + // Resolve a route to its best display title (cache title > sidebar label). + // Used for the section-landing parent, whose sidebar label is "Home". + const titleFor = (r) => cacheTitles.get(r) || indexMap.get(r)?.label || null; + let processed = 0; let skipped = 0; const unknownRoutes = []; @@ -268,7 +305,7 @@ async function main() { const header = renderHeader({ title, url: routeToUrl(route), - parents: sidebarEntry?.parents || [], + parents: buildParents(route, sidebarEntry?.parents || [], titleFor), children: sidebarEntry?.children || [], previous: nav.previous, next: nav.next, From 41f5517cfa783c59b72531a4cd9506fab8d8bdfa Mon Sep 17 00:00:00 2001 From: Marek Honzal Date: Mon, 1 Jun 2026 17:51:08 +0200 Subject: [PATCH 6/7] test: add a test checking headers are present, correct and not leaking into llms-full.txt --- .github/workflows/test.yaml | 5 ++ package.json | 1 + scripts/checkNavHeaders.mjs | 172 ++++++++++++++++++++++++++++++++++++ 3 files changed, 178 insertions(+) create mode 100644 scripts/checkNavHeaders.mjs diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index cbce2a2d57..a7c44f7311 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -221,6 +221,11 @@ jobs: echo "✅ All Nginx header checks passed." + - name: Verify per-page .md navigation headers + # Checks the Vercel-style nav header that scripts/addNavHeaders.mjs + # prepends to every per-page .md in postbuild (#2596 / issue #2557). + run: node scripts/checkNavHeaders.mjs http://localhost:8080 + - name: Stop Nginx if: always() run: nginx -c "$(pwd)/default.conf" -s stop diff --git a/package.json b/package.json index 4ca0086043..5176c57642 100644 --- a/package.json +++ b/package.json @@ -46,6 +46,7 @@ "format:check": "oxfmt --check", "test:academy": "bats --print-output-on-failure -r .", "test:llms-size": "node ./scripts/checkLlmsSize.mjs", + "test:nav-headers": "node ./scripts/checkNavHeaders.mjs", "postinstall": "patch-package", "postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs && node ./scripts/addNavHeaders.mjs" }, diff --git a/scripts/checkNavHeaders.mjs b/scripts/checkNavHeaders.mjs new file mode 100644 index 0000000000..1b90a6be4c --- /dev/null +++ b/scripts/checkNavHeaders.mjs @@ -0,0 +1,172 @@ +// Verify the per-page `.md` navigation header (#2596 / issue #2557). +// +// `scripts/addNavHeaders.mjs` runs in `postbuild` and prepends a Vercel-style +// YAML frontmatter block to every per-page `.md` file docs.apify.com serves, so +// an AI agent reading a single page can follow parents / children / previous / +// next links without first fetching `llms.txt`. This script checks that the +// served pages actually carry that header and that it never leaked into +// `llms-full.txt`. +// +// It runs against a live server over HTTP (in CI: Nginx on :8080), so it tests +// the real served bytes, not the files on disk. +// +// Usage: +// node scripts/checkNavHeaders.mjs [baseUrl] +// baseUrl defaults to $NAV_HEADERS_BASE_URL or http://localhost:8080 + +const BASE = (process.argv[2] || process.env.NAV_HEADERS_BASE_URL || 'http://localhost:8080').replace(/\/$/, ''); +const SITE_URL = 'https://docs.apify.com'; + +// The synthetic root breadcrumb that addNavHeaders.mjs puts at the top of every +// page's `parents:` list. It is generated nowhere else, which makes it both the +// most stable positive marker and a reliable negative marker for the +// llms-full.txt leak check below. +const ROOT_PARENT = `[Apify documentation](${SITE_URL}/llms.txt)`; + +// A nav value is always a markdown link to a docs URL, e.g. [Label](https://...). +// The label part allows escaped sequences (`\\`, `\[`, `\]`) because the producer's +// escapeLinkLabel can emit them, so a literal `]` in a label won't end the match early. +const MD_LINK = /^\[(?:\\.|[^\]\\])+\]\(https:\/\/docs\.apify\.com\/\S+\)$/; + +// Curated, structurally stable pages chosen to cover every frontmatter-key +// combination the feature can emit. `keys` lists the CONDITIONAL keys that must +// be present on that page (title / url / parents are required on every page and +// checked unconditionally). children = category pages; previous / next = pages +// with a sidebar neighbour on that side. Picked so the set as a whole exercises +// children, previous and next, while each page only asserts what its position in +// the tree guarantees (so reordering siblings can't make it flaky). +const PAGES = [ + // Section landings: only the universal keys, sometimes a `next`. + { path: '/platform.md', keys: ['next'] }, + { path: '/academy.md', keys: [] }, + { path: '/api.md', keys: [] }, + { path: '/legal.md', keys: ['next'] }, + // Category pages: have a `children` list plus neighbours. + { path: '/platform/actors.md', keys: ['children', 'previous', 'next'] }, + { path: '/platform/actors/running.md', keys: ['children', 'previous', 'next'] }, + { path: '/platform/storage.md', keys: ['children', 'previous', 'next'] }, + { path: '/platform/integrations.md', keys: ['children', 'previous', 'next'] }, + { path: '/api/v2.md', keys: ['children', 'next'] }, + // Leaf pages: no children, but sit between two neighbours. + { path: '/platform/storage/dataset.md', keys: ['previous', 'next'] }, + { path: '/platform/proxy/usage.md', keys: ['previous', 'next'] }, + { path: '/api/v2/dataset-get.md', keys: ['previous', 'next'] }, + { path: '/academy/tutorials.md', keys: ['next'] }, + { path: '/legal/general-terms-and-conditions.md', keys: ['previous', 'next'] }, +]; + +const failures = []; +const fail = (page, msg) => failures.push(`${page} → ${msg}`); + +async function fetchText(url) { + const res = await fetch(url, { headers: { Accept: 'text/markdown' } }); + if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`); + return res.text(); +} + +// Returns the frontmatter lines (between the opening and closing `---`), or null +// if the body does not start with a frontmatter block. +function parseFrontmatter(body) { + const lines = body.split('\n'); + if (lines[0] !== '---') return null; + const close = lines.indexOf('---', 1); + if (close === -1) return null; + return lines.slice(1, close); +} + +// A scalar key like `title: ...` or `next: [Label](url)`. +function getScalar(front, key) { + const line = front.find((l) => l.startsWith(`${key}: `)); + return line ? line.slice(key.length + 2) : null; +} + +// A block key like `parents:` / `children:` followed by ` - ` lines. +function getList(front, key) { + const idx = front.indexOf(`${key}:`); + if (idx === -1) return null; + const items = []; + for (let i = idx + 1; i < front.length && front[i].startsWith(' - '); i++) { + items.push(front[i].slice(4)); + } + return items; +} + +function checkPage(path, body, expectedKeys) { + const front = parseFrontmatter(body); + if (!front) { + fail(path, 'no frontmatter nav header at top of file'); + return; + } + + // Required on every page: title, url (exactly the page's own URL), parents. + const title = getScalar(front, 'title'); + if (!title) fail(path, 'missing or empty `title`'); + + const expectedUrl = `${SITE_URL}${path}`; + const url = getScalar(front, 'url'); + if (url !== expectedUrl) fail(path, `\`url\` is "${url}", expected "${expectedUrl}"`); + + const parents = getList(front, 'parents'); + if (!parents || parents.length === 0) { + fail(path, 'missing `parents`'); + } else { + if (parents[0] !== ROOT_PARENT) fail(path, `first parent is "${parents[0]}", expected the root breadcrumb`); + for (const p of parents) if (!MD_LINK.test(p)) fail(path, `parent is not a markdown link: "${p}"`); + } + + // Conditional keys this page is expected to carry. + for (const key of expectedKeys) { + if (key === 'children') { + const children = getList(front, 'children'); + if (!children || children.length === 0) { + fail(path, 'expected a non-empty `children` list'); + } else { + for (const c of children) if (!MD_LINK.test(c)) fail(path, `child is not a markdown link: "${c}"`); + } + } else { + const value = getScalar(front, key); + if (value === null) fail(path, `expected \`${key}\``); + else if (!MD_LINK.test(value)) fail(path, `\`${key}\` is not a markdown link: "${value}"`); + } + } +} + +console.log(`Checking ${PAGES.length} per-page nav headers against ${BASE} ...\n`); + +for (const { path, keys } of PAGES) { + try { + const body = await fetchText(`${BASE}${path}`); + const before = failures.length; + checkPage(path, body, keys); + console.log(`${failures.length === before ? '✅' : '❌'} ${path} [${['title', 'url', 'parents', ...keys].join(', ')}]`); + } catch (err) { + fail(path, err.message); + console.log(`❌ ${path} (fetch failed)`); + } +} + +// Regression guard for #2557: the header must NOT leak into llms-full.txt. That +// holds only because addNavHeaders runs after the llms-txt plugin writes +// llms-full.txt; reordering the postbuild scripts would silently break it. The +// synthetic root breadcrumb appears only in the header, so its presence here +// means a leak. +try { + const llmsFull = await fetchText(`${BASE}/llms-full.txt`); + if (llmsFull.includes(ROOT_PARENT)) { + fail('/llms-full.txt', 'nav header leaked into llms-full.txt (found the root breadcrumb)'); + console.log('❌ /llms-full.txt (nav header leaked in)'); + } else { + console.log('✅ /llms-full.txt (no nav header leaked in)'); + } +} catch (err) { + fail('/llms-full.txt', err.message); + console.log('❌ /llms-full.txt (fetch failed)'); +} + +if (failures.length > 0) { + console.error(`\n❌ ${failures.length} nav-header check(s) failed:`); + for (const f of failures) console.error(` - ${f}`); + process.exit(1); +} + +console.log(`\n✅ All nav-header checks passed (${PAGES.length} pages + llms-full.txt).`); From 810b934a96cab48d45bc7cafe0cb52dee730aeec Mon Sep 17 00:00:00 2001 From: Marek Honzal Date: Mon, 1 Jun 2026 17:52:46 +0200 Subject: [PATCH 7/7] chore: fix formatting --- scripts/checkNavHeaders.mjs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/checkNavHeaders.mjs b/scripts/checkNavHeaders.mjs index 1b90a6be4c..6ad3b54bfd 100644 --- a/scripts/checkNavHeaders.mjs +++ b/scripts/checkNavHeaders.mjs @@ -138,7 +138,9 @@ for (const { path, keys } of PAGES) { const body = await fetchText(`${BASE}${path}`); const before = failures.length; checkPage(path, body, keys); - console.log(`${failures.length === before ? '✅' : '❌'} ${path} [${['title', 'url', 'parents', ...keys].join(', ')}]`); + console.log( + `${failures.length === before ? '✅' : '❌'} ${path} [${['title', 'url', 'parents', ...keys].join(', ')}]`, + ); } catch (err) { fail(path, err.message); console.log(`❌ ${path} (fetch failed)`);