diff --git a/package.json b/package.json index f84be4a952..4ca0086043 100644 --- a/package.json +++ b/package.json @@ -47,7 +47,7 @@ "test:academy": "bats --print-output-on-failure -r .", "test:llms-size": "node ./scripts/checkLlmsSize.mjs", "postinstall": "patch-package", - "postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs" + "postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs && node ./scripts/addNavHeaders.mjs" }, "devDependencies": { "@apify/oxlint-config": "^0.2.5", diff --git a/scripts/addNavHeaders.mjs b/scripts/addNavHeaders.mjs new file mode 100644 index 0000000000..32a55afccd --- /dev/null +++ b/scripts/addNavHeaders.mjs @@ -0,0 +1,326 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; + +// Prepend a Vercel-style nav header (frontmatter) to every per-page `.md` file +// generated by `@signalwire/docusaurus-plugin-llms-txt`. See issue #2557. +// +// Why post-build (not a remark/rehype plugin): the llms-txt plugin writes +// `llms-full.txt` from the SAME per-page content during its own `postBuild`. +// Editing the per-page `.md` files AFTER that — in npm `postbuild` — keeps the +// header out of `llms-full.txt` for free, as required by the issue. +// +// Data sources (both generated by `docusaurus build`): +// - `.docusaurus/docusaurus-plugin-content-docs/<instance>/p/*.json` +// The real left-menu tree (`version.docsSidebars`) — labels, order, +// nesting, hrefs. One file per docs instance (platform, academy, legal, +// openapi). Academy has TWO sidebars (`courses`, `tutorials`). +// - `.docusaurus/docusaurus-plugin-llms-txt/cache.json` +// Route → title fallback for pages absent from every sidebar +// (`/`, `/api`, `/open-source`, `/sdk`) and for section-landing titles +// where the sidebar label is "Home" but the page title is more descriptive.
const BUILD_DIR = path.resolve('build');
const DOCUSAURUS_DIR = path.resolve('.docusaurus');
const CONTENT_DOCS_DIR = path.join(DOCUSAURUS_DIR, 'docusaurus-plugin-content-docs');
const LLMS_CACHE_FILE = path.join(DOCUSAURUS_DIR, 'docusaurus-plugin-llms-txt', 'cache.json');

// Aligned with scripts/joinLlmsFiles.mjs + scripts/indentLlmsFile.mjs so
// preview/PR builds (which set APIFY_DOCS_ABSOLUTE_URL) get correct URLs.
// Trailing slashes are dropped because routes (which start with `/`) are
// appended verbatim; an override such as `https://host/` would otherwise put
// `//` into every generated URL.
const SITE_URL = (process.env.APIFY_DOCS_ABSOLUTE_URL || 'https://docs.apify.com').replace(/\/+$/, '');

// --- URL / label helpers ---------------------------------------------------

/**
 * Turn a sidebar `href` / cache `path` into the route key used by every map
 * in this script.
 *
 * @param {unknown} href Raw value read from the generated JSON.
 * @returns {string | null} The path without query, hash or trailing slash
 *   (the root stays `/`), or `null` when `href` is not a site-relative path.
 */
function normalizeRoute(href) {
    if (typeof href !== 'string') return null;
    // `routeToUrl()` blindly prefixes SITE_URL and appends `.md`, so anything
    // that is not a same-site path (external `https://…` sidebar links,
    // protocol-relative `//host/…`, bare `#hash`, empty string) must never
    // become a route.
    if (!href.startsWith('/') || href.startsWith('//')) return null;
    // Drop `?query` / `#hash` so they can't end up in front of the `.md` suffix.
    const [pathname] = href.split(/[?#]/, 1);
    return pathname.length > 1 && pathname.endsWith('/') ? pathname.slice(0, -1) : pathname;
}

/**
 * Absolute URL of the per-page `.md` file for a route.
 *
 * @param {string} route Normalized route (starts with `/`).
 * @returns {string} `${SITE_URL}${route}.md`
 */
function routeToUrl(route) {
    // Defensive: at least one source page (academy cloudflare-challenge) has a
    // `slug:` that already ends in `.md`. Don't double up the suffix.
    const stripped = route.endsWith('.md') ? route.slice(0, -3) : route;
    return `${SITE_URL}${stripped}.md`;
}

/**
 * Escape characters that would break a `[Label](url)` markdown link parse.
 * Labels are almost always clean; this is a safety net.
 *
 * @param {unknown} label Link text; `null`/`undefined` render as an empty label.
 * @returns {string} Single-line label with `\`, `[` and `]` backslash-escaped.
 */
function escapeLinkLabel(label) {
    return String(label ?? '')
        // Each link is written as one line of the header, so fold line breaks.
        .replace(/\s*[\r\n]+\s*/g, ' ')
        .replace(/\\/g, '\\\\')
        .replace(/([[\]])/g, '\\$1');
}

/**
 * Percent-encode the characters that would end a markdown link destination
 * early: parentheses and ASCII whitespace. `encodeURIComponent` is not used
 * because it leaves `(` and `)` untouched.
 *
 * @param {string} url
 * @returns {string}
 */
function escapeLinkDestination(url) {
    return url.replace(/[() \t\r\n]/g, (ch) => `%${ch.charCodeAt(0).toString(16).toUpperCase().padStart(2, '0')}`);
}

/**
 * Markdown link to the `.md` twin of a route.
 *
 * @param {string} label
 * @param {string} route
 * @returns {string}
 */
function mdLink(label, route) {
    return `[${escapeLinkLabel(label)}](${escapeLinkDestination(routeToUrl(route))})`;
}

// Synthetic root parent prepended to every page — the agent's link back to the
// full index. It points at `llms.txt` (not a `.md`), so it carries an explicit
// `url` rather than a route. See issue #2557.
const ROOT_PARENT = { label: 'Apify documentation', url: `${SITE_URL}/llms.txt` };

/**
 * Render a nav item. An item is either `{label, route}` (a real per-page
 * `.md`) or `{label, url}` (a literal link, e.g. the llms.txt root).
 *
 * @param {{label: string, route?: string, url?: string}} item
 * @returns {string} Markdown link for whichever target the item carries.
 */
function navLink(item) {
    return item.url
        ? `[${escapeLinkLabel(item.label)}](${escapeLinkDestination(item.url)})`
        : mdLink(item.label, item.route);
}

/**
 * The section landing (`/platform`, `/academy`, `/api`, ...) is modeled in the
 * sidebar as a sibling *link* ("Home"), not an ancestor category, so `walk()`
 * never records it as a parent. Derive it from the route's first path segment.
 *
 * @param {string} route
 * @returns {string | null} `/<first-segment>`, or `null` for the root route.
 */
function sectionLandingRoute(route) {
    const seg = route.split('/').find(Boolean);
    return seg ? `/${seg}` : null;
}

/**
 * Full upward chain for a page: llms.txt root, then the section landing, then
 * the sidebar-category ancestors from `walk()`. Bounded by site depth (~5), so
 * unlike `children` there's no fan-out/size risk. See issue #2557.
 *
 * @param {string} route Route of the page being rendered.
 * @param {{label: string, route: string}[]} sidebarParents Ancestors recorded by `walk()`.
 * @param {(route: string) => (string | null)} titleFor Resolves a route to a display title.
 * @returns {{label: string, route?: string, url?: string}[]} Outermost parent first.
 */
function buildParents(route, sidebarParents, titleFor) {
    const parents = [ROOT_PARENT];
    const landing = sectionLandingRoute(route);
    // Skip the landing when the page *is* the landing or the sidebar already
    // lists it as an ancestor, and when no title can be resolved for it.
    if (landing && landing !== route && !sidebarParents.some((p) => p.route === landing)) {
        const label = titleFor(landing);
        if (label) parents.push({ label, route: landing });
    }
    parents.push(...sidebarParents);
    return parents;
}

// Quote the `title:` value if it contains characters that would derail a
// downstream YAML-ish parser reading the title line. We intentionally don't
// commit to strict YAML for the body (the `- [Label](url)` list items aren't
// valid YAML scalars), but the title line itself stays parseable.
const TITLE_NEEDS_QUOTE = /[:#&*!|>'"%@`[\]{},\n\r\t]|^\s|\s$|^[\d?-]/;
// Characters that must be written as escapes inside the double-quoted form.
// Line breaks and tabs are included so the value can never spill onto a
// second physical line of the header.
const TITLE_ESCAPES = { '\\': '\\\\', '"': '\\"', '\n': '\\n', '\r': '\\r', '\t': '\\t' };

/**
 * @param {string} value Page title.
 * @returns {string} `value` unchanged when it is safe as a bare scalar,
 *   otherwise a double-quoted, escaped, single-line form. The empty string is
 *   quoted too so the line doesn't read as a missing value.
 */
function quoteTitle(value) {
    if (value === '' || TITLE_NEEDS_QUOTE.test(value)) {
        return `"${value.replace(/[\\"\n\r\t]/g, (ch) => TITLE_ESCAPES[ch])}"`;
    }
    return value;
}

// --- Sidebar walking -------------------------------------------------------

/**
 * Push `{ label, route }` onto `list` if `node` is listed and has a real
 * (site-relative) route.
 *
 * @param {{label: string, route: string}[]} list Mutated in place.
 * @param {{label?: string, href?: string, unlisted?: boolean}} node Sidebar item.
 * @returns {boolean} Whether a route was found, so the caller can decide
 *   whether to lift children up through an href-less grouping category.
 */
function pushRoutedChild(list, node) {
    if (node.unlisted === true) return false;
    const route = normalizeRoute(node.href);
    if (route === null) return false;
    list.push({ label: node.label, route });
    return true;
}

/**
 * Walk one sidebar tree, populating:
 *  - `indexMap`: normalizedRoute → { label, parents[], children[] }
 *  - `flat`: flat DFS list of {route, label} for prev/next within this sidebar
 *
 * Rules (per approved plan):
 *  - Skip `unlisted: true` nodes entirely (not in any list, no own entry).
 *  - Nodes whose `href` is not a site-relative path (href-less categories,
 *    external links) get no entry and never appear as parent/child/prev/next.
 *  - Parents = sidebar category ancestors that have an `href`. Href-less
 *    categories ("Programming", API's "Actors") pass through but aren't
 *    recorded as parents — they're nav grouping labels, not real pages.
 *  - Children = the node's direct `items[]`. For href-less category children,
 *    we recurse ONE level to lift their href-bearing children up, so direct
 *    grouping is preserved even though the empty label itself is dropped.
 *  - First occurrence of a route wins (a route in two sidebars only ends up in
 *    the first-walked one's prev/next sequence).
 *
 * @param {{type?: string, label?: string, href?: string, unlisted?: boolean, items?: object[]}} node
 * @param {{label?: string, route: string | null, unlisted: boolean}[]} ancestors Outermost first.
 * @param {Map<string, {label: string, parents: object[], children: object[]}>} indexMap Mutated in place.
 * @param {{route: string, label: string}[]} flat Mutated in place.
 */
function walk(node, ancestors, indexMap, flat) {
    const isUnlisted = node.unlisted === true;
    const route = normalizeRoute(node.href);

    if (route !== null && !isUnlisted && !indexMap.has(route)) {
        const parents = ancestors
            .filter((a) => a.route && !a.unlisted)
            .map((a) => ({ label: a.label, route: a.route }));

        const children = [];
        for (const child of Array.isArray(node.items) ? node.items : []) {
            if (child.unlisted === true) continue;
            if (pushRoutedChild(children, child)) continue;
            // Lift one level through an href-less grouping category so its
            // routed children stay grouped under this node.
            if (child.type === 'category' && Array.isArray(child.items)) {
                for (const grand of child.items) pushRoutedChild(children, grand);
            }
        }

        indexMap.set(route, { label: node.label, parents, children });
        flat.push({ route, label: node.label });
    }

    if (Array.isArray(node.items)) {
        // The node is always added to the chain (even route-less or unlisted);
        // the `parents` filter above decides what is actually rendered.
        const next = [...ancestors, { label: node.label, route, unlisted: isUnlisted }];
        for (const child of node.items) walk(child, next, indexMap, flat);
    }
}

/**
 * Read every `<instance>/p/*.json` file under CONTENT_DOCS_DIR and walk each
 * sidebar found in `version.docsSidebars`.
 *
 * Directory and file names are visited in sorted order: `readdir` order is
 * filesystem-dependent, and `walk()`'s "first occurrence wins" rule would
 * otherwise make the output differ between machines.
 *
 * @returns {Promise<{indexMap: Map<string, object>, flatLists: {route: string, label: string}[][]}>}
 *   `flatLists` holds one DFS-ordered list per sidebar.
 * @throws {Error} When a props file cannot be read or parsed (original error in `cause`).
 */
async function loadSidebars() {
    const indexMap = new Map();
    const flatLists = []; // each is an array of {route, label} for one sidebar

    let instances;
    try {
        instances = await fs.readdir(CONTENT_DOCS_DIR, { withFileTypes: true });
    } catch (err) {
        // Best effort: without sidebars the headers still get title/url/root parent.
        console.warn(`addNavHeaders: cannot read ${CONTENT_DOCS_DIR} (${err.message}) — no sidebars to load.`);
        return { indexMap, flatLists };
    }

    const instanceNames = instances
        .filter((dirent) => dirent.isDirectory())
        .map((dirent) => dirent.name)
        .sort();

    for (const name of instanceNames) {
        const pDir = path.join(CONTENT_DOCS_DIR, name, 'p');
        let propFiles;
        try {
            propFiles = await fs.readdir(pDir);
        } catch (err) {
            // A docs instance without a `p/` directory just has nothing to
            // contribute; anything else is worth a line in the build log.
            if (err.code !== 'ENOENT' && err.code !== 'ENOTDIR') {
                console.warn(`addNavHeaders: skipping ${pDir}: ${err.message}`);
            }
            continue;
        }

        for (const file of propFiles.filter((f) => f.endsWith('.json')).sort()) {
            const propPath = path.join(pDir, file);
            let data;
            try {
                data = JSON.parse(await fs.readFile(propPath, 'utf8'));
            } catch (err) {
                throw new Error(`addNavHeaders: failed to load sidebar props from ${propPath}`, { cause: err });
            }
            const sidebars = data?.version?.docsSidebars;
            if (!sidebars) continue;
            for (const items of Object.values(sidebars)) {
                if (!Array.isArray(items)) continue;
                const flat = [];
                for (const item of items) walk(item, [], indexMap, flat);
                flatLists.push(flat);
            }
        }
    }

    return { indexMap, flatLists };
}

// --- Cache (title fallback) ------------------------------------------------

/**
 * Build a route → title map from the llms-txt plugin cache. Any failure
 * (missing file, bad JSON, unexpected shape) is logged and yields an empty
 * map, so titles fall back to sidebar labels / the first H1.
 *
 * @returns {Promise<Map<string, string>>}
 */
async function loadCacheTitles() {
    try {
        const data = JSON.parse(await fs.readFile(LLMS_CACHE_FILE, 'utf8'));
        const map = new Map();
        for (const r of data?.routes ?? []) {
            const route = normalizeRoute(r?.path);
            if (route !== null && r.title) map.set(route, r.title);
        }
        return map;
    } catch (err) {
        console.warn('addNavHeaders: could not read llms-txt cache:', err.message);
        return new Map();
    }
}

// --- File processing -------------------------------------------------------

/**
 * Recursively yield the path of every regular `*.md` file below `dir`.
 *
 * @param {string} dir
 * @returns {AsyncGenerator<string>}
 */
async function* walkMd(dir) {
    const entries = await fs.readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
        const full = path.join(dir, entry.name);
        if (entry.isDirectory()) {
            yield* walkMd(full);
        } else if (entry.isFile() && entry.name.endsWith('.md')) {
            yield full;
        }
    }
}

/**
 * Map a file under BUILD_DIR to its route: relative path, `/`-separated,
 * without the `.md` extension.
 *
 * NOTE(review): a file named `index.md` maps to `…/index`, not to its
 * directory route — confirm that matches how the llms-txt plugin names files.
 *
 * @param {string} filePath
 * @returns {string}
 */
function fileToRoute(filePath) {
    // `path.relative` uses the OS separator; routes always use `/`.
    const rel = path.relative(BUILD_DIR, filePath).split(path.sep).join('/');
    return `/${rel.replace(/\.md$/, '')}`;
}

/**
 * Text of the first level-1 ATX heading (`# Title`) in `content`.
 *
 * @param {string} content Markdown source.
 * @returns {string | null} Trimmed heading text, or `null` if there is none.
 */
function titleFromContent(content) {
    // `[ \t]` rather than `\s`: `\s` also matches newlines, which made an empty
    // `#` line capture the following paragraph as the title.
    const match = content.match(/^#[ \t]+(.+?)[ \t]*$/m);
    return match ? match[1].trim() : null;
}

/**
 * Render the frontmatter-style nav header, including the blank line that
 * separates it from the page body.
 *
 * @param {object} nav
 * @param {string} nav.title
 * @param {string} nav.url
 * @param {{label: string, route?: string, url?: string}[]} [nav.parents]
 * @param {{label: string, route: string}[]} [nav.children]
 * @param {{label: string, route: string}} [nav.previous]
 * @param {{label: string, route: string}} [nav.next]
 * @returns {string} Header text; empty sections are omitted.
 */
function renderHeader({ title, url, parents, children, previous, next }) {
    const lines = ['---'];
    lines.push(`title: ${quoteTitle(title)}`);
    lines.push(`url: ${url}`);
    if (parents && parents.length > 0) {
        lines.push('parents:');
        for (const p of parents) lines.push(` - ${navLink(p)}`);
    }
    if (children && children.length > 0) {
        lines.push('children:');
        for (const c of children) lines.push(` - ${mdLink(c.label, c.route)}`);
    }
    if (previous) lines.push(`previous: ${mdLink(previous.label, previous.route)}`);
    if (next) lines.push(`next: ${mdLink(next.label, next.route)}`);
    lines.push('---');
    lines.push('');
    return `${lines.join('\n')}\n`;
}

/**
 * Entry point: prepend a nav header to every `.md` file under BUILD_DIR that
 * does not already start with frontmatter, then log a summary.
 *
 * @returns {Promise<void>}
 */
async function main() {
    try {
        await fs.access(BUILD_DIR);
    } catch {
        console.warn(`addNavHeaders: ${BUILD_DIR} not found — nothing to do.`);
        return;
    }

    const [{ indexMap, flatLists }, cacheTitles] = await Promise.all([loadSidebars(), loadCacheTitles()]);

    // Build prev/next from each sidebar's flat order independently. Doesn't
    // cross sidebars (e.g. academy's `courses` vs `tutorials`).
    const prevNext = new Map();
    for (const flat of flatLists) {
        for (let i = 0; i < flat.length; i++) {
            const nav = {};
            if (i > 0) nav.previous = flat[i - 1];
            if (i < flat.length - 1) nav.next = flat[i + 1];
            if (!prevNext.has(flat[i].route)) prevNext.set(flat[i].route, nav);
        }
    }

    // Resolve a route to its best display title (cache title > sidebar label).
    // Used for the section-landing parent, whose sidebar label is "Home".
    const titleFor = (r) => cacheTitles.get(r) || indexMap.get(r)?.label || null;

    let processed = 0;
    let skipped = 0;
    const unknownRoutes = [];

    for await (const filePath of walkMd(BUILD_DIR)) {
        const content = await fs.readFile(filePath, 'utf8');
        // Already has frontmatter (ours from a previous run, or the page's
        // own). Accept CRLF too so such files don't get a second header.
        if (/^---\r?\n/.test(content)) {
            skipped++;
            continue;
        }

        const route = fileToRoute(filePath);
        const sidebarEntry = indexMap.get(route);
        const nav = prevNext.get(route) || {};

        // Title preference: cache (true page title, e.g. "Apify platform
        // documentation") > sidebar label (sometimes shorter, e.g. "Home") >
        // first H1 in content > generic fallback.
        const title = cacheTitles.get(route) || sidebarEntry?.label || titleFromContent(content) || 'Untitled';

        if (!sidebarEntry && !cacheTitles.has(route)) unknownRoutes.push(route);

        const header = renderHeader({
            title,
            url: routeToUrl(route),
            parents: buildParents(route, sidebarEntry?.parents || [], titleFor),
            children: sidebarEntry?.children || [],
            previous: nav.previous,
            next: nav.next,
        });

        await fs.writeFile(filePath, header + content, 'utf8');
        processed++;
    }

    console.log(
        `addNavHeaders: wrote headers to ${processed} files (${skipped} already had frontmatter, ${unknownRoutes.length} without sidebar or cache title)`,
    );
    if (unknownRoutes.length > 0) {
        console.log(`addNavHeaders: routes without sidebar or cache title:\n ${unknownRoutes.sort().join('\n ')}`);
    }
}

await main();