Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
"test:academy": "bats --print-output-on-failure -r .",
"test:llms-size": "node ./scripts/checkLlmsSize.mjs",
"postinstall": "patch-package",
"postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs"
"postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs && node ./scripts/addNavHeaders.mjs"
},
"devDependencies": {
"@apify/oxlint-config": "^0.2.5",
Expand Down
326 changes: 326 additions & 0 deletions scripts/addNavHeaders.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,326 @@
import fs from 'node:fs/promises';
import path from 'node:path';

// Prepend a Vercel-style nav header (frontmatter) to every per-page `.md` file
// generated by `@signalwire/docusaurus-plugin-llms-txt`. See issue #2557.
//
// Why post-build (not a remark/rehype plugin): the llms-txt plugin writes
// `llms-full.txt` from the SAME per-page content during its own `postBuild`.
// Editing the per-page `.md` files AFTER that — in npm `postbuild` — keeps the
// header out of `llms-full.txt` for free, as required by the issue.
//
// Data sources (both generated by `docusaurus build`):
// - `.docusaurus/docusaurus-plugin-content-docs/<id>/p/*.json`
// The real left-menu tree (`version.docsSidebars`) — labels, order,
// nesting, hrefs. One file per docs instance (platform, academy, legal,
// openapi). Academy has TWO sidebars (`courses`, `tutorials`).
// - `.docusaurus/docusaurus-plugin-llms-txt/cache.json`
// Route → title fallback for pages absent from every sidebar
// (`/`, `/api`, `/open-source`, `/sdk`) and for section-landing titles
// where the sidebar label is "Home" but the page title is more descriptive.

// Input/output locations. `path.resolve` anchors both to the current working
// directory — presumably the repo root when run through the npm `postbuild`
// hook; confirm before invoking the script from elsewhere.
const BUILD_DIR = path.resolve('build');
const DOCUSAURUS_DIR = path.resolve('.docusaurus');
// Per-instance sidebar props emitted by the docs plugin (read by `loadSidebars()`).
const CONTENT_DOCS_DIR = path.join(DOCUSAURUS_DIR, 'docusaurus-plugin-content-docs');
// Route/title cache written by the llms-txt plugin (read by `loadCacheTitles()`).
const LLMS_CACHE_FILE = path.join(DOCUSAURUS_DIR, 'docusaurus-plugin-llms-txt', 'cache.json');

// Aligned with scripts/joinLlmsFiles.mjs + scripts/indentLlmsFile.mjs so
// preview/PR builds (which set APIFY_DOCS_ABSOLUTE_URL) get correct URLs.
// An unset or empty env var falls back to the production docs origin.
const SITE_URL = process.env.APIFY_DOCS_ABSOLUTE_URL || 'https://docs.apify.com';

// --- URL / label helpers ---------------------------------------------------

/**
 * Canonicalise an href so it can be used as a lookup key.
 *
 * @param {unknown} href - Candidate route; anything that is not a non-empty
 *   string is rejected.
 * @returns {string|null} The href with a single trailing `/` removed (the bare
 *   root `/` is kept as-is), or `null` when `href` is unusable.
 */
function normalizeRoute(href) {
    const isUsable = typeof href === 'string' && href !== '';
    if (!isUsable) {
        return null;
    }
    // The root route consists solely of the slash, so it must survive intact.
    if (href === '/' || !href.endsWith('/')) {
        return href;
    }
    return href.slice(0, -1);
}

/**
 * Turn a site route into the absolute URL of its per-page markdown file.
 *
 * @param {string} route - Route path such as `/platform/actors`.
 * @returns {string} `SITE_URL` + route + `.md`.
 */
function routeToUrl(route) {
    // Defensive: at least one source page (academy cloudflare-challenge) has a
    // `slug:` that already ends in `.md`, so drop an existing suffix before
    // appending ours to avoid `.md.md`.
    const withoutSuffix = route.replace(/\.md$/, '');
    return [SITE_URL, withoutSuffix, '.md'].join('');
}

/**
 * Backslash-escape the characters that would break a `[Label](url)` markdown
 * link: the backslash itself and both square brackets. Labels are almost
 * always clean; this is a safety net.
 *
 * @param {string} label - Raw link text.
 * @returns {string} Label that is safe to place between `[` and `]`.
 */
function escapeLinkLabel(label) {
    // One pass is enough: every special character just gains a `\` prefix.
    return label.replace(/[\\[\]]/g, (ch) => `\\${ch}`);
}

/**
 * Render a markdown link to a page's per-page `.md` file.
 *
 * @param {string} label - Link text (escaped via `escapeLinkLabel`).
 * @param {string} route - Site route (converted via `routeToUrl`).
 * @returns {string} `[label](url)`.
 */
function mdLink(label, route) {
    const text = escapeLinkLabel(label);
    const target = routeToUrl(route);
    return `[${text}](${target})`;
}

// Synthetic top-most parent placed first in every page's `parents:` list — the
// agent's link back to the full index. It targets `llms.txt` rather than a
// per-page `.md`, so it carries a literal `url` instead of a `route`
// (`navLink()` renders whichever of the two is present). See issue #2557.
const ROOT_PARENT = { label: 'Apify documentation', url: `${SITE_URL}/llms.txt` };

/**
 * Render one navigation item as a markdown link.
 *
 * An item is either `{ label, route }` (a real per-page `.md`, resolved through
 * `mdLink`) or `{ label, url }` (a literal link such as the `llms.txt` root,
 * used verbatim).
 *
 * @param {{label: string, route?: string, url?: string}} item
 * @returns {string} `[label](target)`.
 */
function navLink(item) {
    if (!item.url) {
        return mdLink(item.label, item.route);
    }
    const text = escapeLinkLabel(item.label);
    return `[${text}](${item.url})`;
}

/**
 * Derive a page's section-landing route from the first path segment.
 *
 * The section landing (`/platform`, `/academy`, `/api`, ...) is modeled in the
 * sidebar as a sibling *link* ("Home"), not an ancestor category, so `walk()`
 * never records it as a parent; it has to be computed from the route instead.
 *
 * @param {string} route - Page route, e.g. `/platform/actors/running`.
 * @returns {string|null} `/<first-segment>`, or `null` when the route has no
 *   non-empty segment (e.g. `/`).
 */
function sectionLandingRoute(route) {
    for (const segment of route.split('/')) {
        // Leading and doubled slashes yield empty segments; skip past them.
        if (segment) {
            return `/${segment}`;
        }
    }
    return null;
}

/**
 * Assemble the full upward chain for a page: the `llms.txt` root, then the
 * section landing, then the sidebar-category ancestors collected by `walk()`.
 * The chain is bounded by site depth (~5), so unlike `children` there is no
 * fan-out/size risk. See issue #2557.
 *
 * @param {string} route - Route of the page being annotated.
 * @param {Array<{label: string, route: string}>} sidebarParents - Ancestors
 *   recorded for the page, outermost first.
 * @param {(route: string) => (string|null)} titleFor - Resolves a route to its
 *   display title; a falsy result suppresses the landing entry.
 * @returns {Array<object>} New array starting with `ROOT_PARENT`.
 */
function buildParents(route, sidebarParents, titleFor) {
    const chain = [ROOT_PARENT];

    const landing = sectionLandingRoute(route);
    // Add the landing only when it exists, is not the page itself, and is not
    // already one of the sidebar ancestors.
    const needsLanding = Boolean(landing)
        && landing !== route
        && sidebarParents.every((ancestor) => ancestor.route !== landing);
    if (needsLanding) {
        const label = titleFor(landing);
        if (label) {
            chain.push({ label, route: landing });
        }
    }

    return [...chain, ...sidebarParents];
}

// Matches titles that would derail a downstream YAML-ish parser reading the
// `title:` line: YAML indicator characters, line breaks, leading/trailing
// whitespace, or a leading digit / `?` / `-`. We intentionally don't commit to
// strict YAML for the body (the `- [Label](url)` list items aren't valid YAML
// scalars), but the title line itself stays parseable.
const TITLE_NEEDS_QUOTE = /[:#&*!|>'"%@`[\]{},\n\r]|^\s|\s$|^[\d?-]/;

/**
 * Return `value` ready to follow `title: ` in the header.
 *
 * Titles matching `TITLE_NEEDS_QUOTE` are wrapped in double quotes with `\` and
 * `"` escaped. Line breaks are emitted as the `\n` / `\r` escape sequences so a
 * multi-line title can never spill onto extra header lines (a raw newline
 * inside the quotes would do exactly that).
 *
 * @param {string} value - Page title.
 * @returns {string} The title unchanged, or a double-quoted single-line form.
 */
function quoteTitle(value) {
    if (!TITLE_NEEDS_QUOTE.test(value)) {
        return value;
    }
    const escaped = value
        .replace(/\\/g, '\\\\') // must run first so later escapes aren't doubled
        .replace(/"/g, '\\"')
        .replace(/\r/g, '\\r')
        .replace(/\n/g, '\\n');
    return `"${escaped}"`;
}

// --- Sidebar walking -------------------------------------------------------

/**
 * Append `{ label, route }` to `list` when `node` is listed and has a real
 * route.
 *
 * @param {Array<{label: string, route: string}>} list - Mutated in place.
 * @param {{label?: string, href?: string, unlisted?: boolean}} node - Sidebar node.
 * @returns {boolean} Whether the node resolved to a route, so the caller can
 *   decide whether to lift children up through an href-less grouping category.
 */
function pushRoutedChild(list, node) {
    const { unlisted, href, label } = node;
    if (unlisted === true) {
        return false;
    }

    const route = normalizeRoute(href);
    if (route) {
        list.push({ label, route });
    }
    return route !== null;
}

/**
 * Depth-first walk of one sidebar (sub)tree that fills in two structures:
 * - `indexMap`: normalizedRoute → { label, parents[], children[] }
 * - `flat`: DFS-ordered list of {route, label}, later used for prev/next
 *   within this sidebar
 *
 * Rules (per approved plan):
 * - `unlisted: true` nodes are skipped entirely (no own entry, never listed as
 *   a parent or child), though their `items` are still descended into.
 * - Parents = ancestor categories that have an `href`. Href-less categories
 *   ("Programming", API's "Actors") are passed through without being recorded
 *   as parents — they're nav grouping labels, not real pages.
 * - Children = the node's direct `items[]`. When a direct child is an
 *   href-less category, its own routed items are lifted up ONE level so the
 *   grouping is preserved even though the empty label itself is dropped.
 * - The first occurrence of a route wins (a route present in two sidebars only
 *   lands in the first-walked one's prev/next sequence).
 *
 * @param {object} node - Sidebar item (`label`, optional `href`, `items`,
 *   `unlisted`, `type`).
 * @param {Array<{label: string, route: (string|null), unlisted: boolean}>} ancestors
 *   Chain of enclosing nodes, outermost first.
 * @param {Map<string, object>} indexMap - Mutated in place.
 * @param {Array<{route: string, label: string}>} flat - Mutated in place.
 */
function walk(node, ancestors, indexMap, flat) {
    const { label, items } = node;
    const unlisted = node.unlisted === true;
    const route = typeof node.href === 'string' && node.href.length > 0 ? normalizeRoute(node.href) : null;
    const childNodes = Array.isArray(items) ? items : null;

    const shouldRecord = Boolean(route) && !unlisted && !indexMap.has(route);
    if (shouldRecord) {
        // Only ancestors that are real, listed pages become parents.
        const parents = [];
        for (const ancestor of ancestors) {
            if (ancestor.route && !ancestor.unlisted) {
                parents.push({ label: ancestor.label, route: ancestor.route });
            }
        }

        const children = [];
        for (const child of childNodes ?? []) {
            if (child.unlisted === true) continue;
            if (pushRoutedChild(children, child)) continue;
            // Href-less grouping category: lift its routed items one level so
            // they stay grouped under this node.
            if (child.type !== 'category' || !Array.isArray(child.items)) continue;
            for (const grandchild of child.items) {
                pushRoutedChild(children, grandchild);
            }
        }

        indexMap.set(route, { label, parents, children });
        flat.push({ route, label });
    }

    if (childNodes) {
        // Always descend — even through unrecorded nodes — carrying this node
        // along so descendants can decide whether it counts as a parent.
        const lineage = ancestors.concat({ label, route, unlisted });
        for (const child of childNodes) {
            walk(child, lineage, indexMap, flat);
        }
    }
}

/**
 * Read every docs instance's generated props files and index their sidebars.
 *
 * For each directory under `CONTENT_DOCS_DIR`, every `p/*.json` file is parsed
 * and each sidebar found in `version.docsSidebars` is walked with `walk()`.
 *
 * @returns {Promise<{indexMap: Map<string, object>, flatLists: Array<Array<{route: string, label: string}>>}>}
 *   `indexMap` is shared across all sidebars; `flatLists` holds one DFS-ordered
 *   list per sidebar. Both are empty when `CONTENT_DOCS_DIR` cannot be read.
 */
async function loadSidebars() {
    const result = { indexMap: new Map(), flatLists: [] };

    let instanceEntries;
    try {
        instanceEntries = await fs.readdir(CONTENT_DOCS_DIR, { withFileTypes: true });
    } catch {
        console.warn(`addNavHeaders: ${CONTENT_DOCS_DIR} missing — no sidebars to load.`);
        return result;
    }

    const instanceDirs = instanceEntries.filter((entry) => entry.isDirectory());
    for (const { name: instanceName } of instanceDirs) {
        const propsDir = path.join(CONTENT_DOCS_DIR, instanceName, 'p');

        let fileNames;
        try {
            fileNames = await fs.readdir(propsDir);
        } catch {
            // An instance whose `p/` directory can't be read is skipped.
            continue;
        }

        const jsonFiles = fileNames.filter((fileName) => fileName.endsWith('.json'));
        for (const fileName of jsonFiles) {
            const raw = await fs.readFile(path.join(propsDir, fileName), 'utf8');
            const sidebars = JSON.parse(raw)?.version?.docsSidebars;
            if (!sidebars) continue;

            for (const sidebarItems of Object.values(sidebars)) {
                if (!Array.isArray(sidebarItems)) continue;
                // One flat list per sidebar keeps prev/next from crossing sidebars.
                const flat = [];
                for (const topLevelItem of sidebarItems) {
                    walk(topLevelItem, [], result.indexMap, flat);
                }
                result.flatLists.push(flat);
            }
        }
    }

    return result;
}

// --- Cache (title fallback) ------------------------------------------------

/**
 * Load the route → title lookup from the llms-txt plugin cache.
 *
 * Best-effort: any failure (missing file, bad JSON, unexpected shape) is
 * reported with `console.warn` and yields an empty map, discarding whatever
 * had been collected up to that point.
 *
 * @returns {Promise<Map<string, string>>} Normalized route → page title, for
 *   cache entries that carry both a `path` and a `title`.
 */
async function loadCacheTitles() {
    try {
        const raw = await fs.readFile(LLMS_CACHE_FILE, 'utf8');
        const data = JSON.parse(raw);

        const titlesByRoute = new Map();
        for (const entry of data.routes || []) {
            if (!entry.path || !entry.title) continue;
            titlesByRoute.set(normalizeRoute(entry.path), entry.title);
        }
        return titlesByRoute;
    } catch (err) {
        console.warn('addNavHeaders: could not read llms-txt cache:', err.message);
        return new Map();
    }
}

// --- File processing -------------------------------------------------------

/**
 * Recursively yield the path of every `.md` file beneath `dir`.
 *
 * Sub-directories are descended into as they are encountered, in whatever
 * order `fs.readdir` returns the entries.
 *
 * @param {string} dir - Directory to scan.
 * @yields {string} `dir`-joined path of each regular file whose name ends in `.md`.
 */
async function* walkMd(dir) {
    for (const dirent of await fs.readdir(dir, { withFileTypes: true })) {
        const { name } = dirent;
        const fullPath = path.join(dir, name);

        if (dirent.isDirectory()) {
            yield* walkMd(fullPath);
            continue;
        }

        const isMarkdownFile = dirent.isFile() && name.endsWith('.md');
        if (isMarkdownFile) {
            yield fullPath;
        }
    }
}

/**
 * Map a generated markdown file back to the site route it was built from.
 *
 * @param {string} filePath - Path of a `.md` file inside `BUILD_DIR`.
 * @returns {string} Route with a leading `/`, `/`-separated, without the
 *   trailing `.md` (e.g. `<BUILD_DIR>/platform/actors.md` → `/platform/actors`).
 */
function fileToRoute(filePath) {
    const rel = path.relative(BUILD_DIR, filePath);
    // `path.relative` uses the platform separator (`\` on Windows), while the
    // sidebar and cache keys this route is looked up against always use `/`.
    // Normalising here is a no-op on POSIX.
    const posixRel = rel.split(path.sep).join('/');
    return `/${posixRel.replace(/\.md$/, '')}`;
}

/**
 * Extract the text of the first level-1 ATX heading (`# Title`) in `content`.
 *
 * @param {string} content - Markdown source of a page.
 * @returns {string|null} Trimmed heading text, or `null` when no line has the
 *   form `# <text>`.
 */
function titleFromContent(content) {
    // Only spaces/tabs may separate `#` from the text. Using `\s` here would
    // also consume newlines, so an empty `#` heading would swallow the NEXT
    // line (a paragraph, or another heading including its `#`) as the title.
    const match = content.match(/^#[ \t]+(.+?)[ \t]*$/m);
    return match ? match[1].trim() : null;
}

/**
 * Render the frontmatter-style nav header that gets prepended to a page.
 *
 * Layout: an opening `---`, `title:` and `url:` lines, optional `parents:` /
 * `children:` lists (one ` - [Label](url)` item per line), optional
 * `previous:` / `next:` links, a closing `---`, then one blank line.
 *
 * @param {object} header
 * @param {string} header.title - Page title (passed through `quoteTitle`).
 * @param {string} header.url - Absolute URL of the page's `.md` file.
 * @param {Array<object>} [header.parents] - Items rendered with `navLink`.
 * @param {Array<{label: string, route: string}>} [header.children]
 * @param {{label: string, route: string}} [header.previous]
 * @param {{label: string, route: string}} [header.next]
 * @returns {string} Header text ending in `---\n\n`.
 */
function renderHeader({ title, url, parents, children, previous, next }) {
    const asListItem = (link) => ` - ${link}`;

    const body = [`title: ${quoteTitle(title)}`, `url: ${url}`];
    if (parents?.length > 0) {
        body.push('parents:', ...parents.map((parent) => asListItem(navLink(parent))));
    }
    if (children?.length > 0) {
        body.push('children:', ...children.map((child) => asListItem(mdLink(child.label, child.route))));
    }
    if (previous) {
        body.push(`previous: ${mdLink(previous.label, previous.route)}`);
    }
    if (next) {
        body.push(`next: ${mdLink(next.label, next.route)}`);
    }

    // The two trailing empty entries produce the `\n\n` after the closing fence.
    return ['---', ...body, '---', '', ''].join('\n');
}

/**
 * Entry point: prepend a nav header to every generated `.md` file in
 * `BUILD_DIR` that does not already start with frontmatter, then log a summary.
 *
 * Does nothing (beyond a warning) when `BUILD_DIR` is not accessible.
 *
 * @returns {Promise<void>}
 */
async function main() {
    try {
        await fs.access(BUILD_DIR);
    } catch {
        console.warn(`addNavHeaders: ${BUILD_DIR} not found — nothing to do.`);
        return;
    }

    const [sidebarData, cacheTitles] = await Promise.all([loadSidebars(), loadCacheTitles()]);
    const { indexMap, flatLists } = sidebarData;

    // Build prev/next from each sidebar's flat order independently, so links
    // never cross sidebars (e.g. academy's `courses` vs `tutorials`). A route
    // that shows up more than once keeps the neighbours of its first occurrence.
    const prevNext = new Map();
    for (const flat of flatLists) {
        flat.forEach((entry, position) => {
            if (prevNext.has(entry.route)) return;
            const neighbours = {};
            if (position > 0) neighbours.previous = flat[position - 1];
            if (position < flat.length - 1) neighbours.next = flat[position + 1];
            prevNext.set(entry.route, neighbours);
        });
    }

    // Resolve a route to its best display title (cache title > sidebar label).
    // Used for the section-landing parent, whose sidebar label is "Home".
    const titleFor = (r) => cacheTitles.get(r) || indexMap.get(r)?.label || null;

    const counts = { processed: 0, skipped: 0 };
    const unknownRoutes = [];

    for await (const filePath of walkMd(BUILD_DIR)) {
        const content = await fs.readFile(filePath, 'utf8');

        // Files that already open with a frontmatter fence are left untouched,
        // which also keeps a second run from stacking another header on top.
        if (content.startsWith('---\n')) {
            counts.skipped += 1;
            continue;
        }

        const route = fileToRoute(filePath);
        const sidebarEntry = indexMap.get(route);
        const { previous, next } = prevNext.get(route) || {};

        if (!sidebarEntry && !cacheTitles.has(route)) {
            unknownRoutes.push(route);
        }

        // Title preference: cache (true page title, e.g. "Apify platform
        // documentation") > sidebar label (sometimes shorter, e.g. "Home") >
        // first H1 in content > generic fallback.
        const title = cacheTitles.get(route) || sidebarEntry?.label || titleFromContent(content) || 'Untitled';

        const header = renderHeader({
            title,
            url: routeToUrl(route),
            parents: buildParents(route, sidebarEntry?.parents || [], titleFor),
            children: sidebarEntry?.children || [],
            previous,
            next,
        });

        await fs.writeFile(filePath, header + content, 'utf8');
        counts.processed += 1;
    }

    const { processed, skipped } = counts;
    console.log(
        `addNavHeaders: wrote headers to ${processed} files (${skipped} already had frontmatter, ${unknownRoutes.length} without sidebar or cache title)`,
    );
    if (unknownRoutes.length > 0) {
        console.log(`addNavHeaders: routes without sidebar or cache title:\n ${unknownRoutes.sort().join('\n ')}`);
    }
}

// Script entry point: run immediately via top-level await (this file is an ES
// module). A rejection from `main()` is not caught here.
await main();
Loading