From 8e1b61f9bcf8cadcbb4e4bf20df3f2d453a903bd Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 1 Jul 2026 08:52:29 +0900 Subject: [PATCH 1/4] Report HTML docloader failures Reject HTML and XHTML document-loader responses that do not advertise an ActivityPub alternate document before they reach the JSON parser. This surfaces remote HTML pages as FetchError failures with response context instead of generic SyntaxError crashes. Add regression coverage for HTML responses without ActivityPub alternates and keep the ReDoS regression test aligned with the new error path. Fixes https://github.com/fedify-dev/fedify/issues/912 Assisted-by: Codex:gpt-5.5 --- CHANGES.md | 11 ++++ packages/vocab-runtime/src/docloader.test.ts | 58 ++++++++++++-------- packages/vocab-runtime/src/docloader.ts | 15 ++++- 3 files changed, 59 insertions(+), 25 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 8bd936766..e6314c274 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -20,6 +20,17 @@ To be released. [#826]: https://github.com/fedify-dev/fedify/issues/826 [#850]: https://github.com/fedify-dev/fedify/pull/850 +### @fedify/vocab-runtime + + - Changed `getDocumentLoader()` to reject HTML and XHTML responses that do + not advertise an ActivityPub alternate document with a `FetchError` + instead of attempting to parse the HTML as JSON. This makes remote HTML + error pages surface as document loading failures with the response URL and + content type, rather than generic JSON parser crashes. [[#912], [#913]] + +[#912]: https://github.com/fedify-dev/fedify/issues/912 +[#913]: https://github.com/fedify-dev/fedify/pull/913 + Version 2.3.1 ------------- diff --git a/packages/vocab-runtime/src/docloader.test.ts b/packages/vocab-runtime/src/docloader.test.ts index e96d8c9ab..9a6ae56f9 100644 --- a/packages/vocab-runtime/src/docloader.test.ts +++ b/packages/vocab-runtime/src/docloader.test.ts @@ -90,7 +90,7 @@ test("getDocumentLoader()", async (t) => { type: "Object", }, headers: { - "Content-Type": "text/html; charset=utf-8", + "Content-Type": "application/activity+json", Link: '; rel="alternate"; ' + 'type="application/ld+json; profile="https://www.w3.org/ns/activitystreams""', }, @@ -247,28 +247,40 @@ test("getDocumentLoader()", async (t) => { }); }); - fetchMock.get("https://example.com/wrong-content-type", { - body: { - "@context": "https://www.w3.org/ns/activitystreams", - id: "https://example.com/wrong-content-type", - name: "Fetched object", - type: "Object", - }, + fetchMock.get("https://example.com/html-no-alternate", { + body: ` + + + Not an ActivityPub document + + Not found + `, headers: { "Content-Type": "text/html; charset=utf-8" }, }); - await t.test("Wrong Content-Type", async () => { - deepStrictEqual( - await fetchDocumentLoader("https://example.com/wrong-content-type"), - { - contextUrl: null, - documentUrl: "https://example.com/wrong-content-type", - document: { - "@context": "https://www.w3.org/ns/activitystreams", - id: "https://example.com/wrong-content-type", - name: "Fetched object", - type: "Object", - }, + await t.test("HTML without ActivityPub alternate link", async () => { + await rejects( + () => fetchDocumentLoader("https://example.com/html-no-alternate"), + (error) => { + ok(error instanceof FetchError); + ok( + error.message.includes( + "HTML document has no ActivityPub alternate link", + ), + ); + ok( + error.message.includes("Content-Type: text/html; charset=utf-8"), + ); + deepStrictEqual( + error.url, + new URL("https://example.com/html-no-alternate"), + ); + ok(error.response != null); + deepStrictEqual( + error.response.headers.get("Content-Type"), + "text/html; charset=utf-8", + ); + return true; }, ); }); @@ -459,11 +471,11 @@ test("getDocumentLoader()", async (t) => { await t.test("ReDoS resistance (CVE-2025-68475)", async () => { const start = performance.now(); - // The malicious HTML will fail JSON parsing, but the important thing is - // that it should complete quickly (not hang due to ReDoS) + // The malicious HTML will fail alternate discovery, but the important + // thing is that it should complete quickly (not hang due to ReDoS). await rejects( () => fetchDocumentLoader("https://example.com/redos"), - SyntaxError, + FetchError, ); const elapsed = performance.now() - start; diff --git a/packages/vocab-runtime/src/docloader.ts b/packages/vocab-runtime/src/docloader.ts index 0cca8d6bf..6ed1a99e6 100644 --- a/packages/vocab-runtime/src/docloader.ts +++ b/packages/vocab-runtime/src/docloader.ts @@ -202,13 +202,19 @@ export async function getRemoteDocument( ) { // Security: Limit HTML response size to mitigate ReDoS attacks const MAX_HTML_SIZE = 1024 * 1024; // 1MB + const errorResponse = response.clone(); const html = await response.text(); if (html.length > MAX_HTML_SIZE) { logger.warn( "HTML response too large, skipping alternate link discovery: {url}", { url: documentUrl, size: html.length }, ); - document = JSON.parse(html); + throw new FetchError( + documentUrl, + `HTML document is too large to scan for an ActivityPub alternate link ` + + `(Content-Type: ${contentType})`, + errorResponse, + ); } else { // Safe regex patterns without nested quantifiers to prevent ReDoS // (CVE-2025-68475) @@ -247,7 +253,12 @@ export async function getRemoteDocument( return await fetch(new URL(attribs.href, docUrl).href); } } - document = JSON.parse(html); + throw new FetchError( + documentUrl, + `HTML document has no ActivityPub alternate link ` + + `(Content-Type: ${contentType})`, + errorResponse, + ); } } else { document = await response.json(); From d248bb18c96b9f74557c77dceac2747bd6058f64 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 1 Jul 2026 09:07:10 +0900 Subject: [PATCH 2/4] Preserve HTML-labelled JSON Keep the HTML document-loader fallback narrow by parsing responses that look like JSON even when they are served with an HTML Content-Type. This preserves compatibility with misconfigured ActivityPub endpoints while still reporting real HTML pages without alternates as FetchError failures. Read HTML responses through a bounded stream instead of buffering the full body before checking the scan limit. Oversized responses now short-circuit from Content-Length when available, or while reading when the stream crosses the limit. https://github.com/fedify-dev/fedify/pull/913#discussion_r3502470739 https://github.com/fedify-dev/fedify/pull/913#discussion_r3502486555 Assisted-by: Codex:gpt-5.5 --- packages/vocab-runtime/src/docloader.test.ts | 55 ++++++++++++++ packages/vocab-runtime/src/docloader.ts | 76 ++++++++++++++++---- 2 files changed, 118 insertions(+), 13 deletions(-) diff --git a/packages/vocab-runtime/src/docloader.test.ts b/packages/vocab-runtime/src/docloader.test.ts index 9a6ae56f9..f40d63db7 100644 --- a/packages/vocab-runtime/src/docloader.test.ts +++ b/packages/vocab-runtime/src/docloader.test.ts @@ -285,6 +285,61 @@ test("getDocumentLoader()", async (t) => { ); }); + fetchMock.get("https://example.com/wrong-content-type", { + body: { + "@context": "https://www.w3.org/ns/activitystreams", + id: "https://example.com/wrong-content-type", + name: "Fetched object", + type: "Object", + }, + headers: { "Content-Type": "text/html; charset=utf-8" }, + }); + + await t.test("wrong Content-Type with JSON body", async () => { + deepStrictEqual( + await fetchDocumentLoader("https://example.com/wrong-content-type"), + { + contextUrl: null, + documentUrl: "https://example.com/wrong-content-type", + document: { + "@context": "https://www.w3.org/ns/activitystreams", + id: "https://example.com/wrong-content-type", + name: "Fetched object", + type: "Object", + }, + }, + ); + }); + + fetchMock.get("https://example.com/large-html", { + body: "", + headers: { + "Content-Length": String(1024 * 1024 + 1), + "Content-Type": "text/html; charset=utf-8", + }, + }); + + await t.test("HTML Content-Length over limit", async () => { + await rejects( + () => fetchDocumentLoader("https://example.com/large-html"), + (error) => { + ok(error instanceof FetchError); + ok( + error.message.includes( + "HTML document is too large to scan for an ActivityPub alternate link", + ), + ); + ok(error.response != null); + deepStrictEqual(error.response.status, 200); + deepStrictEqual( + error.response.headers.get("Content-Type"), + "text/html; charset=utf-8", + ); + return true; + }, + ); + }); + fetchMock.get("https://example.com/404", { status: 404 }); await t.test("not ok", async () => { diff --git a/packages/vocab-runtime/src/docloader.ts b/packages/vocab-runtime/src/docloader.ts index 6ed1a99e6..319ef6723 100644 --- a/packages/vocab-runtime/src/docloader.ts +++ b/packages/vocab-runtime/src/docloader.ts @@ -13,6 +13,7 @@ import { UrlError, validatePublicUrl } from "./url.ts"; const logger = getLogger(["fedify", "runtime", "docloader"]); const DEFAULT_MAX_REDIRECTION = 20; +const MAX_HTML_SIZE = 1024 * 1024; // 1MB /** * A remote JSON-LD document and its context fetched by @@ -112,6 +113,52 @@ export type AuthenticatedDocumentLoaderFactory = ( options?: DocumentLoaderFactoryOptions, ) => DocumentLoader; +function createResponseMetadata(response: Response): Response { + return new Response(null, { + headers: response.headers, + status: response.status, + statusText: response.statusText, + }); +} + +async function readBoundedText( + response: Response, + maxBytes: number, +): Promise<{ text: string; size: number; tooLarge: boolean }> { + const contentLength = response.headers.get("Content-Length"); + if (contentLength != null) { + const size = Number(contentLength); + if (Number.isFinite(size) && size > maxBytes) { + return { text: "", size, tooLarge: true }; + } + } + + if (response.body == null) return { text: "", size: 0, tooLarge: false }; + + const reader = response.body.getReader(); + const decoder = new TextDecoder(); + let text = ""; + let size = 0; + try { + while (true) { + const result = await reader.read(); + if (result.done) break; + const chunkSize = result.value.byteLength; + if (size + chunkSize > maxBytes) { + size += chunkSize; + await reader.cancel(); + return { text: "", size, tooLarge: true }; + } + size += chunkSize; + text += decoder.decode(result.value, { stream: true }); + } + text += decoder.decode(); + return { text, size, tooLarge: false }; + } finally { + reader.releaseLock(); + } +} + /** * Gets a {@link RemoteDocument} from the given response. * @param url The URL of the document to load. @@ -200,14 +247,12 @@ export async function getRemoteDocument( contentType === "application/xhtml+xml" || contentType?.startsWith("application/xhtml+xml;")) ) { - // Security: Limit HTML response size to mitigate ReDoS attacks - const MAX_HTML_SIZE = 1024 * 1024; // 1MB - const errorResponse = response.clone(); - const html = await response.text(); - if (html.length > MAX_HTML_SIZE) { + const errorResponse = createResponseMetadata(response); + const html = await readBoundedText(response, MAX_HTML_SIZE); + if (html.tooLarge) { logger.warn( "HTML response too large, skipping alternate link discovery: {url}", - { url: documentUrl, size: html.length }, + { url: documentUrl, size: html.size }, ); throw new FetchError( documentUrl, @@ -225,7 +270,7 @@ export async function getRemoteDocument( /([a-z][a-z:_-]*)=(?:"([^"]*)"|'([^']*)'|([^\s>]+))/gi; let tagMatch: RegExpExecArray | null; - while ((tagMatch = tagPattern.exec(html)) !== null) { + while ((tagMatch = tagPattern.exec(html.text)) !== null) { const tagContent = tagMatch[2]; let attrMatch: RegExpExecArray | null; const attribs: Record = {}; @@ -253,12 +298,17 @@ export async function getRemoteDocument( return await fetch(new URL(attribs.href, docUrl).href); } } - throw new FetchError( - documentUrl, - `HTML document has no ActivityPub alternate link ` + - `(Content-Type: ${contentType})`, - errorResponse, - ); + const trimmed = html.text.trimStart(); + if (trimmed.startsWith("{") || trimmed.startsWith("[")) { + document = JSON.parse(html.text); + } else { + throw new FetchError( + documentUrl, + `HTML document has no ActivityPub alternate link ` + + `(Content-Type: ${contentType})`, + errorResponse, + ); + } } } else { document = await response.json(); From 58fa6800cbf022d332f6ee793237d837b7108109 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 1 Jul 2026 09:33:08 +0900 Subject: [PATCH 3/4] Harden bounded HTML reads Cancel oversized response bodies when Content-Length already exceeds the HTML scan limit, so document-loader failures do not leave the fetch stream open while returning early. Fall back to response.text() when a custom or non-standard response body lacks getReader(). This keeps the bounded reader usable with fetch mocks and older runtimes while preserving the streaming path where available. https://github.com/fedify-dev/fedify/pull/913#discussion_r3502514680 https://github.com/fedify-dev/fedify/pull/913#discussion_r3502519526 Assisted-by: Codex:gpt-5.5 --- packages/vocab-runtime/src/docloader.test.ts | 63 +++++++++++++++++++- packages/vocab-runtime/src/docloader.ts | 23 ++++++- 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/packages/vocab-runtime/src/docloader.test.ts b/packages/vocab-runtime/src/docloader.test.ts index f40d63db7..4048a4012 100644 --- a/packages/vocab-runtime/src/docloader.test.ts +++ b/packages/vocab-runtime/src/docloader.test.ts @@ -2,7 +2,7 @@ import fetchMock from "fetch-mock"; import { deepStrictEqual, ok, rejects } from "node:assert"; import { test } from "node:test"; import preloadedContexts from "./contexts.ts"; -import { getDocumentLoader } from "./docloader.ts"; +import { getDocumentLoader, getRemoteDocument } from "./docloader.ts"; import { FetchError } from "./request.ts"; import { UrlError } from "./url.ts"; @@ -340,6 +340,67 @@ test("getDocumentLoader()", async (t) => { ); }); + await t.test("HTML Content-Length over limit cancels body", async () => { + let canceled = false; + const response = new Response("", { + headers: { + "Content-Length": String(1024 * 1024 + 1), + "Content-Type": "text/html; charset=utf-8", + }, + }); + Object.defineProperty(response, "body", { + value: { + cancel: () => { + canceled = true; + }, + }, + }); + await rejects( + () => + getRemoteDocument( + "https://example.com/large-html-cancel", + response, + () => { + throw new Error("unexpected alternate fetch"); + }, + ), + FetchError, + ); + deepStrictEqual(canceled, true); + }); + + await t.test("HTML body without getReader falls back to text", async () => { + const response = new Response( + JSON.stringify({ + "@context": "https://www.w3.org/ns/activitystreams", + id: "https://example.com/body-without-get-reader", + name: "Fetched object", + type: "Object", + }), + { headers: { "Content-Type": "text/html; charset=utf-8" } }, + ); + Object.defineProperty(response, "body", { value: {} }); + deepStrictEqual( + await getRemoteDocument( + "https://example.com/body-without-get-reader", + response, + () => { + throw new Error("unexpected alternate fetch"); + }, + ), + { + contextUrl: null, + documentUrl: "https://example.com/body-without-get-reader", + document: { + "@context": "https://www.w3.org/ns/activitystreams", + id: "https://example.com/body-without-get-reader", + name: "Fetched object", + type: "Object", + }, + }, + ); + }); + fetchMock.get("https://example.com/404", { status: 404 }); await t.test("not ok", async () => { diff --git a/packages/vocab-runtime/src/docloader.ts b/packages/vocab-runtime/src/docloader.ts index 319ef6723..35a9c11b8 100644 --- a/packages/vocab-runtime/src/docloader.ts +++ b/packages/vocab-runtime/src/docloader.ts @@ -121,6 +121,13 @@ function createResponseMetadata(response: Response): Response { }); } +async function cancelResponseBody(response: Response): Promise { + const body = response.body as { cancel?: unknown } | null; + if (body != null && typeof body.cancel === "function") { + await body.cancel(); + } +} + async function readBoundedText( response: Response, maxBytes: number, @@ -129,13 +136,27 @@ async function readBoundedText( if (contentLength != null) { const size = Number(contentLength); if (Number.isFinite(size) && size > maxBytes) { + await cancelResponseBody(response); return { text: "", size, tooLarge: true }; } } if (response.body == null) return { text: "", size: 0, tooLarge: false }; - const reader = response.body.getReader(); + const body = response.body as ReadableStream & { + getReader?: unknown; + }; + if (typeof body.getReader !== "function") { + const text = await response.text(); + const size = new TextEncoder().encode(text).byteLength; + return { + text: size <= maxBytes ? text : "", + size, + tooLarge: size > maxBytes, + }; + } + + const reader = body.getReader(); const decoder = new TextDecoder(); let text = ""; let size = 0; From aff4bb5d001da50fb393d8de85fee60b365dc8ce Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Wed, 1 Jul 2026 19:18:06 +0900 Subject: [PATCH 4/4] Simplify bounded HTML reads The bounded HTML path can rely on the standard Response body type instead of defensive assertions for non-standard mocks. The parser now preserves HTML-labelled JSON compatibility by attempting JSON.parse() directly and converts syntax failures into the same FetchError used for ordinary HTML. https://github.com/fedify-dev/fedify/pull/913#discussion_r3503329298 https://github.com/fedify-dev/fedify/pull/913#discussion_r3503341814 https://github.com/fedify-dev/fedify/pull/913#discussion_r3503350874 https://github.com/fedify-dev/fedify/pull/913#discussion_r3503555459 https://github.com/fedify-dev/fedify/pull/913#discussion_r3503590685 Assisted-by: Codex:gpt-5.5 --- packages/vocab-runtime/src/docloader.test.ts | 32 -------------------- packages/vocab-runtime/src/docloader.ts | 28 +++++------------ 2 files changed, 7 insertions(+), 53 deletions(-) diff --git a/packages/vocab-runtime/src/docloader.test.ts b/packages/vocab-runtime/src/docloader.test.ts index 4048a4012..8ac8d719b 100644 --- a/packages/vocab-runtime/src/docloader.test.ts +++ b/packages/vocab-runtime/src/docloader.test.ts @@ -369,38 +369,6 @@ test("getDocumentLoader()", async (t) => { deepStrictEqual(canceled, true); }); - await t.test("HTML body without getReader falls back to text", async () => { - const response = new Response( - JSON.stringify({ - "@context": "https://www.w3.org/ns/activitystreams", - id: "https://example.com/body-without-get-reader", - name: "Fetched object", - type: "Object", - }), - { headers: { "Content-Type": "text/html; charset=utf-8" } }, - ); - Object.defineProperty(response, "body", { value: {} }); - deepStrictEqual( - await getRemoteDocument( - "https://example.com/body-without-get-reader", - response, - () => { - throw new Error("unexpected alternate fetch"); - }, - ), - { - contextUrl: null, - documentUrl: "https://example.com/body-without-get-reader", - document: { - "@context": "https://www.w3.org/ns/activitystreams", - id: "https://example.com/body-without-get-reader", - name: "Fetched object", - type: "Object", - }, - }, - ); - }); - fetchMock.get("https://example.com/404", { status: 404 }); await t.test("not ok", async () => { diff --git a/packages/vocab-runtime/src/docloader.ts b/packages/vocab-runtime/src/docloader.ts index 35a9c11b8..f8a6966d1 100644 --- a/packages/vocab-runtime/src/docloader.ts +++ b/packages/vocab-runtime/src/docloader.ts @@ -122,9 +122,8 @@ function createResponseMetadata(response: Response): Response { } async function cancelResponseBody(response: Response): Promise { - const body = response.body as { cancel?: unknown } | null; - if (body != null && typeof body.cancel === "function") { - await body.cancel(); + if (response.body != null) { + await response.body.cancel(); } } @@ -135,7 +134,7 @@ async function readBoundedText( const contentLength = response.headers.get("Content-Length"); if (contentLength != null) { const size = Number(contentLength); - if (Number.isFinite(size) && size > maxBytes) { + if (size > maxBytes) { await cancelResponseBody(response); return { text: "", size, tooLarge: true }; } @@ -143,20 +142,7 @@ async function readBoundedText( if (response.body == null) return { text: "", size: 0, tooLarge: false }; - const body = response.body as ReadableStream & { - getReader?: unknown; - }; - if (typeof body.getReader !== "function") { - const text = await response.text(); - const size = new TextEncoder().encode(text).byteLength; - return { - text: size <= maxBytes ? text : "", - size, - tooLarge: size > maxBytes, - }; - } - - const reader = body.getReader(); + const reader = response.body.getReader(); const decoder = new TextDecoder(); let text = ""; let size = 0; @@ -319,10 +305,10 @@ export async function getRemoteDocument( return await fetch(new URL(attribs.href, docUrl).href); } } - const trimmed = html.text.trimStart(); - if (trimmed.startsWith("{") || trimmed.startsWith("[")) { + try { document = JSON.parse(html.text); - } else { + } catch (error) { + if (!(error instanceof SyntaxError)) throw error; throw new FetchError( documentUrl, `HTML document has no ActivityPub alternate link ` +