From 8e1b61f9bcf8cadcbb4e4bf20df3f2d453a903bd Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Wed, 1 Jul 2026 08:52:29 +0900
Subject: [PATCH 1/4] Report HTML docloader failures

Reject HTML and XHTML document-loader responses that do not advertise an
ActivityPub alternate document before they reach the JSON parser.  This
surfaces remote HTML pages as FetchError failures with response context
instead of generic SyntaxError crashes.

Add regression coverage for HTML responses without ActivityPub alternates
and keep the ReDoS regression test aligned with the new error path.

Fixes https://github.com/fedify-dev/fedify/issues/912

Assisted-by: Codex:gpt-5.5
---
 CHANGES.md                                   | 11 ++++
 packages/vocab-runtime/src/docloader.test.ts | 58 ++++++++++++--------
 packages/vocab-runtime/src/docloader.ts      | 15 ++++-
 3 files changed, 59 insertions(+), 25 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index 8bd936766..e6314c274 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -20,6 +20,17 @@ To be released.
 [#826]: https://github.com/fedify-dev/fedify/issues/826
 [#850]: https://github.com/fedify-dev/fedify/pull/850
 
+### @fedify/vocab-runtime
+
+ -  Changed `getDocumentLoader()` to throw a `FetchError` for HTML and XHTML
+    responses that neither advertise an ActivityPub alternate document nor
+    contain JSON, instead of surfacing a generic JSON parser `SyntaxError`.
+    Remote HTML error pages are now reported as document loading failures
+    with the response URL and content type.  [[#912], [#913]]
+
+[#912]: https://github.com/fedify-dev/fedify/issues/912
+[#913]: https://github.com/fedify-dev/fedify/pull/913
+
 
 Version 2.3.1
 -------------
diff --git a/packages/vocab-runtime/src/docloader.test.ts b/packages/vocab-runtime/src/docloader.test.ts
index e96d8c9ab..9a6ae56f9 100644
--- a/packages/vocab-runtime/src/docloader.test.ts
+++ b/packages/vocab-runtime/src/docloader.test.ts
@@ -90,7 +90,7 @@ test("getDocumentLoader()", async (t) => {
       type: "Object",
     },
     headers: {
-      "Content-Type": "text/html; charset=utf-8",
+      "Content-Type": "application/activity+json",
       Link: '<https://example.com/object>; rel="alternate"; ' +
         'type="application/ld+json; profile="https://www.w3.org/ns/activitystreams""',
     },
@@ -247,28 +247,40 @@ test("getDocumentLoader()", async (t) => {
     });
   });
 
-  fetchMock.get("https://example.com/wrong-content-type", {
-    body: {
-      "@context": "https://www.w3.org/ns/activitystreams",
-      id: "https://example.com/wrong-content-type",
-      name: "Fetched object",
-      type: "Object",
-    },
+  fetchMock.get("https://example.com/html-no-alternate", {
+    body: `<!DOCTYPE html>
+      <html>
+        <head>
+          <title>Not an ActivityPub document</title>
+        </head>
+        <body>Not found</body>
+      </html>`,
     headers: { "Content-Type": "text/html; charset=utf-8" },
   });
 
-  await t.test("Wrong Content-Type", async () => {
-    deepStrictEqual(
-      await fetchDocumentLoader("https://example.com/wrong-content-type"),
-      {
-        contextUrl: null,
-        documentUrl: "https://example.com/wrong-content-type",
-        document: {
-          "@context": "https://www.w3.org/ns/activitystreams",
-          id: "https://example.com/wrong-content-type",
-          name: "Fetched object",
-          type: "Object",
-        },
+  await t.test("HTML without ActivityPub alternate link", async () => {
+    await rejects(
+      () => fetchDocumentLoader("https://example.com/html-no-alternate"),
+      (error) => {
+        ok(error instanceof FetchError);
+        ok(
+          error.message.includes(
+            "HTML document has no ActivityPub alternate link",
+          ),
+        );
+        ok(
+          error.message.includes("Content-Type: text/html; charset=utf-8"),
+        );
+        deepStrictEqual(
+          error.url,
+          new URL("https://example.com/html-no-alternate"),
+        );
+        ok(error.response != null);
+        deepStrictEqual(
+          error.response.headers.get("Content-Type"),
+          "text/html; charset=utf-8",
+        );
+        return true;
       },
     );
   });
@@ -459,11 +471,11 @@ test("getDocumentLoader()", async (t) => {
 
   await t.test("ReDoS resistance (CVE-2025-68475)", async () => {
     const start = performance.now();
-    // The malicious HTML will fail JSON parsing, but the important thing is
-    // that it should complete quickly (not hang due to ReDoS)
+    // The malicious HTML will fail alternate discovery, but the important
+    // thing is that it should complete quickly (not hang due to ReDoS).
     await rejects(
       () => fetchDocumentLoader("https://example.com/redos"),
-      SyntaxError,
+      FetchError,
     );
     const elapsed = performance.now() - start;
 
diff --git a/packages/vocab-runtime/src/docloader.ts b/packages/vocab-runtime/src/docloader.ts
index 0cca8d6bf..6ed1a99e6 100644
--- a/packages/vocab-runtime/src/docloader.ts
+++ b/packages/vocab-runtime/src/docloader.ts
@@ -202,13 +202,19 @@ export async function getRemoteDocument(
   ) {
     // Security: Limit HTML response size to mitigate ReDoS attacks
     const MAX_HTML_SIZE = 1024 * 1024; // 1MB
+    const errorResponse = response.clone();
     const html = await response.text();
     if (html.length > MAX_HTML_SIZE) {
       logger.warn(
         "HTML response too large, skipping alternate link discovery: {url}",
         { url: documentUrl, size: html.length },
       );
-      document = JSON.parse(html);
+      throw new FetchError(
+        documentUrl,
+        `HTML document is too large to scan for an ActivityPub alternate link ` +
+          `(Content-Type: ${contentType})`,
+        errorResponse,
+      );
     } else {
       // Safe regex patterns without nested quantifiers to prevent ReDoS
       // (CVE-2025-68475)
@@ -247,7 +253,12 @@ export async function getRemoteDocument(
           return await fetch(new URL(attribs.href, docUrl).href);
         }
       }
-      document = JSON.parse(html);
+      throw new FetchError(
+        documentUrl,
+        `HTML document has no ActivityPub alternate link ` +
+          `(Content-Type: ${contentType})`,
+        errorResponse,
+      );
     }
   } else {
     document = await response.json();

From d248bb18c96b9f74557c77dceac2747bd6058f64 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Wed, 1 Jul 2026 09:07:10 +0900
Subject: [PATCH 2/4] Preserve HTML-labelled JSON

Keep the HTML document-loader fallback narrow by parsing responses that
look like JSON even when they are served with an HTML Content-Type.  This
preserves compatibility with misconfigured ActivityPub endpoints while
still reporting real HTML pages without alternates as FetchError failures.

Read HTML responses through a bounded stream instead of buffering the full
body before checking the scan limit.  Oversized responses now short-circuit
from Content-Length when available, or while reading when the stream crosses
the limit.

https://github.com/fedify-dev/fedify/pull/913#discussion_r3502470739
https://github.com/fedify-dev/fedify/pull/913#discussion_r3502486555

Assisted-by: Codex:gpt-5.5
---
 packages/vocab-runtime/src/docloader.test.ts | 55 ++++++++++++++
 packages/vocab-runtime/src/docloader.ts      | 76 ++++++++++++++++----
 2 files changed, 118 insertions(+), 13 deletions(-)

diff --git a/packages/vocab-runtime/src/docloader.test.ts b/packages/vocab-runtime/src/docloader.test.ts
index 9a6ae56f9..f40d63db7 100644
--- a/packages/vocab-runtime/src/docloader.test.ts
+++ b/packages/vocab-runtime/src/docloader.test.ts
@@ -285,6 +285,61 @@ test("getDocumentLoader()", async (t) => {
     );
   });
 
+  fetchMock.get("https://example.com/wrong-content-type", {
+    body: {
+      "@context": "https://www.w3.org/ns/activitystreams",
+      id: "https://example.com/wrong-content-type",
+      name: "Fetched object",
+      type: "Object",
+    },
+    headers: { "Content-Type": "text/html; charset=utf-8" },
+  });
+
+  await t.test("wrong Content-Type with JSON body", async () => {
+    deepStrictEqual(
+      await fetchDocumentLoader("https://example.com/wrong-content-type"),
+      {
+        contextUrl: null,
+        documentUrl: "https://example.com/wrong-content-type",
+        document: {
+          "@context": "https://www.w3.org/ns/activitystreams",
+          id: "https://example.com/wrong-content-type",
+          name: "Fetched object",
+          type: "Object",
+        },
+      },
+    );
+  });
+
+  fetchMock.get("https://example.com/large-html", {
+    body: "<!DOCTYPE html>",
+    headers: {
+      "Content-Length": String(1024 * 1024 + 1),
+      "Content-Type": "text/html; charset=utf-8",
+    },
+  });
+
+  await t.test("HTML Content-Length over limit", async () => {
+    await rejects(
+      () => fetchDocumentLoader("https://example.com/large-html"),
+      (error) => {
+        ok(error instanceof FetchError);
+        ok(
+          error.message.includes(
+            "HTML document is too large to scan for an ActivityPub alternate link",
+          ),
+        );
+        ok(error.response != null);
+        deepStrictEqual(error.response.status, 200);
+        deepStrictEqual(
+          error.response.headers.get("Content-Type"),
+          "text/html; charset=utf-8",
+        );
+        return true;
+      },
+    );
+  });
+
   fetchMock.get("https://example.com/404", { status: 404 });
 
   await t.test("not ok", async () => {
diff --git a/packages/vocab-runtime/src/docloader.ts b/packages/vocab-runtime/src/docloader.ts
index 6ed1a99e6..319ef6723 100644
--- a/packages/vocab-runtime/src/docloader.ts
+++ b/packages/vocab-runtime/src/docloader.ts
@@ -13,6 +13,7 @@ import { UrlError, validatePublicUrl } from "./url.ts";
 
 const logger = getLogger(["fedify", "runtime", "docloader"]);
 const DEFAULT_MAX_REDIRECTION = 20;
+const MAX_HTML_SIZE = 1024 * 1024; // 1MB
 
 /**
  * A remote JSON-LD document and its context fetched by
@@ -112,6 +113,52 @@ export type AuthenticatedDocumentLoaderFactory = (
   options?: DocumentLoaderFactoryOptions,
 ) => DocumentLoader;
 
+function createResponseMetadata(response: Response): Response {
+  return new Response(null, {
+    headers: response.headers,
+    status: response.status,
+    statusText: response.statusText,
+  });
+}
+
+async function readBoundedText(
+  response: Response,
+  maxBytes: number,
+): Promise<{ text: string; size: number; tooLarge: boolean }> {
+  const contentLength = response.headers.get("Content-Length");
+  if (contentLength != null) {
+    const size = Number(contentLength);
+    if (Number.isFinite(size) && size > maxBytes) {
+      return { text: "", size, tooLarge: true };
+    }
+  }
+
+  if (response.body == null) return { text: "", size: 0, tooLarge: false };
+
+  const reader = response.body.getReader();
+  const decoder = new TextDecoder();
+  let text = "";
+  let size = 0;
+  try {
+    while (true) {
+      const result = await reader.read();
+      if (result.done) break;
+      const chunkSize = result.value.byteLength;
+      if (size + chunkSize > maxBytes) {
+        size += chunkSize;
+        await reader.cancel();
+        return { text: "", size, tooLarge: true };
+      }
+      size += chunkSize;
+      text += decoder.decode(result.value, { stream: true });
+    }
+    text += decoder.decode();
+    return { text, size, tooLarge: false };
+  } finally {
+    reader.releaseLock();
+  }
+}
+
 /**
  * Gets a {@link RemoteDocument} from the given response.
  * @param url The URL of the document to load.
@@ -200,14 +247,12 @@ export async function getRemoteDocument(
       contentType === "application/xhtml+xml" ||
       contentType?.startsWith("application/xhtml+xml;"))
   ) {
-    // Security: Limit HTML response size to mitigate ReDoS attacks
-    const MAX_HTML_SIZE = 1024 * 1024; // 1MB
-    const errorResponse = response.clone();
-    const html = await response.text();
-    if (html.length > MAX_HTML_SIZE) {
+    const errorResponse = createResponseMetadata(response);
+    const html = await readBoundedText(response, MAX_HTML_SIZE);
+    if (html.tooLarge) {
       logger.warn(
         "HTML response too large, skipping alternate link discovery: {url}",
-        { url: documentUrl, size: html.length },
+        { url: documentUrl, size: html.size },
       );
       throw new FetchError(
         documentUrl,
@@ -225,7 +270,7 @@ export async function getRemoteDocument(
         /([a-z][a-z:_-]*)=(?:"([^"]*)"|'([^']*)'|([^\s>]+))/gi;
 
       let tagMatch: RegExpExecArray | null;
-      while ((tagMatch = tagPattern.exec(html)) !== null) {
+      while ((tagMatch = tagPattern.exec(html.text)) !== null) {
         const tagContent = tagMatch[2];
         let attrMatch: RegExpExecArray | null;
         const attribs: Record<string, string> = {};
@@ -253,12 +298,17 @@ export async function getRemoteDocument(
           return await fetch(new URL(attribs.href, docUrl).href);
         }
       }
-      throw new FetchError(
-        documentUrl,
-        `HTML document has no ActivityPub alternate link ` +
-          `(Content-Type: ${contentType})`,
-        errorResponse,
-      );
+      const trimmed = html.text.trimStart();
+      if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
+        document = JSON.parse(html.text);
+      } else {
+        throw new FetchError(
+          documentUrl,
+          `HTML document has no ActivityPub alternate link ` +
+            `(Content-Type: ${contentType})`,
+          errorResponse,
+        );
+      }
     }
   } else {
     document = await response.json();

From 58fa6800cbf022d332f6ee793237d837b7108109 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Wed, 1 Jul 2026 09:33:08 +0900
Subject: [PATCH 3/4] Harden bounded HTML reads

Cancel oversized response bodies when Content-Length already exceeds the
HTML scan limit, so document-loader failures do not leave the fetch stream
open while returning early.

Fall back to response.text() when a custom or non-standard response body
lacks getReader().  This keeps the bounded reader usable with fetch mocks
and older runtimes while preserving the streaming path where available.

https://github.com/fedify-dev/fedify/pull/913#discussion_r3502514680
https://github.com/fedify-dev/fedify/pull/913#discussion_r3502519526

Assisted-by: Codex:gpt-5.5
---
 packages/vocab-runtime/src/docloader.test.ts | 63 +++++++++++++++++++-
 packages/vocab-runtime/src/docloader.ts      | 23 ++++++-
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/packages/vocab-runtime/src/docloader.test.ts b/packages/vocab-runtime/src/docloader.test.ts
index f40d63db7..4048a4012 100644
--- a/packages/vocab-runtime/src/docloader.test.ts
+++ b/packages/vocab-runtime/src/docloader.test.ts
@@ -2,7 +2,7 @@ import fetchMock from "fetch-mock";
 import { deepStrictEqual, ok, rejects } from "node:assert";
 import { test } from "node:test";
 import preloadedContexts from "./contexts.ts";
-import { getDocumentLoader } from "./docloader.ts";
+import { getDocumentLoader, getRemoteDocument } from "./docloader.ts";
 import { FetchError } from "./request.ts";
 import { UrlError } from "./url.ts";
 
@@ -340,6 +340,67 @@ test("getDocumentLoader()", async (t) => {
     );
   });
 
+  await t.test("HTML Content-Length over limit cancels body", async () => {
+    let canceled = false;
+    const response = new Response("<!DOCTYPE html>", {
+      headers: {
+        "Content-Length": String(1024 * 1024 + 1),
+        "Content-Type": "text/html; charset=utf-8",
+      },
+    });
+    Object.defineProperty(response, "body", {
+      value: {
+        cancel: () => {
+          canceled = true;
+        },
+      },
+    });
+    await rejects(
+      () =>
+        getRemoteDocument(
+          "https://example.com/large-html-cancel",
+          response,
+          () => {
+            throw new Error("unexpected alternate fetch");
+          },
+        ),
+      FetchError,
+    );
+    deepStrictEqual(canceled, true);
+  });
+
+  await t.test("HTML body without getReader falls back to text", async () => {
+    const response = new Response(
+      JSON.stringify({
+        "@context": "https://www.w3.org/ns/activitystreams",
+        id: "https://example.com/body-without-get-reader",
+        name: "Fetched object",
+        type: "Object",
+      }),
+      { headers: { "Content-Type": "text/html; charset=utf-8" } },
+    );
+    Object.defineProperty(response, "body", { value: {} });
+    deepStrictEqual(
+      await getRemoteDocument(
+        "https://example.com/body-without-get-reader",
+        response,
+        () => {
+          throw new Error("unexpected alternate fetch");
+        },
+      ),
+      {
+        contextUrl: null,
+        documentUrl: "https://example.com/body-without-get-reader",
+        document: {
+          "@context": "https://www.w3.org/ns/activitystreams",
+          id: "https://example.com/body-without-get-reader",
+          name: "Fetched object",
+          type: "Object",
+        },
+      },
+    );
+  });
+
   fetchMock.get("https://example.com/404", { status: 404 });
 
   await t.test("not ok", async () => {
diff --git a/packages/vocab-runtime/src/docloader.ts b/packages/vocab-runtime/src/docloader.ts
index 319ef6723..35a9c11b8 100644
--- a/packages/vocab-runtime/src/docloader.ts
+++ b/packages/vocab-runtime/src/docloader.ts
@@ -121,6 +121,13 @@ function createResponseMetadata(response: Response): Response {
   });
 }
 
+async function cancelResponseBody(response: Response): Promise<void> {
+  const body = response.body as { cancel?: unknown } | null;
+  if (body != null && typeof body.cancel === "function") {
+    await body.cancel();
+  }
+}
+
 async function readBoundedText(
   response: Response,
   maxBytes: number,
@@ -129,13 +136,27 @@ async function readBoundedText(
   if (contentLength != null) {
     const size = Number(contentLength);
     if (Number.isFinite(size) && size > maxBytes) {
+      await cancelResponseBody(response);
       return { text: "", size, tooLarge: true };
     }
   }
 
   if (response.body == null) return { text: "", size: 0, tooLarge: false };
 
-  const reader = response.body.getReader();
+  const body = response.body as ReadableStream<Uint8Array> & {
+    getReader?: unknown;
+  };
+  if (typeof body.getReader !== "function") {
+    const text = await response.text();
+    const size = new TextEncoder().encode(text).byteLength;
+    return {
+      text: size <= maxBytes ? text : "",
+      size,
+      tooLarge: size > maxBytes,
+    };
+  }
+
+  const reader = body.getReader();
   const decoder = new TextDecoder();
   let text = "";
   let size = 0;

From aff4bb5d001da50fb393d8de85fee60b365dc8ce Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Wed, 1 Jul 2026 19:18:06 +0900
Subject: [PATCH 4/4] Simplify bounded HTML reads

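The bounded HTML path can rely on the standard Response body type instead
of defensive assertions for non-standard mocks, so the response.text()
fallback and its test are removed.  The Number.isFinite() guard on
Content-Length is dropped as well: NaN never compares greater than the
limit, and Infinity is rightly treated as oversized.  The parser now
preserves HTML-labelled JSON compatibility by attempting JSON.parse()
directly and converts syntax failures into the same FetchError used for
ordinary HTML.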

https://github.com/fedify-dev/fedify/pull/913#discussion_r3503329298
https://github.com/fedify-dev/fedify/pull/913#discussion_r3503341814
https://github.com/fedify-dev/fedify/pull/913#discussion_r3503350874
https://github.com/fedify-dev/fedify/pull/913#discussion_r3503555459
https://github.com/fedify-dev/fedify/pull/913#discussion_r3503590685

Assisted-by: Codex:gpt-5.5
---
 packages/vocab-runtime/src/docloader.test.ts | 32 --------------------
 packages/vocab-runtime/src/docloader.ts      | 28 +++++------------
 2 files changed, 7 insertions(+), 53 deletions(-)

diff --git a/packages/vocab-runtime/src/docloader.test.ts b/packages/vocab-runtime/src/docloader.test.ts
index 4048a4012..8ac8d719b 100644
--- a/packages/vocab-runtime/src/docloader.test.ts
+++ b/packages/vocab-runtime/src/docloader.test.ts
@@ -369,38 +369,6 @@ test("getDocumentLoader()", async (t) => {
     deepStrictEqual(canceled, true);
   });
 
-  await t.test("HTML body without getReader falls back to text", async () => {
-    const response = new Response(
-      JSON.stringify({
-        "@context": "https://www.w3.org/ns/activitystreams",
-        id: "https://example.com/body-without-get-reader",
-        name: "Fetched object",
-        type: "Object",
-      }),
-      { headers: { "Content-Type": "text/html; charset=utf-8" } },
-    );
-    Object.defineProperty(response, "body", { value: {} });
-    deepStrictEqual(
-      await getRemoteDocument(
-        "https://example.com/body-without-get-reader",
-        response,
-        () => {
-          throw new Error("unexpected alternate fetch");
-        },
-      ),
-      {
-        contextUrl: null,
-        documentUrl: "https://example.com/body-without-get-reader",
-        document: {
-          "@context": "https://www.w3.org/ns/activitystreams",
-          id: "https://example.com/body-without-get-reader",
-          name: "Fetched object",
-          type: "Object",
-        },
-      },
-    );
-  });
-
   fetchMock.get("https://example.com/404", { status: 404 });
 
   await t.test("not ok", async () => {
diff --git a/packages/vocab-runtime/src/docloader.ts b/packages/vocab-runtime/src/docloader.ts
index 35a9c11b8..f8a6966d1 100644
--- a/packages/vocab-runtime/src/docloader.ts
+++ b/packages/vocab-runtime/src/docloader.ts
@@ -122,9 +122,8 @@ function createResponseMetadata(response: Response): Response {
 }
 
 async function cancelResponseBody(response: Response): Promise<void> {
-  const body = response.body as { cancel?: unknown } | null;
-  if (body != null && typeof body.cancel === "function") {
-    await body.cancel();
+  if (response.body != null) {
+    await response.body.cancel();
   }
 }
 
@@ -135,7 +134,7 @@ async function readBoundedText(
   const contentLength = response.headers.get("Content-Length");
   if (contentLength != null) {
     const size = Number(contentLength);
-    if (Number.isFinite(size) && size > maxBytes) {
+    if (size > maxBytes) {
       await cancelResponseBody(response);
       return { text: "", size, tooLarge: true };
     }
@@ -143,20 +142,7 @@ async function readBoundedText(
 
   if (response.body == null) return { text: "", size: 0, tooLarge: false };
 
-  const body = response.body as ReadableStream<Uint8Array> & {
-    getReader?: unknown;
-  };
-  if (typeof body.getReader !== "function") {
-    const text = await response.text();
-    const size = new TextEncoder().encode(text).byteLength;
-    return {
-      text: size <= maxBytes ? text : "",
-      size,
-      tooLarge: size > maxBytes,
-    };
-  }
-
-  const reader = body.getReader();
+  const reader = response.body.getReader();
   const decoder = new TextDecoder();
   let text = "";
   let size = 0;
@@ -319,10 +305,10 @@ export async function getRemoteDocument(
           return await fetch(new URL(attribs.href, docUrl).href);
         }
       }
-      const trimmed = html.text.trimStart();
-      if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
+      try {
         document = JSON.parse(html.text);
-      } else {
+      } catch (error) {
+        if (!(error instanceof SyntaxError)) throw error;
         throw new FetchError(
           documentUrl,
           `HTML document has no ActivityPub alternate link ` +