From 3d9267da999e28304d1955ad40c4879df7a175ae Mon Sep 17 00:00:00 2001
From: huangzhimou <huangzhimou@example.com>
Date: Wed, 22 Apr 2026 11:17:05 +0800
Subject: [PATCH] fix(fetch): fallback when Readability strips too much content

When Readability returns no content, or content shorter than 5% of the
original HTML, convert the raw HTML with markdownify instead, so that
content from chunked/streaming SSR responses is preserved.

Closes #3878
---
 .changeset/fix-ssr-content-stripping.md  | 11 +++++
 src/fetch/src/mcp_server_fetch/server.py | 25 ++++++++----
 src/fetch/tests/test_server.py           | 51 ++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 7 deletions(-)
 create mode 100644 .changeset/fix-ssr-content-stripping.md
diff --git a/.changeset/fix-ssr-content-stripping.md b/.changeset/fix-ssr-content-stripping.md
new file mode 100644
index 0000000000..763dc195ef
--- /dev/null
+++ b/.changeset/fix-ssr-content-stripping.md
@@ -0,0 +1,11 @@
+---
+"mcp-server-fetch": patch
+---
+
+Fix mcp-server-fetch dropping SSR content from streaming/progressive rendering sites
+
+When a page uses progressive (streaming) server-side rendering, the SSR content may be injected inside hidden elements (e.g. `visibility:hidden; position:absolute; top:-9999px`). Readability's content extraction algorithm treats these elements as non-content and strips them entirely, so the majority of the page content is silently dropped.
+
+The fix adds a fallback mechanism: if Readability returns no content, or content shorter than 5% of the original HTML length, the raw HTML is converted to markdown instead. Normal pages keep their current behavior, while pages that Readability strips too aggressively are recovered automatically.
+
+Users fetching pages from sites using Next.js streaming, Remix deferred, or similar SSR patterns will now get the full page content instead of just the loading shell.
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..71717e117b 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -36,13 +36,24 @@ def extract_content_from_html(html: str) -> str:
     ret = readabilipy.simple_json.simple_json_from_html_string(
         html, use_readability=True
     )
-    if not ret["content"]:
-        return "<error>Page failed to be simplified from HTML</error>"
-    content = markdownify.markdownify(
-        ret["content"],
-        heading_style=markdownify.ATX,
-    )
-    return content
+    content_html = ret.get("content", "")
+
+    # If Readability extracted nothing, or less than 5% of the input length,
+    # it likely stripped hidden SSR content — fall back to converting the raw
+    # HTML. This handles streaming/SSR sites whose content is injected into
+    # hidden elements that Readability treats as non-content.
+    if not content_html or len(content_html) < len(html) * 0.05:
+        content = markdownify.markdownify(
+            html,
+            heading_style=markdownify.ATX,
+        )
+    else:
+        content = markdownify.markdownify(
+            content_html,
+            heading_style=markdownify.ATX,
+        )
+
+    return content if content.strip() else "<error>Page failed to be simplified from HTML</error>"
 
 
 def get_robots_txt_url(url: str) -> str:
diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
index 96c1cb38c7..d147235718 100644
--- a/src/fetch/tests/test_server.py
+++ b/src/fetch/tests/test_server.py
@@ -87,6 +87,57 @@ def test_empty_content_returns_error(self):
         result = extract_content_from_html(html)
         assert "<error>" in result
 
+    def test_ssr_hidden_content_falls_back_to_raw(self):
+        """Test that SSR content with visibility:hidden triggers fallback to raw HTML.
+
+        Streaming/SSR sites inject content hidden (visibility:hidden, position:absolute,
+        top:-9999px) which Readability strips. When this happens, we fall back to
+        converting the raw HTML so the content is not silently dropped.
+        """
+        # SSR page whose real content sits in a visibility:hidden div, which
+        # Readability would strip. When Readability returns minimal content
+        # (< 5% of the input HTML length), extract_content_from_html falls back.
+        # readabilipy is mocked to return such minimal content (simulating the stripping).
+        import readabilipy.simple_json
+        original_fn = readabilipy.simple_json.simple_json_from_html_string
+        
+        def mock_readability(html, **kwargs):
+            # Simulate Readability stripping hidden content and returning only
+            # the visible loading shell (tiny fraction of the HTML)
+            return {
+                "content": "<p>Loading...</p>",  # tiny content = fallback triggers
+                "title": None,
+                "byline": None,
+                "date": None,
+            }
+        
+        html = """<!DOCTYPE html>
+<html>
+<head><title>Streaming SSR Page</title></head>
+<body>
+    <div class="loading-shell">
+        <h1>Loading...</h1>
+        <p>This is the loading shell visible initially</p>
+    </div>
+    <div id="ssr-content" style="visibility:hidden; position:absolute; top:-9999px;">
+        <article>
+            <h1>Actual Page Title</h1>
+            <p>This is the real content that was rendered server-side but hidden</p>
+            <ul>
+                <li>Feature one</li>
+                <li>Feature two</li>
+                <li>Feature three</li>
+            </ul>
+        </article>
+    </div>
+</body>
+</html>"""
+        with patch.object(readabilipy.simple_json, 'simple_json_from_html_string', mock_readability):
+            result = extract_content_from_html(html)
+        # The fallback should preserve the hidden SSR content
+        assert "Actual Page Title" in result
+        assert "Feature one" in result
+        assert "Feature two" in result
 
 class TestCheckMayAutonomouslyFetchUrl:
     """Tests for check_may_autonomously_fetch_url function."""