From 3d9267da999e28304d1955ad40c4879df7a175ae Mon Sep 17 00:00:00 2001 From: huangzhimou Date: Wed, 22 Apr 2026 11:17:05 +0800 Subject: [PATCH] fix(fetch): fallback when Readability strips too much content When Readability returns <5% of original HTML, retry with raw markdownify as fallback to preserve chunked streaming SSR response content. Closes #3878 --- .changeset/fix-ssr-content-stripping.md | 11 +++++ src/fetch/src/mcp_server_fetch/server.py | 25 ++++++++---- src/fetch/tests/test_server.py | 51 ++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 7 deletions(-) create mode 100644 .changeset/fix-ssr-content-stripping.md diff --git a/.changeset/fix-ssr-content-stripping.md b/.changeset/fix-ssr-content-stripping.md new file mode 100644 index 0000000000..763dc195ef --- /dev/null +++ b/.changeset/fix-ssr-content-stripping.md @@ -0,0 +1,11 @@ +--- +"mcp-server-fetch": patch +--- + +Fix mcp-server-fetch dropping SSR content from streaming/progressive rendering sites + +When a page uses progressive server-side rendering with a streaming architecture, the SSR content may be injected in a hidden state (visibility:hidden, position:absolute, top:-9999px). Readability's content extraction algorithm treats these elements as non-content and strips them entirely, causing the majority of the page content to be silently dropped. + +The fix adds a fallback mechanism: if Readability returns no content, or content that is less than 5% of the original HTML length, we fall back to converting the raw HTML to markdown instead. This preserves the current behavior for normal pages while automatically recovering when Readability strips content too aggressively. + +Users fetching pages from sites using Next.js streaming, Remix deferred, or similar SSR patterns will now get the full page content instead of just the loading shell. 
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index b42c7b1f6b..71717e117b 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -36,13 +36,24 @@ def extract_content_from_html(html: str) -> str: ret = readabilipy.simple_json.simple_json_from_html_string( html, use_readability=True ) - if not ret["content"]: - return "Page failed to be simplified from HTML" - content = markdownify.markdownify( - ret["content"], - heading_style=markdownify.ATX, - ) - return content + content_html = ret.get("content", "") + + # If Readability extracted very little compared to input, + # it likely stripped hidden SSR content — fall back to raw conversion. + # This handles streaming/SSR sites where content is injected hidden + # and Readability treats it as non-content. + if not content_html or len(content_html) < len(html) * 0.05: + content = markdownify.markdownify( + html, + heading_style=markdownify.ATX, + ) + else: + content = markdownify.markdownify( + content_html, + heading_style=markdownify.ATX, + ) + + return content if content.strip() else "Page failed to be simplified from HTML" def get_robots_txt_url(url: str) -> str: diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py index 96c1cb38c7..d147235718 100644 --- a/src/fetch/tests/test_server.py +++ b/src/fetch/tests/test_server.py @@ -87,6 +87,57 @@ def test_empty_content_returns_error(self): result = extract_content_from_html(html) assert "" in result + def test_ssr_hidden_content_falls_back_to_raw(self): + """Test that SSR content with visibility:hidden triggers fallback to raw HTML. + + Streaming/SSR sites inject content hidden (visibility:hidden, position:absolute, + top:-9999px) which Readability strips. When this happens, we fall back to + converting the raw HTML so the content is not silently dropped. 
+ """ + # Large SSR page where Readability would strip the hidden content + # The actual content is in a visibility:hidden div that Readability strips. + # When Readability returns minimal content (< 5% of input HTML), we fall back. + # We mock readabilipy to return minimal content (simulating the SSR stripping). + import readabilipy.simple_json + original_fn = readabilipy.simple_json.simple_json_from_html_string + + def mock_readability(html, **kwargs): + # Simulate Readability stripping hidden content and returning only + # the visible loading shell (tiny fraction of the HTML) + return { + "content": "

Loading...

", # tiny content = fallback triggers + "title": None, + "byline": None, + "date": None, + } + + html = """ + +Streaming SSR Page + +
+

Loading...

+

This is the loading shell visible initially

+
+ + +""" + with patch.object(readabilipy.simple_json, 'simple_json_from_html_string', mock_readability): + result = extract_content_from_html(html) + # The fallback should preserve the hidden SSR content + assert "Actual Page Title" in result + assert "Feature one" in result + assert "Feature two" in result class TestCheckMayAutonomouslyFetchUrl: """Tests for check_may_autonomously_fetch_url function."""