Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .changeset/fix-ssr-content-stripping.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
"mcp-server-fetch": patch
---

Fix mcp-server-fetch dropping SSR content from streaming/progressive rendering sites

When a page uses progressive server-side rendering with a streaming architecture, the SSR content may be injected as hidden elements (e.g. `visibility:hidden; position:absolute; top:-9999px`). Readability's content extraction algorithm treats these elements as non-content and strips them entirely, so most of the page content is silently dropped.

The fix adds a fallback mechanism: if Readability returns no content, or content shorter than 5% of the original HTML length, we convert the raw HTML to markdown instead. This preserves the current behavior for normal pages while automatically recovering when Readability strips too aggressively.

Users fetching pages from sites using Next.js streaming, Remix deferred, or similar SSR patterns will now get the full page content instead of just the loading shell.
25 changes: 18 additions & 7 deletions src/fetch/src/mcp_server_fetch/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,24 @@ def extract_content_from_html(html: str) -> str:
ret = readabilipy.simple_json.simple_json_from_html_string(
html, use_readability=True
)
if not ret["content"]:
return "<error>Page failed to be simplified from HTML</error>"
content = markdownify.markdownify(
ret["content"],
heading_style=markdownify.ATX,
)
return content
content_html = ret.get("content", "")

# If Readability extracted very little compared to input,
# it likely stripped hidden SSR content — fall back to raw conversion.
# This handles streaming/SSR sites where content is injected hidden
# and Readability treats it as non-content.
if not content_html or len(content_html) < len(html) * 0.05:
content = markdownify.markdownify(
html,
heading_style=markdownify.ATX,
)
else:
content = markdownify.markdownify(
content_html,
heading_style=markdownify.ATX,
)

return content if content.strip() else "<error>Page failed to be simplified from HTML</error>"


def get_robots_txt_url(url: str) -> str:
Expand Down
51 changes: 51 additions & 0 deletions src/fetch/tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,57 @@ def test_empty_content_returns_error(self):
result = extract_content_from_html(html)
assert "<error>" in result

def test_ssr_hidden_content_falls_back_to_raw(self):
    """Test that a near-empty Readability result triggers the raw-HTML fallback.

    Streaming/SSR sites inject content hidden (visibility:hidden, position:absolute,
    top:-9999px) which Readability strips. When this happens, we fall back to
    converting the raw HTML so the content is not silently dropped.

    Readability itself is mocked here, so the test exercises only the size
    threshold in extract_content_from_html, not Readability's real handling
    of hidden elements.
    """
    import readabilipy.simple_json

    def mock_readability(html, **kwargs):
        # Simulate Readability stripping hidden content and returning only
        # the visible loading shell (tiny fraction of the HTML).
        return {
            "content": "<p>Loading...</p>",  # tiny content = fallback triggers
            "title": None,
            "byline": None,
            "date": None,
        }

    # SSR page whose real content lives in a visibility:hidden div. The mocked
    # Readability result above is shorter than 5% of this input, which is the
    # condition that makes extract_content_from_html convert the raw HTML.
    html = """<!DOCTYPE html>
<html>
<head><title>Streaming SSR Page</title></head>
<body>
<div class="loading-shell">
<h1>Loading...</h1>
<p>This is the loading shell visible initially</p>
</div>
<div id="ssr-content" style="visibility:hidden; position:absolute; top:-9999px;">
<article>
<h1>Actual Page Title</h1>
<p>This is the real content that was rendered server-side but hidden</p>
<ul>
<li>Feature one</li>
<li>Feature two</li>
<li>Feature three</li>
</ul>
</article>
</div>
</body>
</html>"""
    # patch.object restores the real function on exit, so no manual
    # save/restore of the original attribute is needed.
    with patch.object(readabilipy.simple_json, 'simple_json_from_html_string', mock_readability):
        result = extract_content_from_html(html)

    # The fallback should preserve the hidden SSR content.
    assert "Actual Page Title" in result
    assert "Feature one" in result
    assert "Feature two" in result
    assert "Feature three" in result

class TestCheckMayAutonomouslyFetchUrl:
"""Tests for check_may_autonomously_fetch_url function."""
Expand Down
Loading