diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..aa91252510 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -3,6 +3,7 @@
import markdownify
import readabilipy.simple_json
+from bs4 import BeautifulSoup
from mcp.shared.exceptions import McpError
from mcp.server import Server
from mcp.server.stdio import stdio_server
@@ -27,22 +28,64 @@
def extract_content_from_html(html: str) -> str:
"""Extract and convert HTML content to Markdown format.
+ Uses Mozilla Readability via readabilipy as the primary extraction method.
+ Falls back to readabilipy without Readability (less aggressive filtering)
+ or direct markdownify conversion when Readability strips too much content,
+ which commonly happens with progressive SSR sites that deliver content in
+ hidden containers awaiting client-side hydration.
+
Args:
html: Raw HTML content to process
Returns:
Simplified markdown version of the content
"""
+ # Minimum expected content length as a fraction of input HTML.
+ # If extracted text is shorter than this, Readability likely stripped
+ # meaningful content (e.g. hidden SSR markup).
+ min_expected_length = max(1, len(html) // 100)
+
+ # Stage 1: Try Readability (best quality for standard pages)
ret = readabilipy.simple_json.simple_json_from_html_string(
html, use_readability=True
)
- if not ret["content"]:
- return "Page failed to be simplified from HTML"
+ content_html = ret.get("content", "")
+ if content_html:
+ content = markdownify.markdownify(
+ content_html,
+ heading_style=markdownify.ATX,
+ )
+ if len(content.strip()) >= min_expected_length:
+ return content
+
+ # Stage 2: Try readabilipy without Readability JS (less aggressive,
+ # does not filter by CSS visibility)
+ ret = readabilipy.simple_json.simple_json_from_html_string(
+ html, use_readability=False
+ )
+ content_html = ret.get("content", "")
+ if content_html:
+ content = markdownify.markdownify(
+ content_html,
+ heading_style=markdownify.ATX,
+ )
+ if len(content.strip()) >= min_expected_length:
+ return content
+
+ # Stage 3: Convert full HTML directly with markdownify (last resort).
+ # Strip