diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index b42c7b1f6b..aa91252510 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -3,6 +3,7 @@ import markdownify import readabilipy.simple_json +from bs4 import BeautifulSoup from mcp.shared.exceptions import McpError from mcp.server import Server from mcp.server.stdio import stdio_server @@ -27,22 +28,64 @@ def extract_content_from_html(html: str) -> str: """Extract and convert HTML content to Markdown format. + Uses Mozilla Readability via readabilipy as the primary extraction method. + Falls back to readabilipy without Readability (less aggressive filtering) + or direct markdownify conversion when Readability strips too much content, + which commonly happens with progressive SSR sites that deliver content in + hidden containers awaiting client-side hydration. + Args: html: Raw HTML content to process Returns: Simplified markdown version of the content """ + # Minimum expected content length as a fraction of input HTML. + # If extracted text is shorter than this, Readability likely stripped + # meaningful content (e.g. hidden SSR markup). + min_expected_length = max(1, len(html) // 100) + + # Stage 1: Try Readability (best quality for standard pages) ret = readabilipy.simple_json.simple_json_from_html_string( html, use_readability=True ) - if not ret["content"]: - return "Page failed to be simplified from HTML" + content_html = ret.get("content", "") + if content_html: + content = markdownify.markdownify( + content_html, + heading_style=markdownify.ATX, + ) + if len(content.strip()) >= min_expected_length: + return content + + # Stage 2: Try readabilipy without Readability JS (less aggressive, + # does not filter by CSS visibility) + ret = readabilipy.simple_json.simple_json_from_html_string( + html, use_readability=False + ) + content_html = ret.get("content", "") + if content_html: + content = markdownify.markdownify( + content_html, + heading_style=markdownify.ATX, + ) + if len(content.strip()) >= min_expected_length: + return content + + # Stage 3: Convert full HTML directly with markdownify (last resort). + # Strip