diff --git a/docs/content/docs/framework/tools/builtin.en.mdx b/docs/content/docs/framework/tools/builtin.en.mdx index 28bdd630..4eeace1a 100644 --- a/docs/content/docs/framework/tools/builtin.en.mdx +++ b/docs/content/docs/framework/tools/builtin.en.mdx @@ -46,6 +46,7 @@ Built-in tools depend on Volcengine services. Enable the corresponding service a | `web_scraper` | Aggregated search (invite-only), code [here](https://github.com/volcengine/mcp-server/tree/main/server) | `from veadk.tools.builtin_tools.web_scraper import web_scraper` | | `vesearch` | Search via the [web-aware Q&A Agent](https://www.volcengine.com/docs/85508/1512748) | `from veadk.tools.builtin_tools.vesearch import vesearch` | | `link_reader` | Read and parse the content of web links | `from veadk.tools.builtin_tools.link_reader import link_reader` | +| `web_fetch` | Fetch a web page / PDF over HTTP and extract readable content (plain HTTP, no credentials) | `from veadk.tools.builtin_tools.web_fetch import web_fetch` | | `image_generate` | [Generate images](https://www.volcengine.com/docs/82379/1541523) from text | `from veadk.tools.builtin_tools.image_generate import image_generate` | | `image_edit` | [Edit images](https://www.volcengine.com/docs/82379/1541523) (image-to-image) | `from veadk.tools.builtin_tools.image_edit import image_edit` | | `video_generate` | [Generate videos](https://www.volcengine.com/docs/82379/1520757) from text | `from veadk.tools.builtin_tools.video_generate import video_generate` | @@ -201,6 +202,57 @@ Environment variables: - `MODEL_AGENT_API_KEY`: API key for the agent's reasoning model +## Web fetch (web_fetch) + +`web_fetch` does a plain HTTP GET on a given URL and extracts its readable content: HTML is converted to markdown or plain text, and PDFs are extracted to text via `pypdf`. It does **not** execute JavaScript, so pages that render entirely client-side or require login may come back incomplete. Unlike `link_reader`, this tool needs **no credentials of its own** (it is a plain HTTP fetch) — use it to let the agent read articles, docs, or any public URL the user references. + +Parameters: + +- `url`: the `http(s)` URL to fetch; +- `extract_mode`: `markdown` (default, keeps headings / links / lists) or `text` (plain text); +- `max_chars`: maximum characters of extracted content (default `50000`). + +Returns `{"url", "title", "content", "truncated"}`, or `{"error": ...}` on failure. + + +- **SSRF protection**: after DNS resolution it blocks private / loopback / link-local / reserved addresses, and re-validates every redirect hop (including ``), following at most 3 hops. +- **Limits**: 2 MB download cap for HTML (10 MB for PDFs); 30 s request timeout; results cached in-process for 15 minutes. +- No JavaScript rendering; no socket-level DNS pinning (resolve-then-revalidate only). + + +```python title="examples/tools/web_fetch/agent.py" +import asyncio + +from veadk import Agent, Runner +from veadk.memory.short_term_memory import ShortTermMemory +from veadk.tools.builtin_tools.web_fetch import web_fetch + +agent = Agent( + name="web_fetch_agent", + model_name="doubao-seed-1-8-251228", + description="An agent that reads web pages and PDFs.", + instruction="Use the web_fetch tool to fetch the given URL, then answer based on its content.", + tools=[web_fetch], +) + +runner = Runner(agent=agent, short_term_memory=ShortTermMemory()) + + +async def main(): + response = await runner.run( + "Fetch https://arxiv.org/pdf/1706.03762 and summarize the paper's core idea" + ) + print(response) + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +Environment variables: + +- `MODEL_AGENT_API_KEY`: API key for the agent's reasoning model (the `web_fetch` tool itself needs no extra credentials) + ## Image generation (image_generate) `image_generate` generates images from text. For image-to-image editing, see `image_edit` below. diff --git a/docs/content/docs/framework/tools/builtin.mdx b/docs/content/docs/framework/tools/builtin.mdx index 634bdf5b..e034a448 100644 --- a/docs/content/docs/framework/tools/builtin.mdx +++ b/docs/content/docs/framework/tools/builtin.mdx @@ -46,6 +46,7 @@ agent = Agent(tools=[web_search]) | `web_scraper` | 聚合搜索(邀测),代码见[此处](https://github.com/volcengine/mcp-server/tree/main/server) | `from veadk.tools.builtin_tools.web_scraper import web_scraper` | | `vesearch` | 调用[联网问答 Agent](https://www.volcengine.com/docs/85508/1512748) 进行搜索 | `from veadk.tools.builtin_tools.vesearch import vesearch` | | `link_reader` | 读取并解析网页链接内容 | `from veadk.tools.builtin_tools.link_reader import link_reader` | +| `web_fetch` | 直接抓取网页 / PDF 并抽取正文(纯 HTTP,工具自身无需凭证) | `from veadk.tools.builtin_tools.web_fetch import web_fetch` | | `image_generate` | 根据文本描述[生成图片](https://www.volcengine.com/docs/82379/1541523) | `from veadk.tools.builtin_tools.image_generate import image_generate` | | `image_edit` | [编辑图片](https://www.volcengine.com/docs/82379/1541523)(图生图) | `from veadk.tools.builtin_tools.image_edit import image_edit` | | `video_generate` | 根据文本描述[生成视频](https://www.volcengine.com/docs/82379/1520757) | `from veadk.tools.builtin_tools.video_generate import video_generate` | @@ -201,6 +202,57 @@ if __name__ == "__main__": - `MODEL_AGENT_API_KEY`:Agent 推理模型的 API Key +## 网页抓取(web_fetch) + +`web_fetch` 对给定 URL 发起一次普通 HTTP GET 并抽取正文:HTML 转 Markdown 或纯文本,PDF 用 `pypdf` 抽取文字。它**不执行 JavaScript**——纯前端渲染或需要登录的页面可能抽取不全。与 `link_reader` 不同,该工具**自身无需任何凭证**(纯 HTTP 抓取),适合让 Agent 阅读用户给出的文章、文档或任意公开链接。 + +参数: + +- `url`:要抓取的 `http(s)` 链接; +- `extract_mode`:`markdown`(默认,保留标题 / 链接 / 列表)或 `text`(纯文本); +- `max_chars`:抽取内容的最大字符数(默认 `50000`)。 + +返回 `{"url", "title", "content", "truncated"}`,失败时返回 `{"error": ...}`。 + + +- **SSRF 防护**:解析域名后拦截私网 / 环回 / 链路本地 / 保留地址,并对每一跳重定向(含 ``)重新校验,最多跟随 3 跳。 +- **上限**:HTML 下载上限 2MB、PDF 10MB;请求超时 30 秒;结果在进程内缓存 15 分钟。 +- 不渲染 JavaScript;未做 socket 级 DNS pinning(仅“解析后校验”)。 + + +```python title="examples/tools/web_fetch/agent.py" +import asyncio + +from veadk import Agent, Runner +from veadk.memory.short_term_memory import ShortTermMemory +from veadk.tools.builtin_tools.web_fetch import web_fetch + +agent = Agent( + name="web_fetch_agent", + model_name="doubao-seed-1-8-251228", + description="An agent that reads web pages and PDFs.", + instruction="Use the web_fetch tool to fetch the given URL, then answer based on its content.", + tools=[web_fetch], +) + +runner = Runner(agent=agent, short_term_memory=ShortTermMemory()) + + +async def main(): + response = await runner.run( + "抓取 https://arxiv.org/pdf/1706.03762 并总结这篇论文的核心思想" + ) + print(response) + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +环境变量: + +- `MODEL_AGENT_API_KEY`:Agent 推理模型的 API Key(`web_fetch` 工具本身无需额外凭证) + ## 图像生成(image_generate) `image_generate` 根据文本描述生成图片。图生图编辑见下文的 `image_edit`。 diff --git a/veadk/tools/builtin_tools/web_fetch.py b/veadk/tools/builtin_tools/web_fetch.py new file mode 100644 index 00000000..65c9493c --- /dev/null +++ b/veadk/tools/builtin_tools/web_fetch.py @@ -0,0 +1,382 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A plain-HTTP web fetch tool that extracts readable content. + +Does a plain HTTP GET and converts the HTML into bounded markdown/text. It does +NOT execute JavaScript — for JS-heavy / login-protected pages a headless browser +is needed instead. The design mirrors OpenClaw's `web_fetch`: Chrome-like +headers, SSRF protection (block private/internal addresses, re-validate every +redirect), a download-size cap, a coarse HTML→markdown extractor, and a short +in-memory cache. +""" + +import html as _html +import ipaddress +import re +import socket +import time +from urllib.parse import urljoin, urlparse + +import requests + +from google.adk.tools import ToolContext + +from veadk.utils.logger import get_logger + +logger = get_logger(__name__) + +# ---- limits / defaults (mirror OpenClaw's tools.web.fetch.* config) ---------- +_USER_AGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" +) +_ACCEPT_LANGUAGE = "en-US,en;q=0.9" +_TIMEOUT_SECONDS = 30 +_MAX_REDIRECTS = 3 +_MAX_RESPONSE_BYTES = 2_000_000 # cap the download before truncation (HTML) +_MAX_PDF_BYTES = 10_000_000 # higher cap for PDFs (truncation corrupts parsing) +_MAX_CHARS_CAP = 200_000 # hard ceiling for the max_chars parameter +_DEFAULT_MAX_CHARS = 50_000 +_CACHE_TTL_SECONDS = 15 * 60 +_CACHE_MAX_ENTRIES = 128 + +# Tiny in-process TTL cache: {(url, mode, max_chars): (expires_at, result)}. +_CACHE: dict[tuple[str, str, int], tuple[float, dict]] = {} + + +class _WebFetchError(Exception): + """Internal: surfaced to the model as {"error": ...}.""" + + +# ---------------------------------------------------------------- SSRF guard -- +def _assert_public_host(host: str) -> None: + """Resolve `host` and reject private/internal/loopback/link-local targets. + + Note: this validates the resolved addresses but does not pin the connection + to them, so a determined attacker controlling DNS could still race the + re-resolution (TOCTOU). It blocks the common SSRF cases; pinning the socket + to the validated IP would be the hardening follow-up. + """ + if not host: + raise _WebFetchError("missing host") + try: + infos = socket.getaddrinfo(host, None) + except socket.gaierror as e: + raise _WebFetchError(f"DNS resolution failed for {host!r}: {e}") + for info in infos: + addr = info[4][0] + try: + ip = ipaddress.ip_address(addr) + except ValueError: + continue + if ( + ip.is_private + or ip.is_loopback + or ip.is_link_local + or ip.is_reserved + or ip.is_multicast + or ip.is_unspecified + ): + raise _WebFetchError(f"Blocked non-public address for host {host!r}: {ip}") + + +def _check_url(url: str) -> str: + parsed = urlparse(url) + if parsed.scheme not in ("http", "https"): + raise _WebFetchError("only http(s) URLs are supported") + if not parsed.hostname: + raise _WebFetchError("URL has no host") + _assert_public_host(parsed.hostname) + return parsed.hostname + + +# ---------------------------------------------- coarse HTML -> markdown/text -- +def _normalize_whitespace(value: str) -> str: + value = value.replace("\r", "") + value = re.sub(r"[ \t]+\n", "\n", value) + value = re.sub(r"\n{3,}", "\n\n", value) + value = re.sub(r"[ \t]{2,}", " ", value) + return value.strip() + + +def _strip_tags(value: str) -> str: + return _html.unescape(re.sub(r"<[^>]+>", "", value)) + + +def _html_to_markdown(html_text: str) -> tuple[str, str | None]: + """Coarse HTML→markdown (ported from OpenClaw's `htmlToMarkdown`).""" + title_match = re.search( + r"]*>([\s\S]*?)", html_text, flags=re.IGNORECASE + ) + title = ( + _normalize_whitespace(_strip_tags(title_match.group(1))) + if title_match + else None + ) + + text = re.sub( + r"]*)?>", "", html_text, flags=re.IGNORECASE + ) + text = re.sub(r"]*)?>", "", text, flags=re.IGNORECASE) + text = re.sub( + r"]*)?>", "", text, flags=re.IGNORECASE + ) + + # Preserve link targets so fetched pages stay source-auditable. + def _link(m: re.Match) -> str: + href, body = m.group(1), m.group(2) + label = _normalize_whitespace(_strip_tags(body)) + return f"[{label}]({href})" if label else href + + text = re.sub( + r"]*href=[\"']([^\"']+)[\"'][^>]*>([\s\S]*?)", + _link, + text, + flags=re.IGNORECASE, + ) + + def _heading(m: re.Match) -> str: + level = max(1, min(6, int(m.group(1)))) + label = _normalize_whitespace(_strip_tags(m.group(2))) + return f"\n{'#' * level} {label}\n" + + text = re.sub( + r"]*>([\s\S]*?)", _heading, text, flags=re.IGNORECASE + ) + + def _li(m: re.Match) -> str: + label = _normalize_whitespace(_strip_tags(m.group(1))) + return f"\n- {label}" if label else "" + + text = re.sub(r"]*>([\s\S]*?)", _li, text, flags=re.IGNORECASE) + text = re.sub(r"<(br|hr)\s*/?>", "\n", text, flags=re.IGNORECASE) + text = re.sub( + r"", + "\n", + text, + flags=re.IGNORECASE, + ) + text = _strip_tags(text) + return _normalize_whitespace(text), title + + +def _meta_refresh_url(html_text: str) -> str | None: + """Return the target of a `` + tag, if present (the JS-free redirect used by shell pages like sina.com).""" + m = re.search( + r"]+http-equiv=[\"']?refresh[\"']?[^>]*content=[\"']?[^\"'>]*?url=([^\"'>\s]+)", + html_text[:4096], + flags=re.IGNORECASE, + ) + return _html.unescape(m.group(1)) if m else None + + +def _extract_pdf_text(raw: bytes) -> tuple[str, str | None] | None: + """Extract text from PDF bytes via pypdf. Returns (text, title), or None if + pypdf isn't installed.""" + try: + import io + + from pypdf import PdfReader + except ImportError: + return None + reader = PdfReader(io.BytesIO(raw)) + parts: list[str] = [] + for page in reader.pages: + try: + parts.append(page.extract_text() or "") + except Exception: # noqa: BLE001 - skip unparseable pages + continue + title = None + try: + if reader.metadata and reader.metadata.title: + title = str(reader.metadata.title) + except Exception: # noqa: BLE001 + title = None + return _normalize_whitespace("\n\n".join(parts)), title + + +def _markdown_to_text(markdown: str) -> str: + text = re.sub(r"!\[[^\]]*]\([^)]+\)", "", markdown) # images + text = re.sub(r"\[([^\]]+)]\([^)]+\)", r"\1", text) # links -> label + text = re.sub(r"`([^`]+)`", r"\1", text) # inline code + text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE) # headings + text = re.sub(r"^\s*[-*+]\s+", "", text, flags=re.MULTILINE) # bullets + text = re.sub(r"^\s*\d+\.\s+", "", text, flags=re.MULTILINE) # numbered + return _normalize_whitespace(text) + + +# ----------------------------------------------------------------- the tool -- +def web_fetch( + url: str, + extract_mode: str = "markdown", + max_chars: int = _DEFAULT_MAX_CHARS, + tool_context: ToolContext | None = None, +) -> dict: + """Fetch a web page over HTTP(S) and return its readable main content. + + Performs a plain HTTP GET (no JavaScript execution) and extracts the page's + readable text. Handles HTML pages (converted to markdown/text) and **PDF** + URLs (text extracted via pypdf). Follows HTTP and `` redirects. + Use it to read articles, docs, or any public URL the user references. For + pages that require login or render entirely via JavaScript, the content may + be incomplete. + + Args: + url: The http(s) URL to fetch. + extract_mode: "markdown" (default, keeps headings/links/lists) or "text" + (plain text with markdown decoration removed). + max_chars: Truncate the extracted content to at most this many characters. + + Returns: + A dict with keys: "url" (final URL after redirects), "title", "content", + and "truncated" (bool). On failure, a dict with an "error" key. + """ + mode = extract_mode if extract_mode in ("markdown", "text") else "markdown" + try: + max_chars = int(max_chars) + except (TypeError, ValueError): + max_chars = _DEFAULT_MAX_CHARS + max_chars = max(1, min(max_chars, _MAX_CHARS_CAP)) + + cache_key = (url, mode, max_chars) + cached = _CACHE.get(cache_key) + if cached and cached[0] > time.monotonic(): + return cached[1] + + try: + result = _fetch_and_extract(url, mode, max_chars) + except _WebFetchError as e: + logger.warning(f"web_fetch failed for {url!r}: {e}") + return {"error": str(e)} + except requests.RequestException as e: + logger.warning(f"web_fetch request error for {url!r}: {e}") + return {"error": f"request failed: {e}"} + + # Cache + bound the cache size. + if len(_CACHE) >= _CACHE_MAX_ENTRIES: + _CACHE.clear() + _CACHE[cache_key] = (time.monotonic() + _CACHE_TTL_SECONDS, result) + return result + + +def _read_capped(response: requests.Response, cap: int) -> bytes: + chunks: list[bytes] = [] + total = 0 + for chunk in response.iter_content(chunk_size=16_384): + if not chunk: + continue + chunks.append(chunk) + total += len(chunk) + if total >= cap: + break + return b"".join(chunks) + + +def _result(url: str, title: str | None, content: str, max_chars: int) -> dict: + return { + "url": url, + "title": title, + "content": content[:max_chars], + "truncated": len(content) > max_chars, + } + + +def _fetch_and_extract(url: str, mode: str, max_chars: int) -> dict: + session = requests.Session() + headers = { + "User-Agent": _USER_AGENT, + "Accept-Language": _ACCEPT_LANGUAGE, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + } + + current = url + hops = 0 + # Follow HTTP 3xx AND redirects, both bounded by the same hop + # budget, SSRF-checking every hop. + while True: + _check_url(current) + response = session.get( + current, + headers=headers, + timeout=_TIMEOUT_SECONDS, + allow_redirects=False, + stream=True, + ) + + if response.is_redirect and response.headers.get("location"): + response.close() + hops += 1 + if hops > _MAX_REDIRECTS: + raise _WebFetchError(f"too many redirects (>{_MAX_REDIRECTS})") + current = urljoin(current, response.headers["location"]) + continue + + if not response.ok: + status = response.status_code + response.close() + raise _WebFetchError(f"HTTP {status}") + + content_type = response.headers.get("content-type", "") + is_pdf = "pdf" in content_type.lower() + raw = _read_capped(response, _MAX_PDF_BYTES if is_pdf else _MAX_RESPONSE_BYTES) + response.close() + + # PDF (by content-type or magic bytes) -> extract text via pypdf. + if is_pdf or raw[:5] == b"%PDF-": + extracted = _extract_pdf_text(raw) + if extracted is None: + return { + "url": current, + "title": None, + "content": "[PDF detected — text extraction requires `pypdf`]", + "truncated": False, + } + text, title = extracted + return _result( + current, title, text or "[PDF had no extractable text]", max_chars + ) + + # Decode: honor charset from the header, else fall back to utf-8. + charset = "utf-8" + m = re.search(r"charset=([\w-]+)", content_type, flags=re.IGNORECASE) + if m: + charset = m.group(1) + try: + body = raw.decode(charset, errors="replace") + except LookupError: + body = raw.decode("utf-8", errors="replace") + + is_html = "html" in content_type.lower() or "xml" in content_type.lower() + + # Honor redirects on shell pages (e.g. sina.com). + if is_html and hops < _MAX_REDIRECTS: + meta = _meta_refresh_url(body) + if meta: + nxt = urljoin(current, meta) + if nxt != current: + hops += 1 + current = nxt + continue + + if not is_html: + # Other text-ish content: return as-is (truncated). + return _result(current, None, _normalize_whitespace(body), max_chars) + + markdown, title = _html_to_markdown(body) + content = markdown if mode == "markdown" else _markdown_to_text(markdown) + if not content: + content = _normalize_whitespace(_strip_tags(body)) + return _result(current, title, content, max_chars)