From 8f20af859252b0829d48d0dbd66e94e7fdd1980d Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Wed, 27 May 2026 22:52:27 +0200 Subject: [PATCH 1/3] feat: add AI SDK tools and crawl pages support --- .github/workflows/ci.yml | 7 + README.md | 16 + package.json | 7 +- packages/ai-sdk/README.md | 200 ++++++ packages/ai-sdk/examples/crawl-blog.ts | 77 +++ packages/ai-sdk/examples/hacker-news.ts | 15 + packages/ai-sdk/package.json | 43 ++ packages/ai-sdk/src/index.ts | 249 ++++++++ packages/ai-sdk/tsconfig.json | 12 + packages/ai-sdk/tsup.config.ts | 10 + src/index.ts | 111 +--- src/models.ts | 13 + src/schemas.ts | 780 ++++++++++++++++++++++-- src/scrapegraphai.ts | 33 +- src/types.ts | 745 +++++++++++----------- src/url.ts | 46 ++ tests/scrapegraphai.test.ts | 29 + 17 files changed, 1835 insertions(+), 558 deletions(-) create mode 100644 packages/ai-sdk/README.md create mode 100644 packages/ai-sdk/examples/crawl-blog.ts create mode 100644 packages/ai-sdk/examples/hacker-news.ts create mode 100644 packages/ai-sdk/package.json create mode 100644 packages/ai-sdk/src/index.ts create mode 100644 packages/ai-sdk/tsconfig.json create mode 100644 packages/ai-sdk/tsup.config.ts create mode 100644 src/models.ts create mode 100644 src/url.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a303845..039af49 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,6 +13,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: oven-sh/setup-bun@v2 + - name: Use local scrapegraph-js package + run: sed -i 's/"scrapegraph-js": "\^2.2.0"/"scrapegraph-js": "file:..\/.."/' packages/ai-sdk/package.json - run: bun install - run: bun run test @@ -22,5 +24,10 @@ jobs: steps: - uses: actions/checkout@v4 - uses: oven-sh/setup-bun@v2 + - name: Use local scrapegraph-js package + run: sed -i 's/"scrapegraph-js": "\^2.2.0"/"scrapegraph-js": "file:..\/.."/' packages/ai-sdk/package.json - run: bun install + - run: bun run build - run: bun run check + - run: cd packages/ai-sdk && bun run check + - run: cd packages/ai-sdk && bun run build diff --git a/README.md b/README.md index bee865e..26140aa 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,16 @@ bun add scrapegraph-js ## Quick Start +### API key + +Log in to the [ScrapeGraphAI dashboard](https://scrapegraphai.com/) to create an API key. The dashboard also shows your request history, usage, credits, and crawl/monitor activity. + +Set it in your environment: + +```bash +export SGAI_API_KEY=... +``` + ```ts import { ScrapeGraphAI } from "scrapegraph-js"; @@ -140,6 +150,12 @@ const start = await sgai.crawl.start({ // Check status const status = await sgai.crawl.get(start.data?.id!); +// Fetch paginated pages with resolved scrape results +const pages = await sgai.crawl.pages(start.data?.id!, { + cursor: 0, + limit: 50, +}); + // Control await sgai.crawl.stop(id); await sgai.crawl.resume(id); diff --git a/package.json b/package.json index cd6b353..705e24d 100644 --- a/package.json +++ b/package.json @@ -1,16 +1,17 @@ { "name": "scrapegraph-js", - "version": "2.1.0", + "version": "2.2.0", "description": "Official JavaScript/TypeScript SDK for the ScrapeGraph AI API — smart web scraping powered by AI", "type": "module", "main": "dist/index.js", "types": "dist/index.d.ts", "exports": { ".": { - "import": "./dist/index.js", - "types": "./dist/index.d.ts" + "types": "./dist/index.d.ts", + "import": "./dist/index.js" } }, + "workspaces": ["packages/*"], "scripts": { "dev": "tsup --watch", "build": "tsup", diff --git a/packages/ai-sdk/README.md b/packages/ai-sdk/README.md new file mode 100644 index 0000000..68d749f --- /dev/null +++ b/packages/ai-sdk/README.md @@ -0,0 +1,200 @@ +# ScrapeGraphAI AI SDK Tools + +[![npm version](https://badge.fury.io/js/%40scrapegraphai%2Fai-sdk.svg)](https://www.npmjs.com/package/@scrapegraphai/ai-sdk) +[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) + +

+ + ScrapeGraphAI AI SDK Tools + +

+ +Vercel [AI SDK](https://ai-sdk.dev/docs/introduction) tools for the ScrapeGraphAI API. + +## Install + +```bash +npm i @scrapegraphai/ai-sdk ai +# or +bun add @scrapegraphai/ai-sdk ai +``` + +`ai` is a peer dependency. Install the model provider package you use, for example: + +```bash +npm i @ai-sdk/openai +# or +bun add @ai-sdk/openai +``` + +## Quick Start + +### API key + +Log in to the [ScrapeGraphAI dashboard](https://scrapegraphai.com/) to create an API key. The dashboard also shows your request history, usage, credits, and crawl/monitor activity. + +Set it in your environment: + +```bash +export SGAI_API_KEY=... +``` + +Minimal scrape-only setup: + +```ts +import { openai } from "@ai-sdk/openai"; +import { generateText, stepCountIs } from "ai"; +import { scrapeTool } from "@scrapegraphai/ai-sdk"; + +const result = await generateText({ + model: openai("gpt-5-nano"), + prompt: "Find the main headline on https://example.com", + tools: { + scrape: scrapeTool(), + }, + stopWhen: stepCountIs(5), +}); + +console.log(result.text); +``` + +Use every ScrapeGraphAI tool group: + +```ts +import { openai } from "@ai-sdk/openai"; +import { generateText, stepCountIs } from "ai"; +import { + crawlTools, + extractTool, + monitorTools, + scrapeTool, + searchTool, +} from "@scrapegraphai/ai-sdk"; + +const result = await generateText({ + model: openai("gpt-5-nano"), + prompt: "Search for ScrapeGraphAI docs, scrape the best page, and summarize it.", + tools: { + scrape: scrapeTool(), + extract: extractTool(), + search: searchTool(), + ...crawlTools(), + ...monitorTools(), + }, + stopWhen: stepCountIs(10), +}); + +console.log(result.text); +``` + +Tools read `SGAI_API_KEY` from the environment by default. You can also pass it explicitly: + +```ts +const tools = { + scrape: scrapeTool({ apiKey: process.env.SGAI_API_KEY }), +}; +``` + +## Tools + +### scrapeTool + +Scrape a webpage with ScrapeGraphAI. Supports markdown, html, json extraction, links, images, summary, branding, and screenshots. + +```ts +import { scrapeTool } from "@scrapegraphai/ai-sdk"; + +const tools = { + scrape: scrapeTool(), +}; +``` + +### extractTool + +Extract structured JSON from a URL, HTML, or markdown with a natural-language prompt. + +```ts +import { extractTool } from "@scrapegraphai/ai-sdk"; + +const tools = { + extract: extractTool(), +}; +``` + +### searchTool + +Search the web and optionally extract structured data from search results. + +```ts +import { searchTool } from "@scrapegraphai/ai-sdk"; + +const tools = { + search: searchTool(), +}; +``` + +### crawlTools + +Start, poll, page through, stop, resume, and delete ScrapeGraphAI crawl jobs. + +```ts +import { crawlTools } from "@scrapegraphai/ai-sdk"; + +const tools = { + ...crawlTools(), +}; +``` + +Crawl page retrieval is paginated. Use `getCrawl` for status, then `getCrawlPages` for pages and resolved scrape results. + +```ts +const tools = { + startCrawl: startCrawlTool(), + getCrawl: getCrawlTool(), + getCrawlPages: getCrawlPagesTool(), +}; +``` + +### monitorTools + +Create, list, update, pause, resume, delete, and fetch activity for ScrapeGraphAI monitors. + +```ts +import { monitorTools } from "@scrapegraphai/ai-sdk"; + +const tools = { + ...monitorTools(), +}; +``` + +## Examples + +| Example | Description | +|---------|-------------| +| [`hacker-news.ts`](examples/hacker-news.ts) | Scrape Hacker News with AI SDK tools | +| [`crawl-blog.ts`](examples/crawl-blog.ts) | Crawl ScrapeGraphAI blog pages, fetch paginated crawl results, and summarize them | + +Run an example: + +```bash +OPENAI_API_KEY=... SGAI_API_KEY=... bun examples/crawl-blog.ts +``` + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `SGAI_API_KEY` | ScrapeGraphAI API key | +| `OPENAI_API_KEY` | Required by the OpenAI provider examples | + +## Development + +```bash +bun install +bun run build +bun run check +``` + +## License + +MIT - [ScrapeGraphAI](https://scrapegraphai.com) diff --git a/packages/ai-sdk/examples/crawl-blog.ts b/packages/ai-sdk/examples/crawl-blog.ts new file mode 100644 index 0000000..c6ee200 --- /dev/null +++ b/packages/ai-sdk/examples/crawl-blog.ts @@ -0,0 +1,77 @@ +import { openai } from "@ai-sdk/openai"; +import { generateText, stepCountIs, type ModelMessage } from "ai"; +import { stdin as input, stdout as output } from "node:process"; +import { createInterface } from "node:readline/promises"; +import { crawlTools } from "../src/index"; + +const initialPrompt = + "Find 10 https://scrapegraphai.com/ blog posts. Start a crawl, poll its status, fetch crawled pages with getCrawlPages, then summarize what you found."; +const messages: ModelMessage[] = []; +let activeController: AbortController | undefined; + +async function run(prompt: string) { + messages.push({ role: "user", content: prompt }); + const controller = new AbortController(); + activeController = controller; + + try { + const result = await generateText({ + model: openai("gpt-5-nano"), + messages, + tools: { ...crawlTools() }, + stopWhen: stepCountIs(20), + abortSignal: controller.signal, + onStepFinish: ({ text, toolCalls, toolResults }) => { + if (text) { + console.log(`\n[assistant]\n${text}`); + } + + for (const toolCall of toolCalls) { + console.log(`\n[tool] ${toolCall.toolName}`); + console.log(JSON.stringify(toolCall.input, null, 2)); + } + + for (const toolResult of toolResults) { + console.log(`\n[result] ${toolResult.toolName}`); + console.log(JSON.stringify(toolResult.output, null, 2)); + } + }, + }); + + messages.push(...result.response.messages); + console.log(`\n${result.text}\n`); + } catch (error) { + if (controller.signal.aborted) { + console.error("[aborted]"); + } else { + console.error(error instanceof Error ? error.message : error); + } + } finally { + if (activeController === controller) { + activeController = undefined; + } + } +} + +const rl = createInterface({ input, output }); + +process.on("SIGINT", () => { + output.write("\n"); + if (activeController) { + activeController.abort(); + return; + } + + rl.close(); + process.exit(0); +}); + +await run(initialPrompt); + +while (true) { + const prompt = (await rl.question("> ")).trim(); + + if (prompt) { + await run(prompt); + } +} diff --git a/packages/ai-sdk/examples/hacker-news.ts b/packages/ai-sdk/examples/hacker-news.ts new file mode 100644 index 0000000..ac92864 --- /dev/null +++ b/packages/ai-sdk/examples/hacker-news.ts @@ -0,0 +1,15 @@ +import { openai } from "@ai-sdk/openai"; +import { generateText, stepCountIs } from "ai"; +import { scrapeTool } from "../src/index"; + +const { text } = await generateText({ + model: openai("gpt-5-nano"), + prompt: + "Scrape Hacker News and write a short, concise summary of what people are talking about today.", + tools: { + scrape: scrapeTool(), + }, + stopWhen: stepCountIs(3), +}); + +console.log(text); diff --git a/packages/ai-sdk/package.json b/packages/ai-sdk/package.json new file mode 100644 index 0000000..934a518 --- /dev/null +++ b/packages/ai-sdk/package.json @@ -0,0 +1,43 @@ +{ + "name": "@scrapegraphai/ai-sdk", + "version": "0.1.0", + "description": "Vercel AI SDK tools integration for ScrapeGraphAI.", + "type": "module", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "files": ["dist"], + "scripts": { + "dev": "tsup --watch", + "build": "tsup", + "check": "tsc --noEmit", + "prepublishOnly": "bun run build" + }, + "keywords": ["scrapegraph", "ai-sdk", "tools", "scraping", "extraction"], + "author": "ScrapeGraph Team", + "license": "MIT", + "homepage": "https://scrapegraphai.com", + "repository": { + "type": "git", + "url": "https://github.com/ScrapeGraphAI/scrapegraph-sdk.git", + "directory": "packages/ai-sdk" + }, + "peerDependencies": { + "ai": ">=6" + }, + "dependencies": { + "scrapegraph-js": "^2.2.0", + "zod": "^4.3.6" + }, + "devDependencies": { + "@ai-sdk/openai": "^3.0.65", + "ai": "^6.0.191", + "tsup": "^8.3.6", + "typescript": "^5.8.2" + } +} diff --git a/packages/ai-sdk/src/index.ts b/packages/ai-sdk/src/index.ts new file mode 100644 index 0000000..17dd218 --- /dev/null +++ b/packages/ai-sdk/src/index.ts @@ -0,0 +1,249 @@ +import { tool } from "ai"; +import { + ScrapeGraphAI, + type ScrapeGraphAIInput, + crawlPagesQuerySchema, + crawlRequestSchema, + extractRequestBaseSchema, + monitorActivityQuerySchema, + monitorCreateSchema, + monitorUpdateSchema, + scrapeRequestSchema, + searchRequestSchema, +} from "scrapegraph-js"; +import { z } from "zod"; + +export type ScrapeGraphToolOptions = ScrapeGraphAIInput; + +const idSchema = z.object({ + id: z.string().min(1), +}); + +const monitorActivityInputSchema = z.object({ + id: z.string().min(1), + params: monitorActivityQuerySchema.optional(), +}); + +const monitorUpdateInputSchema = z.object({ + id: z.string().min(1), + params: monitorUpdateSchema, +}); + +const crawlPagesInputSchema = z.object({ + id: z.string().min(1), + params: crawlPagesQuerySchema.partial().optional(), +}); + +function unwrap(result: { status: "success" | "error"; data: T | null; error?: string }) { + if (result.status === "error") { + throw new Error(result.error ?? "ScrapeGraphAI request failed"); + } + + if (!result.data) { + throw new Error("ScrapeGraphAI request returned no data"); + } + + return result.data; +} + +export function scrapeTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: + "Scrape a webpage with ScrapeGraphAI. Supports markdown, html, json extraction, links, images, summary, branding, and screenshots.", + inputSchema: scrapeRequestSchema, + execute: async (input) => unwrap(await sgai.scrape(input)), + }); +} + +export function extractTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: + "Extract structured JSON from a URL, HTML, or markdown using ScrapeGraphAI and a natural-language prompt.", + inputSchema: extractRequestBaseSchema, + execute: async (input) => unwrap(await sgai.extract(input)), + }); +} + +export function searchTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: + "Search the web with ScrapeGraphAI and optionally extract structured data from the results.", + inputSchema: searchRequestSchema, + execute: async (input) => unwrap(await sgai.search(input)), + }); +} + +export function startCrawlTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: + 'Start an asynchronous ScrapeGraphAI crawl. Returns a crawl id. Poll getCrawlTool for status, then call getCrawlPagesTool to retrieve paginated pages and scrape results. When the user asks to crawl only a section or path slug, set includePatterns using glob-style URL patterns: "*/" for first-level paths and "**//**" for nested paths.', + inputSchema: crawlRequestSchema, + execute: async (input) => unwrap(await sgai.crawl.start(input)), + }); +} + +export function getCrawlTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: + "Get crawl status by crawl id. Use this after startCrawlTool for polling progress; use getCrawlPagesTool to retrieve paginated pages and scrape results.", + inputSchema: idSchema, + execute: async ({ id }) => unwrap(await sgai.crawl.get(id)), + }); +} + +export function getCrawlPagesTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: + "Get cursor-paginated crawled pages for a ScrapeGraphAI crawl by crawl id. Returned pages include resolved scrape results when available. Default pagination is cursor 0 and limit 50.", + inputSchema: crawlPagesInputSchema, + execute: async ({ id, params }) => unwrap(await sgai.crawl.pages(id, params)), + }); +} + +export function stopCrawlTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: "Stop a running ScrapeGraphAI crawl by crawl id.", + inputSchema: idSchema, + execute: async ({ id }) => unwrap(await sgai.crawl.stop(id)), + }); +} + +export function resumeCrawlTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: "Resume a paused ScrapeGraphAI crawl by crawl id.", + inputSchema: idSchema, + execute: async ({ id }) => unwrap(await sgai.crawl.resume(id)), + }); +} + +export function deleteCrawlTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: "Delete a ScrapeGraphAI crawl by crawl id.", + inputSchema: idSchema, + execute: async ({ id }) => unwrap(await sgai.crawl.delete(id)), + }); +} + +export function crawlTools(options?: ScrapeGraphToolOptions) { + return { + startCrawl: startCrawlTool(options), + getCrawl: getCrawlTool(options), + getCrawlPages: getCrawlPagesTool(options), + stopCrawl: stopCrawlTool(options), + resumeCrawl: resumeCrawlTool(options), + deleteCrawl: deleteCrawlTool(options), + }; +} + +export function createMonitorTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: + "Create an asynchronous ScrapeGraphAI monitor for a webpage. Returns a monitor id for status and activity checks.", + inputSchema: monitorCreateSchema, + execute: async (input) => unwrap(await sgai.monitor.create(input)), + }); +} + +export function listMonitorsTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: "List ScrapeGraphAI monitors.", + inputSchema: z.object({}), + execute: async () => unwrap(await sgai.monitor.list()), + }); +} + +export function getMonitorTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: "Get a ScrapeGraphAI monitor by monitor id.", + inputSchema: idSchema, + execute: async ({ id }) => unwrap(await sgai.monitor.get(id)), + }); +} + +export function updateMonitorTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: "Update a ScrapeGraphAI monitor by monitor id.", + inputSchema: monitorUpdateInputSchema, + execute: async ({ id, params }) => unwrap(await sgai.monitor.update(id, params)), + }); +} + +export function deleteMonitorTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: "Delete a ScrapeGraphAI monitor by monitor id.", + inputSchema: idSchema, + execute: async ({ id }) => unwrap(await sgai.monitor.delete(id)), + }); +} + +export function pauseMonitorTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: "Pause a ScrapeGraphAI monitor by monitor id.", + inputSchema: idSchema, + execute: async ({ id }) => unwrap(await sgai.monitor.pause(id)), + }); +} + +export function resumeMonitorTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: "Resume a paused ScrapeGraphAI monitor by monitor id.", + inputSchema: idSchema, + execute: async ({ id }) => unwrap(await sgai.monitor.resume(id)), + }); +} + +export function getMonitorActivityTool(options?: ScrapeGraphToolOptions) { + const sgai = ScrapeGraphAI(options); + + return tool({ + description: + "Get recent activity ticks for a ScrapeGraphAI monitor by monitor id. Use after creating or retrieving a monitor.", + inputSchema: monitorActivityInputSchema, + execute: async ({ id, params }) => unwrap(await sgai.monitor.activity(id, params)), + }); +} + +export function monitorTools(options?: ScrapeGraphToolOptions) { + return { + createMonitor: createMonitorTool(options), + listMonitors: listMonitorsTool(options), + getMonitor: getMonitorTool(options), + updateMonitor: updateMonitorTool(options), + deleteMonitor: deleteMonitorTool(options), + pauseMonitor: pauseMonitorTool(options), + resumeMonitor: resumeMonitorTool(options), + getMonitorActivity: getMonitorActivityTool(options), + }; +} diff --git a/packages/ai-sdk/tsconfig.json b/packages/ai-sdk/tsconfig.json new file mode 100644 index 0000000..229717c --- /dev/null +++ b/packages/ai-sdk/tsconfig.json @@ -0,0 +1,12 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + "rootDir": "src", + "outDir": "dist", + "paths": { + "scrapegraph-js": ["../../dist/index.d.ts"] + } + }, + "include": ["src"], + "exclude": ["node_modules", "dist", "tests"] +} diff --git a/packages/ai-sdk/tsup.config.ts b/packages/ai-sdk/tsup.config.ts new file mode 100644 index 0000000..337fac5 --- /dev/null +++ b/packages/ai-sdk/tsup.config.ts @@ -0,0 +1,10 @@ +import { defineConfig } from "tsup"; + +export default defineConfig({ + entry: ["src/index.ts"], + format: ["esm"], + dts: true, + clean: true, + target: "node22", + outDir: "dist", +}); diff --git a/src/index.ts b/src/index.ts index aa69918..2c8bd8e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -12,113 +12,6 @@ export { monitor, } from "./scrapegraphai.js"; -export type { - ApiResult, - Branding, - BrandingColors, - BrandingFontEntry, - BrandingImages, - BrandingMetadata, - BrandingPersonality, - BrandingTypography, - ChunkerMetadata, - ContentPageMetadata, - CrawlHistoryEntry, - CrawlPage, - CrawlPageStatus, - CrawlRequest, - CrawlResponse, - CrawlResult, - CrawlStatus, - CreditsJobs, - CreditsResponse, - ExtractHistoryEntry, - ExtractRequest, - ExtractResponse, - FetchConfig, - FetchContentType, - FetchMode, - FetchWarning, - FetchWarningReason, - FormatConfig, - FormatError, - FormatMetadataMap, - FormatResponseMap, - FormatType, - HealthResponse, - HistoryEntry, - HistoryFilter, - HistoryPage, - HistoryPagination, - HistoryStatus, - HtmlMode, - ImageChange, - ImageContentType, - JobsStatus, - JsonChange, - MarkdownFormatConfig, - HtmlFormatConfig, - ScreenshotFormatConfig, - JsonFormatConfig, - LinksFormatConfig, - ImagesFormatConfig, - SummaryFormatConfig, - BrandingFormatConfig, - MockConfig, - MonitorActivityRequest, - MonitorActivityResponse, - MonitorCreateRequest, - MonitorDiffs, - MonitorHistoryEntry, - MonitorRefs, - MonitorResponse, - MonitorResult, - MonitorTickEntry, - MonitorTickStatus, - MonitorUpdateRequest, - PageResponse, - ScrapeHistoryEntry, - ScrapeMetadata, - ScrapeRequest, - ScrapeResponse, - ScrapeResultMap, - ScreenshotData, - SearchHistoryEntry, - SearchMetadata, - SearchRequest, - SearchResponse, - SearchResult, - Service, - SetChange, - TextChange, - TimeRange, - TokenUsage, - WebhookStatus, -} from "./types.js"; +export type * from "./types.js"; -export { - brandingFormatConfigSchema, - crawlRequestSchema, - extractRequestSchema, - fetchConfigSchema, - fetchContentTypeSchema, - fetchModeSchema, - formatConfigSchema, - historyFilterSchema, - htmlFormatConfigSchema, - htmlModeSchema, - imagesFormatConfigSchema, - jsonFormatConfigSchema, - linksFormatConfigSchema, - markdownFormatConfigSchema, - mockConfigSchema, - monitorActivityRequestSchema, - monitorCreateRequestSchema, - monitorUpdateRequestSchema, - scrapeRequestSchema, - screenshotFormatConfigSchema, - searchRequestSchema, - serviceSchema, - summaryFormatConfigSchema, - timeRangeSchema, -} from "./schemas.js"; +export * from "./schemas.js"; diff --git a/src/models.ts b/src/models.ts new file mode 100644 index 0000000..2a63ee1 --- /dev/null +++ b/src/models.ts @@ -0,0 +1,13 @@ +export const MODEL_NAMES = [ + "gpt-4o-mini", + "gpt-4o-mini-2024-07-18", + "llama-3.3-70b-versatile", + "llama-3.1-8b-instant", + "mixtral-8x7b-32768", + "mistral-small-2501", + "gpt-oss-120b", + "openai/gpt-oss-120b", + "claude-haiku-4-5-20251001", +] as const; + +export type ModelName = (typeof MODEL_NAMES)[number]; diff --git a/src/schemas.ts b/src/schemas.ts index 13f9ae3..6a273c9 100644 --- a/src/schemas.ts +++ b/src/schemas.ts @@ -1,25 +1,15 @@ -import { z } from "zod"; +import { z } from "zod/v4"; +import { MODEL_NAMES } from "./models.js"; +import * as url from "./url.js"; -export const serviceSchema = z.enum(["scrape", "extract", "search", "monitor", "crawl"]); +// shared sub-schemas composed into route request schemas below +export const serviceEnumSchema = z.enum(["scrape", "extract", "search", "monitor", "crawl"]); +export const statusEnumSchema = z.enum(["completed", "failed"]); export const htmlModeSchema = z.enum(["normal", "reader", "prune"]); - -export const fetchModeSchema = z.enum(["auto", "fast", "js"]); - -export const timeRangeSchema = z.enum([ - "past_hour", - "past_24_hours", - "past_week", - "past_month", - "past_year", -]); - -export const crawlStatusSchema = z.enum(["running", "completed", "failed", "paused", "deleted"]); - -export const crawlPageStatusSchema = z.enum(["completed", "failed", "skipped"]); - export const fetchContentTypeSchema = z.enum([ "text/html", + "application/json", "application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.openxmlformats-officedocument.presentationml.presentation", @@ -39,10 +29,37 @@ export const fetchContentTypeSchema = z.enum([ "text/plain", "application/x-latex", ]); - export const userPromptSchema = z.string().min(1).max(10_000); -export const urlSchema = z.string().url(); +const PUBLIC_DOMAIN_RE = + /^(?=.{1,253}\.?$)(?:[a-z\d](?:[a-z\d-]{0,61}[a-z\d])?\.)+[a-z\d](?:[a-z\d-]{0,61}[a-z\d])?\.?$/i; +const IPV4_RE = /^(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)$/; + +export const urlSchema = z + .string() + .trim() + .transform((val) => (/^[a-z][a-z\d+.-]*:\/\//i.test(val) ? val : `https://${val}`)) + .pipe(z.url()) + .check( + z.refine((val) => { + try { + const { protocol, hostname } = new URL(val); + if (protocol !== "http:" && protocol !== "https:") return false; + if ( + !PUBLIC_DOMAIN_RE.test(hostname) && + !IPV4_RE.test(hostname) && + !hostname.includes(":") && + !(process.env.NODE_ENV === "development" && hostname === "localhost") + ) { + return false; + } + if (process.env.NODE_ENV === "development") return true; + return !url.isInternal(hostname); + } catch { + return false; + } + }, "Private or internal URLs are not allowed"), + ); export const paginationSchema = z.object({ page: z.coerce.number().int().positive().default(1), @@ -53,6 +70,8 @@ export const uuidParamSchema = z.object({ id: z.string().regex(/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i), }); +export const fetchModeSchema = z.enum(["auto", "fast", "js"]); + export const FETCH_CONFIG_DEFAULTS = { mode: "auto", stealth: false, @@ -61,14 +80,6 @@ export const FETCH_CONFIG_DEFAULTS = { scrolls: 0, } as const; -export const mockConfigSchema = z.object({ - minKb: z.number().int().min(1).max(1000).default(1), - maxKb: z.number().int().min(1).max(1000).default(5), - minSleep: z.number().int().min(0).max(30000).default(5), - maxSleep: z.number().int().min(0).max(30000).default(15), - writeToBucket: z.boolean().default(false), -}); - export const fetchConfigSchema = z.object({ mode: fetchModeSchema.default(FETCH_CONFIG_DEFAULTS.mode), stealth: z.boolean().default(FETCH_CONFIG_DEFAULTS.stealth), @@ -82,65 +93,122 @@ export const fetchConfigSchema = z.object({ .transform((v) => v.toLowerCase()) .optional(), scrolls: z.number().int().min(0).max(100).default(FETCH_CONFIG_DEFAULTS.scrolls), - mock: z.union([z.boolean(), mockConfigSchema]).default(false), + mock: z + .union([ + z.boolean(), + z.object({ + minKb: z.number().int().min(1).max(1000).default(1), + maxKb: z.number().int().min(1).max(1000).default(5), + minSleep: z.number().int().min(0).max(30000).default(5), + maxSleep: z.number().int().min(0).max(30000).default(15), + writeToBucket: z.boolean().default(false), + }), + ]) + .default(false), +}); + +export const chunkerSchema = z.object({ + size: z.union([z.number().int().min(2048), z.literal("dynamic")]).optional(), + overlap: z.number().int().min(0).max(512).optional(), +}); + +export const llmConfigSchema = z.object({ + model: z.enum(MODEL_NAMES).optional(), + temperature: z.number().min(0).max(1).default(0), + maxTokens: z.number().int().min(1).max(16384).default(16384), + chunker: chunkerSchema.optional(), }); +// route request schemas + export const historyFilterSchema = z.object({ page: z.coerce.number().int().positive().default(1), limit: z.coerce.number().int().min(1).max(100).default(20), - service: serviceSchema.optional(), + service: serviceEnumSchema.optional(), }); -export const markdownFormatConfigSchema = z.object({ - type: z.literal("markdown"), +export const scrapeContentFormatSchema = z.enum([ + "markdown", + "html", + "links", + "images", + "summary", + "json", + "branding", +]); +export const scrapeCaptureFormatSchema = z.enum(["screenshot"]); +export const scrapeFormatSchema = z.enum([ + ...scrapeContentFormatSchema.options, + ...scrapeCaptureFormatSchema.options, +]); + +export const markdownConfigSchema = z.object({ mode: htmlModeSchema.default("normal"), }); -export const htmlFormatConfigSchema = z.object({ - type: z.literal("html"), +export const htmlConfigSchema = z.object({ mode: htmlModeSchema.default("normal"), }); -export const screenshotFormatConfigSchema = z.object({ - type: z.literal("screenshot"), +export const screenshotConfigSchema = z.object({ fullPage: z.boolean().default(false), width: z.number().int().min(320).max(3840).default(1440), height: z.number().int().min(200).max(2160).default(900), quality: z.number().int().min(1).max(100).default(80), }); -export const jsonFormatConfigSchema = z.object({ - type: z.literal("json"), - prompt: userPromptSchema, +export const scrapeJsonConfigSchema = z.object({ + prompt: z.string().max(10_000).default(""), schema: z.record(z.string(), z.unknown()).optional(), + // llmConfig: llmConfigSchema.optional(), mode: htmlModeSchema.default("normal"), }); -export const linksFormatConfigSchema = z.object({ +export const scrapeSummaryConfigSchema = z.object({ + // llmConfig: llmConfigSchema.optional(), +}); + +export const scrapeMarkdownFormatSchema = markdownConfigSchema.extend({ + type: z.literal("markdown"), +}); + +export const scrapeHtmlFormatSchema = htmlConfigSchema.extend({ + type: z.literal("html"), +}); + +export const scrapeScreenshotFormatSchema = screenshotConfigSchema.extend({ + type: z.literal("screenshot"), +}); + +export const scrapeJsonFormatSchema = scrapeJsonConfigSchema.extend({ + type: z.literal("json"), +}); + +export const scrapeLinksFormatSchema = z.object({ type: z.literal("links"), }); -export const imagesFormatConfigSchema = z.object({ +export const scrapeImagesFormatSchema = z.object({ type: z.literal("images"), }); -export const summaryFormatConfigSchema = z.object({ +export const scrapeSummaryFormatSchema = scrapeSummaryConfigSchema.extend({ type: z.literal("summary"), }); -export const brandingFormatConfigSchema = z.object({ +export const scrapeBrandingFormatSchema = z.object({ type: z.literal("branding"), }); -export const formatConfigSchema = z.discriminatedUnion("type", [ - markdownFormatConfigSchema, - htmlFormatConfigSchema, - screenshotFormatConfigSchema, - jsonFormatConfigSchema, - linksFormatConfigSchema, - imagesFormatConfigSchema, - summaryFormatConfigSchema, - brandingFormatConfigSchema, +export const scrapeFormatEntrySchema = z.discriminatedUnion("type", [ + scrapeMarkdownFormatSchema, + scrapeHtmlFormatSchema, + scrapeScreenshotFormatSchema, + scrapeJsonFormatSchema, + scrapeLinksFormatSchema, + scrapeImagesFormatSchema, + scrapeSummaryFormatSchema, + scrapeBrandingFormatSchema, ]); export const scrapeRequestSchema = z.object({ @@ -148,15 +216,15 @@ export const scrapeRequestSchema = z.object({ contentType: fetchContentTypeSchema.optional(), fetchConfig: fetchConfigSchema.optional(), formats: z - .array(formatConfigSchema) - .min(1) + .array(scrapeFormatEntrySchema) + .min(1, { message: "Select at least one format" }) .refine((formats) => new Set(formats.map((format) => format.type)).size === formats.length, { message: "duplicate format types not allowed", }) .default([{ type: "markdown", mode: "normal" }]), }); -export const extractRequestSchema = z +export const extractRequestBaseSchema = z .object({ url: urlSchema.optional(), html: z.string().optional(), @@ -181,18 +249,68 @@ export const searchRequestSchema = z prompt: userPromptSchema.optional(), schema: z.record(z.string(), z.unknown()).optional(), locationGeoCode: z.string().max(10).optional(), - timeRange: timeRangeSchema.optional(), + timeRange: z + .enum(["past_hour", "past_24_hours", "past_week", "past_month", "past_year"]) + .optional(), }) .refine((d) => !d.schema || d.prompt, { message: "schema requires prompt", + path: ["prompt"], }); -export const monitorCreateRequestSchema = z.object({ +// ─── response schemas ─────────────────────────────────────────────────────── + +export const validateResponseSchema = z.object({ + email: z.email(), +}); + +export const okResponseSchema = z.object({ + ok: z.literal(true), +}); + +export const healthResponseSchema = z.object({ + status: z.enum(["ok", "degraded"]), + uptime: z.number().int().nonnegative(), + services: z + .object({ + redis: z.enum(["ok", "down"]), + db: z.enum(["ok", "down"]), + }) + .optional(), +}); + +export const tokenUsageSchema = z.object({ + promptTokens: z.number().int().nonnegative(), + completionTokens: z.number().int().nonnegative(), +}); + +export const chunkerMetadataSchema = z.object({ + chunks: z.array(z.object({ size: z.number().int().nonnegative() })), +}); + +export const jobsStatusSchema = z.object({ + used: z.number().int().nonnegative(), + limit: z.number().int().nonnegative(), +}); + +export const creditsResponseSchema = z.object({ + remaining: z.number().int(), + used: z.number().int(), + plan: z.string(), + jobs: z.object({ + crawl: jobsStatusSchema, + monitor: jobsStatusSchema, + }), +}); + +// ─── monitor schemas ──────────────────────────────────────────────────────── + +export const monitorCreateSchema = z.object({ url: urlSchema, name: z.string().max(200).optional(), formats: z - .array(formatConfigSchema) - .min(1) + .array(scrapeFormatEntrySchema) + .min(1, { message: "Select at least one format" }) .refine((formats) => new Set(formats.map((f) => f.type)).size === formats.length, { message: "duplicate format types not allowed", }) @@ -202,11 +320,11 @@ export const monitorCreateRequestSchema = z.object({ fetchConfig: fetchConfigSchema.optional(), }); -export const monitorUpdateRequestSchema = z +export const monitorUpdateSchema = z .object({ name: z.string().max(200).optional(), formats: z - .array(formatConfigSchema) + .array(scrapeFormatEntrySchema) .min(1) .refine((formats) => new Set(formats.map((f) => f.type)).size === formats.length, { message: "duplicate format types not allowed", @@ -218,16 +336,167 @@ export const monitorUpdateRequestSchema = z }) .partial(); -export const monitorActivityRequestSchema = z.object({ +export const monitorActivityQuerySchema = z.object({ + cursor: z.iso.datetime({ offset: true, local: true }).optional(), limit: z.coerce.number().int().min(1).max(100).default(20), - cursor: z.string().optional(), }); +// ─── history response schemas ─────────────────────────────────────────────── + +export const historyStatusSchema = z.enum(["completed", "failed", "running", "paused", "deleted"]); + +const historyBase = { + id: z.string(), + status: historyStatusSchema, + error: z.unknown(), + elapsedMs: z.number(), + createdAt: z.iso.datetime(), + requestParentId: z.string().nullable(), +}; + +export const paginationInfoSchema = z.object({ + page: z.number().int(), + limit: z.number().int(), + total: z.number().int(), +}); + +export const cursorPaginationInfoSchema = z.object({ + limit: z.number().int(), + nextCursor: z.string().nullable(), +}); + +export function pageResponseSchema(itemSchema: T) { + return z.object({ + data: z.array(itemSchema), + pagination: paginationInfoSchema, + }); +} + +export function cursorPageResponseSchema(itemSchema: T) { + return z.object({ + data: z.array(itemSchema), + pagination: cursorPaginationInfoSchema, + }); +} + +// ─── extract / search response schemas ────────────────────────────────────── + +export const extractResponseSchema = z.object({ + raw: z.string().nullable(), + json: z.record(z.string(), z.unknown()).nullable(), + usage: tokenUsageSchema, + metadata: z.object({ + chunker: chunkerMetadataSchema, + fetch: z.object({ provider: z.string().optional() }).optional(), + }), +}); + +export const searchResultSchema = z.object({ + url: z.string(), + title: z.string(), + content: z.string(), + provider: z.string().optional(), +}); + +export const searchMetadataSchema = z.object({ + search: z.object({ provider: z.string().optional() }), + pages: z.object({ requested: z.number().int(), scraped: z.number().int() }), + chunker: chunkerMetadataSchema.optional(), +}); + +export const searchResponseSchema = z.object({ + results: z.array(searchResultSchema), + json: z.record(z.string(), z.unknown()).nullable().optional(), + raw: z.string().nullable().optional(), + usage: tokenUsageSchema.optional(), + metadata: searchMetadataSchema, +}); + +// ─── monitor response schemas ─────────────────────────────────────────────── + +export const textChangeSchema = z.object({ + type: z.enum(["added", "removed"]), + line: z.number().int(), + content: z.string(), +}); + +export const jsonChangeSchema = z.object({ + path: z.string(), + old: z.unknown(), + new: z.unknown(), +}); + +export const setChangeSchema = z.object({ + added: z.array(z.string()), + removed: z.array(z.string()), +}); + +export const imageChangeSchema = z.object({ + size: z.number(), + changed: z.number(), + mask: z.string().optional(), +}); + +export const monitorDiffsSchema = z.object({ + markdown: z.array(textChangeSchema).optional(), + html: z.array(textChangeSchema).optional(), + json: z.array(jsonChangeSchema).optional(), + screenshot: imageChangeSchema.optional(), + links: setChangeSchema.optional(), + images: setChangeSchema.optional(), + summary: z.array(textChangeSchema).optional(), + branding: z.array(jsonChangeSchema).optional(), +}); + +export const webhookStatusSchema = z.object({ + sentAt: z.iso.datetime(), + statusCode: z.number().int().nullable(), + error: z.string().optional(), +}); + +export const monitorResultSchema = z.object({ + changed: z.boolean(), + diffs: monitorDiffsSchema, + refs: z.record(z.string(), z.string()), + webhookStatus: webhookStatusSchema.optional(), +}); + +export const monitorResponseSchema = z.object({ + cronId: z.string(), + scheduleId: z.string(), + interval: z.string(), + status: z.enum(["active", "paused"]), + config: monitorCreateSchema, + createdAt: z.iso.datetime(), + updatedAt: z.iso.datetime(), +}); + +export const monitorTickEntrySchema = z.object({ + id: z.string(), + status: z.enum(["completed", "failed", "paused", "running"]), + createdAt: z.iso.datetime(), + elapsedMs: z.number(), + changed: z.boolean(), + diffs: monitorDiffsSchema, + error: z.string().optional(), +}); + +export const monitorActivityResponseSchema = z.object({ + ticks: z.array(monitorTickEntrySchema), + nextCursor: z.string().nullable(), +}); + +// ─── crawl schemas ───────────────────────────────────────────────────────── + +export const crawlStatusSchema = z.enum(["running", "completed", "failed", "paused", "deleted"]); + +export const crawlPageStatusSchema = z.enum(["completed", "failed", "skipped"]); + export const crawlRequestSchema = z.object({ url: urlSchema, formats: z - .array(formatConfigSchema) - .min(1) + .array(scrapeFormatEntrySchema) + .min(1, { message: "Select at least one format" }) .refine((formats) => new Set(formats.map((f) => f.type)).size === formats.length, { message: "duplicate format types not allowed", }) @@ -236,8 +505,387 @@ export const crawlRequestSchema = z.object({ maxPages: z.coerce.number().int().min(1).max(1000).default(50), maxLinksPerPage: z.coerce.number().int().min(1).default(10), allowExternal: z.boolean().default(false), - includePatterns: z.array(z.string()).optional(), - excludePatterns: z.array(z.string()).optional(), + includePatterns: z + .array(z.string()) + .optional() + .describe( + 'Glob-style URL patterns to include. Use "*/" for first-level paths and "**//**" for nested paths.', + ), + excludePatterns: z + .array(z.string()) + .optional() + .describe( + 'Glob-style URL patterns to exclude. Use "*/" for first-level paths and "**//**" for nested paths.', + ), contentTypes: z.array(fetchContentTypeSchema).optional(), fetchConfig: fetchConfigSchema.optional(), }); + +export const crawlPagesQuerySchema = z.object({ + cursor: z.coerce.number().int().min(0).default(0), + limit: z.coerce.number().int().min(1).max(100).default(50), +}); + +// ─── scrape response schemas ──────────────────────────────────────────────── + +export const fetchWarningSchema = z.object({ + reason: z.enum(["too_short", "empty", "bot_blocked", "spa_shell", "soft_404"]), + provider: z.string().optional(), +}); + +export const contentPageMetadataSchema = z.object({ + index: z.number().int(), + images: z.array( + z.object({ + id: z.string(), + topLeftX: z.number(), + topLeftY: z.number(), + bottomRightX: z.number(), + bottomRightY: z.number(), + }), + ), + tables: z.array(z.object({ id: z.string(), content: z.string(), format: z.string() })), + hyperlinks: z.array(z.string()), + dimensions: z.object({ dpi: z.number(), height: z.number(), width: z.number() }), +}); + +export const scrapeMetadataSchema = z.object({ + provider: z.string().optional(), + contentType: z.string(), + elapsedMs: z.number().optional(), + warnings: z.array(fetchWarningSchema).optional(), + ocr: z + .object({ + model: z.string(), + pagesProcessed: z.number().int(), + pages: z.array(contentPageMetadataSchema), + }) + .optional(), +}); + +export const brandingColorsSchema = z.object({ + primary: z.string(), + accent: z.string(), + background: z.string(), + textPrimary: z.string(), + link: z.string(), +}); + +export const brandingFontEntrySchema = z.object({ + family: z.string(), + role: z.enum(["heading", "body"]), +}); + +export const brandingTypographySchema = z.object({ + fontFamilies: z.object({ primary: z.string(), heading: z.string() }), + fontStacks: z.object({ + heading: z.array(z.string()), + body: z.array(z.string()), + paragraph: z.array(z.string()), + }), + fontSizes: z.record(z.string(), z.string()), +}); + +export const brandingSpacingSchema = z.object({ + baseUnit: z.number(), + borderRadius: z.string(), +}); + +export const brandingInputComponentSchema = z.object({ + borderColor: z.string(), + borderRadius: z.string(), +}); + +export const brandingButtonComponentSchema = z.object({ + background: z.string(), + textColor: z.string(), + borderRadius: z.string(), + shadow: z.string(), +}); + +export const brandingComponentsSchema = z.object({ + input: brandingInputComponentSchema, + buttonPrimary: brandingButtonComponentSchema, + buttonSecondary: brandingButtonComponentSchema, +}); + +export const brandingImagesSchema = z.object({ + logo: z.string(), + favicon: z.string(), + ogImage: z.string(), +}); + +export const brandingPersonalitySchema = z.object({ + tone: z.string(), + energy: z.enum(["high", "medium", "low"]), + targetAudience: z.string(), +}); + +export const brandingDesignSystemSchema = z.object({ + framework: z.string().nullable(), + componentLibrary: z.string().nullable(), +}); + +export const brandingButtonPickSchema = z.object({ + index: z.number().int(), + text: z.string(), + reasoning: z.string(), +}); + +export const brandingButtonReasoningSchema = z.object({ + primary: brandingButtonPickSchema, + secondary: brandingButtonPickSchema, + confidence: z.number(), +}); + +export const brandingLogoReasoningSchema = z.object({ + selectedIndex: z.number().int(), + reasoning: z.string(), + confidence: z.number(), +}); + +export const brandingConfidenceSchema = z.object({ + colors: z.number(), + buttons: z.number(), + logo: z.number(), + fonts: z.number(), + components: z.number(), + overall: z.number(), +}); + +export const brandingSchema = z.object({ + colorScheme: z.enum(["light", "dark"]), + fonts: z.array(brandingFontEntrySchema), + colors: brandingColorsSchema, + typography: brandingTypographySchema, + spacing: brandingSpacingSchema, + components: brandingComponentsSchema, + images: brandingImagesSchema, + frameworkHints: z.array(z.string()), + buttonReasoning: brandingButtonReasoningSchema, + logoReasoning: brandingLogoReasoningSchema, + personality: brandingPersonalitySchema, + designSystem: brandingDesignSystemSchema, + confidence: brandingConfidenceSchema, +}); + +export const brandingMetadataSchema = z.object({ + title: z.string(), + description: z.string(), + favicon: z.string(), + language: z.string(), + themeColor: z.string(), + ogTitle: z.string(), + ogDescription: z.string(), + ogImage: z.string(), + ogUrl: z.string(), +}); + +export const scrapeScreenshotDataSchema = z.object({ + url: z.string(), + width: z.number().int(), + height: z.number().int(), +}); + +export const scrapeFormatErrorSchema = z.object({ + code: z.string(), + error: z.string(), +}); + +const emptyObj = z.object({}); + +export const scrapeResultSectionSchemas = { + markdown: z.object({ data: z.array(z.string()), metadata: emptyObj.optional() }), + html: z.object({ data: z.array(z.string()), metadata: emptyObj.optional() }), + links: z.object({ + data: z.array(z.string()), + metadata: z.object({ count: z.number().int() }).optional(), + }), + images: z.object({ + data: z.array(z.string()), + metadata: z.object({ count: z.number().int() }).optional(), + }), + summary: z.object({ + data: z.string(), + metadata: z.object({ chunker: chunkerMetadataSchema.optional() }).optional(), + }), + json: z.object({ + data: z.unknown(), + metadata: z + .object({ + chunker: chunkerMetadataSchema, + raw: z.string().nullable().optional(), + }) + .optional(), + }), + branding: z.object({ + data: brandingSchema, + metadata: z.object({ branding: brandingMetadataSchema }).optional(), + }), + screenshot: z.object({ + data: scrapeScreenshotDataSchema, + metadata: z + .object({ + contentType: z.string(), + provider: z.string().optional(), + }) + .optional(), + }), +} as const; + +export const scrapeResultMapSchema = z.object(scrapeResultSectionSchemas).partial(); + +export const scrapeResponseSchema = z.object({ + results: scrapeResultMapSchema, + metadata: scrapeMetadataSchema, + errors: z.record(scrapeFormatSchema, scrapeFormatErrorSchema).optional(), +}); + +// [NOTE] @Claude legacy cached/historic scrape responses can predate schema changes +// (e.g., the branding pipeline rework). This sanitizer drops `results.*` sections that +// no longer match the current schema so consumers receive a structurally valid response +// instead of crashing on missing fields. Returns the dropped section names for logging. +export function sanitizeScrapeResponse(raw: unknown): { + data: unknown; + dropped: string[]; +} { + const parsed = scrapeResponseSchema.safeParse(raw); + if (parsed.success) return { data: parsed.data, dropped: [] }; + if (!raw || typeof raw !== "object") return { data: raw, dropped: [] }; + + const obj = { ...(raw as Record) }; + const rawResults = obj.results; + if (!rawResults || typeof rawResults !== "object") return { data: raw, dropped: [] }; + + const cleanResults: Record = {}; + const dropped: string[] = []; + for (const [key, sectionSchema] of Object.entries(scrapeResultSectionSchemas)) { + const value = (rawResults as Record)[key]; + if (value === undefined) continue; + const check = sectionSchema.safeParse(value); + if (check.success) cleanResults[key] = check.data; + else dropped.push(key); + } + obj.results = cleanResults; + + const reparsed = scrapeResponseSchema.safeParse(obj); + return reparsed.success ? { data: reparsed.data, dropped } : { data: obj, dropped }; +} + +// ─── crawl response schemas ───────────────────────────────────────────────── + +export const crawlPageSchema = z.object({ + url: z.string(), + status: crawlPageStatusSchema, + depth: z.number().int(), + parentUrl: z.string().nullable(), + links: z.array(z.string()), + scrapeRefId: z.string(), + title: z.string(), + contentType: z.string(), + screenshotUrl: z.string().optional(), + reason: z.string().optional(), + error: z.string().optional(), + scrape: scrapeResponseSchema.optional(), +}); + +export const crawlResultSchema = z.object({ + status: crawlStatusSchema, + reason: z.string().optional(), + total: z.number().int(), + finished: z.number().int(), + pages: z.array(crawlPageSchema), +}); + +export const crawlResponseSchema = crawlResultSchema.extend({ + id: z.string(), +}); + +export const crawlPagesResponseSchema = cursorPageResponseSchema(crawlPageSchema); + +// ─── job payload schemas (internal endpoints) ─────────────────────────────── + +export const crawlJobPayloadSchema = z.object({ + crawlId: z.string(), + urls: z.array(z.string()), + depth: z.number().int(), + parentUrl: z.string().nullable(), + config: crawlRequestSchema, + userId: z.string(), + keyId: z.string().nullable(), +}); + +export const monitorJobPayloadSchema = z.object({ + cronId: z.string(), + prevId: z.string().nullable(), + userId: z.string(), + keyId: z.string().nullable(), + config: monitorCreateSchema, +}); + +// ─── history entry schemas (discriminated union by service) ───────────────── + +export const scrapeHistoryEntrySchema = z.object({ + ...historyBase, + service: z.literal("scrape"), + params: scrapeRequestSchema, + result: scrapeResponseSchema, +}); + +export const extractHistoryEntrySchema = z.object({ + ...historyBase, + service: z.literal("extract"), + params: extractRequestBaseSchema, + result: extractResponseSchema, +}); + +export const searchHistoryEntrySchema = z.object({ + ...historyBase, + service: z.literal("search"), + params: searchRequestSchema, + result: searchResponseSchema, +}); + +export const monitorHistoryEntrySchema = z.object({ + ...historyBase, + service: z.literal("monitor"), + params: z.object({ cronId: z.string(), url: z.string() }), + result: monitorResultSchema, +}); + +export const crawlHistoryEntrySchema = z.object({ + ...historyBase, + service: z.literal("crawl"), + params: z.object({ url: z.string(), maxPages: z.number().int() }), + result: crawlResultSchema, +}); + +export const historyEntrySchema = z.discriminatedUnion("service", [ + scrapeHistoryEntrySchema, + extractHistoryEntrySchema, + searchHistoryEntrySchema, + monitorHistoryEntrySchema, + crawlHistoryEntrySchema, +]); + +export const historyPageSchema = pageResponseSchema(historyEntrySchema); + +// [NOTE] @Claude runtime history route returns raw DB rows whose JSONB `params`/`result` columns +// cannot be narrowed at the edge. This loose schema documents the real wire shape (and tolerates +// a "processing" status for entries still buffered in Redis). SDK consumers that need strong +// per-service typing should parse against `historyEntrySchema` themselves. +export const historyRuntimeEntrySchema = z + .object({ + id: z.string(), + service: z.string(), + status: z.enum([...historyStatusSchema.options, "processing"]), + error: z.any().optional(), + elapsedMs: z.number().nullable().optional(), + createdAt: z.string().optional(), + requestParentId: z.string().nullable().optional(), + params: z.any().optional(), + result: z.any().optional(), + }) + .loose(); + +export const historyRuntimePageSchema = pageResponseSchema(historyRuntimeEntrySchema); diff --git a/src/scrapegraphai.ts b/src/scrapegraphai.ts index b022e98..440ed1f 100644 --- a/src/scrapegraphai.ts +++ b/src/scrapegraphai.ts @@ -1,16 +1,18 @@ import { env } from "./env.js"; import type { ApiResult, + CrawlPagesQuery, + CrawlPagesResponse, CrawlRequest, CrawlResponse, CreditsResponse, - ExtractRequest, + ExtractRequestBase, ExtractResponse, HealthResponse, HistoryEntry, HistoryFilter, HistoryPage, - MonitorActivityRequest, + MonitorActivityQuery, MonitorActivityResponse, MonitorCreateRequest, MonitorResponse, @@ -122,7 +124,7 @@ export async function scrape( export async function extract( apiKey: string, - params: ExtractRequest, + params: ExtractRequestBase, ): Promise> { try { const { data, elapsedMs } = await request("POST", "/extract", apiKey, params); @@ -207,6 +209,24 @@ export const crawl = { } }, + async pages( + apiKey: string, + id: string, + params?: Partial, + ): Promise> { + try { + const qs = new URLSearchParams(); + if (params?.cursor !== undefined) qs.set("cursor", String(params.cursor)); + if (params?.limit !== undefined) qs.set("limit", String(params.limit)); + const query = qs.toString(); + const path = query ? `/crawl/${id}/pages?${query}` : `/crawl/${id}/pages`; + const { data, elapsedMs } = await request("GET", path, apiKey); + return ok(data, elapsedMs); + } catch (err) { + return fail(err); + } + }, + async stop(apiKey: string, id: string): Promise> { try { const { data, elapsedMs } = await request<{ ok: boolean }>( @@ -336,7 +356,7 @@ export const monitor = { async activity( apiKey: string, id: string, - params?: MonitorActivityRequest, + params?: MonitorActivityQuery, ): Promise> { try { const qs = new URLSearchParams(); @@ -366,7 +386,7 @@ export function ScrapeGraphAI(opts?: ScrapeGraphAIInput) { const key = resolveApiKey(opts); return { scrape: (params: ScrapeRequest) => scrape(key, params), - extract: (params: ExtractRequest) => extract(key, params), + extract: (params: ExtractRequestBase) => extract(key, params), search: (params: SearchRequest) => search(key, params), credits: () => getCredits(key), healthy: () => checkHealth(key), @@ -377,6 +397,7 @@ export function ScrapeGraphAI(opts?: ScrapeGraphAIInput) { crawl: { start: (params: CrawlRequest) => crawl.start(key, params), get: (id: string) => crawl.get(key, id), + pages: (id: string, params?: Partial) => crawl.pages(key, id, params), stop: (id: string) => crawl.stop(key, id), resume: (id: string) => crawl.resume(key, id), delete: (id: string) => crawl.delete(key, id), @@ -389,7 +410,7 @@ export function ScrapeGraphAI(opts?: ScrapeGraphAIInput) { delete: (id: string) => monitor.delete(key, id), pause: (id: string) => monitor.pause(key, id), resume: (id: string) => monitor.resume(key, id), - activity: (id: string, params?: MonitorActivityRequest) => monitor.activity(key, id, params), + activity: (id: string, params?: MonitorActivityQuery) => monitor.activity(key, id, params), }, }; } diff --git a/src/types.ts b/src/types.ts index 5f9111e..d2fd8ea 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,419 +1,416 @@ -import type { z } from "zod"; +import type { z } from "zod/v4"; import type { + brandingButtonComponentSchema, + brandingButtonReasoningSchema, + brandingColorsSchema, + brandingComponentsSchema, + brandingConfidenceSchema, + brandingDesignSystemSchema, + brandingFontEntrySchema, + brandingImagesSchema, + brandingInputComponentSchema, + brandingLogoReasoningSchema, + brandingMetadataSchema, + brandingPersonalitySchema, + brandingSchema, + brandingSpacingSchema, + brandingTypographySchema, + chunkerMetadataSchema, + contentPageMetadataSchema, + crawlHistoryEntrySchema, + crawlJobPayloadSchema, + crawlPageSchema, + crawlPageStatusSchema, + crawlPagesQuerySchema, + crawlPagesResponseSchema, crawlRequestSchema, - extractRequestSchema, + crawlResponseSchema, + crawlResultSchema, + crawlStatusSchema, + creditsResponseSchema, + cursorPaginationInfoSchema, + extractHistoryEntrySchema, + extractRequestBaseSchema, + extractResponseSchema, fetchConfigSchema, fetchContentTypeSchema, fetchModeSchema, - formatConfigSchema, + fetchWarningSchema, + healthResponseSchema, + historyEntrySchema, historyFilterSchema, - htmlModeSchema, - mockConfigSchema, - monitorActivityRequestSchema, - monitorCreateRequestSchema, - monitorUpdateRequestSchema, + historyPageSchema, + historyStatusSchema, + htmlConfigSchema, + imageChangeSchema, + jobsStatusSchema, + jsonChangeSchema, + llmConfigSchema, + markdownConfigSchema, + monitorActivityQuerySchema, + monitorActivityResponseSchema, + monitorCreateSchema, + monitorDiffsSchema, + monitorHistoryEntrySchema, + monitorJobPayloadSchema, + monitorResponseSchema, + monitorResultSchema, + monitorTickEntrySchema, + monitorUpdateSchema, + paginationInfoSchema, + scrapeBrandingFormatSchema, + scrapeCaptureFormatSchema, + scrapeContentFormatSchema, + scrapeFormatEntrySchema, + scrapeFormatErrorSchema, + scrapeFormatSchema, + scrapeHistoryEntrySchema, + scrapeHtmlFormatSchema, + scrapeImagesFormatSchema, + scrapeJsonFormatSchema, + scrapeLinksFormatSchema, + scrapeMarkdownFormatSchema, + scrapeMetadataSchema, scrapeRequestSchema, + scrapeResponseSchema, + scrapeResultMapSchema, + scrapeScreenshotDataSchema, + scrapeScreenshotFormatSchema, + scrapeSummaryConfigSchema, + scrapeSummaryFormatSchema, + searchHistoryEntrySchema, + searchMetadataSchema, searchRequestSchema, - timeRangeSchema, + searchResponseSchema, + searchResultSchema, + setChangeSchema, + textChangeSchema, + tokenUsageSchema, + validateResponseSchema, + webhookStatusSchema, } from "./schemas.js"; -export type Service = "scrape" | "extract" | "search" | "monitor" | "crawl"; -export type HtmlMode = z.infer; -export type FetchMode = z.infer; -export type TimeRange = z.infer; -export type CrawlStatus = "running" | "completed" | "failed" | "paused" | "deleted"; -export type CrawlPageStatus = "completed" | "failed" | "skipped"; -export type HistoryStatus = "completed" | "failed" | "running" | "paused" | "deleted"; -export type MonitorTickStatus = "completed" | "failed" | "paused" | "running"; -export type FetchContentType = z.infer; - -export type MockConfig = z.input; -export type FetchConfig = z.input; - -export type MarkdownFormatConfig = z.input & { type: "markdown" }; -export type HtmlFormatConfig = z.input & { type: "html" }; -export type ScreenshotFormatConfig = z.input & { type: "screenshot" }; -export type JsonFormatConfig = z.input & { type: "json" }; -export type LinksFormatConfig = z.input & { type: "links" }; -export type ImagesFormatConfig = z.input & { type: "images" }; -export type SummaryFormatConfig = z.input & { type: "summary" }; -export type BrandingFormatConfig = z.input & { type: "branding" }; -export type FormatConfig = z.input; - -export type FormatType = - | "markdown" - | "html" - | "links" - | "images" - | "summary" - | "json" - | "branding" - | "screenshot"; - -export type ScrapeRequest = z.input; -export type ExtractRequest = z.input; -export type SearchRequest = z.input; -export type CrawlRequest = z.input; -export type MonitorCreateRequest = z.input; -export type MonitorUpdateRequest = z.input; -export type MonitorActivityRequest = z.input; -export type HistoryFilter = z.input; - -export interface TokenUsage { - promptTokens: number; - completionTokens: number; -} - -export interface ChunkerMetadata { - chunks: { size: number }[]; -} - -export type FetchWarningReason = "too_short" | "empty" | "bot_blocked" | "spa_shell" | "soft_404"; - -export interface FetchWarning { - reason: FetchWarningReason; - provider?: string; -} - -export interface ContentPageMetadata { - index: number; - images: Array<{ - id: string; - topLeftX: number; - topLeftY: number; - bottomRightX: number; - bottomRightY: number; - }>; - tables: Array<{ id: string; content: string; format: string }>; - hyperlinks: string[]; - dimensions: { dpi: number; height: number; width: number }; -} +// ─── generic / config ──────────────────────────────────────────────────────── -export interface ScrapeMetadata { - provider?: string; - contentType: string; - elapsedMs?: number; - warnings?: FetchWarning[]; - ocr?: { - model: string; - pagesProcessed: number; - pages: ContentPageMetadata[]; - }; -} +export type { ModelName } from "./models.js"; +export type UserRole = "user" | "admin"; -export interface BrandingColors { - primary: string; - accent: string; - background: string; - textPrimary: string; - link: string; -} - -export interface BrandingFontEntry { - family: string; - fallback: string; -} +export type ErrorType = + | "auth_missing_key" + | "internal" + | "monitor_tick_failed" + | "not_found" + | "upstream_failed" + | "validation"; -export interface BrandingTypography { - primary: BrandingFontEntry; - heading: BrandingFontEntry; - mono: BrandingFontEntry; - sizes: { h1: string; h2: string; body: string }; +export interface Error { + type: ErrorType; + message: string; + details?: unknown; } -export interface BrandingImages { - logo: string; - favicon: string; - ogImage: string; +export interface ErrorResponse { + error: Error; } -export interface BrandingPersonality { - tone: string; - energy: "high" | "medium" | "low"; - targetAudience: string; -} +export type RateLimitKind = "work" | "poll"; -export interface Branding { - colorScheme: "light" | "dark"; - colors: BrandingColors; - typography: BrandingTypography; - images: BrandingImages; - spacing: { baseUnit: number; borderRadius: string }; - frameworkHints: string[]; - personality: BrandingPersonality; - confidence: number; +export interface RateLimitConfig { + work: number; + poll: number; } -export interface BrandingMetadata { - title: string; - description: string; - favicon: string; - language: string; - themeColor: string; - ogTitle: string; - ogDescription: string; - ogImage: string; - ogUrl: string; +export interface ServiceConfig { + rateLimit: RateLimitConfig; + maxJobs?: number; } -export interface ScreenshotData { - url: string; - width: number; - height: number; -} +export type ServicesConfig = Record; -export interface FormatError { - code: string; - error: string; -} +export type TokenUsage = z.infer; +export type ChunkerMetadata = z.infer; +export type ValidateResponse = z.infer; +export type HealthResponse = z.infer; -export interface FormatResponseMap { - markdown: string[]; - html: string[]; - links: string[]; - images: string[]; - summary: string; - json: Record; - branding: Branding; - screenshot: ScreenshotData; -} +// ─── scrape ────────────────────────────────────────────────────────────────── +export type ScrapeRequest = z.infer; +export type FetchConfig = z.infer; +export type FetchMode = z.infer; +export type FetchContentType = z.infer; export type ImageContentType = Extract; -export interface FormatMetadataMap { - markdown: Record; - html: Record; - links: { count: number }; - images: { count: number }; - summary: { chunker?: ChunkerMetadata }; - json: { chunker: ChunkerMetadata; raw?: string | null }; - branding: { branding: BrandingMetadata }; - screenshot: { contentType: ImageContentType; provider?: string }; -} - -export type ScrapeResultMap = Partial<{ - [K in FormatType]: { - data: FormatResponseMap[K]; - metadata?: FormatMetadataMap[K]; - }; -}>; - -export interface ScrapeResponse { - results: ScrapeResultMap; - metadata: ScrapeMetadata; - errors?: Partial<{ [K in FormatType]: FormatError }>; -} - -export interface ExtractResponse { - raw: string | null; - json: Record | null; - usage: TokenUsage; - metadata: { - chunker: ChunkerMetadata; - fetch?: { provider?: string }; - }; -} - -export interface SearchResult { - url: string; - title: string; - content: string; - provider?: string; -} - -export interface SearchMetadata { - search: { provider?: string }; - pages: { requested: number; scraped: number }; - chunker?: ChunkerMetadata; -} - -export interface SearchResponse { - results: SearchResult[]; - json?: Record | null; - raw?: string | null; - usage?: TokenUsage; - metadata: SearchMetadata; -} - -export interface CrawlPage { - url: string; - status: CrawlPageStatus; - depth: number; - parentUrl: string | null; - links: string[]; - scrapeRefId: string; - title: string; - contentType: string; - screenshotUrl?: string; - reason?: string; - error?: string; -} - -export interface CrawlResult { - status: CrawlStatus; - reason?: string; - total: number; - finished: number; - pages: CrawlPage[]; -} - -export interface CrawlResponse extends CrawlResult { - id: string; -} - -export interface TextChange { - type: "added" | "removed"; - line: number; - content: string; -} - -export interface JsonChange { - path: string; - old: unknown; - new: unknown; -} - -export interface SetChange { - added: string[]; - removed: string[]; -} - -export interface ImageChange { - size: number; - changed: number; - mask?: string; -} - -export interface MonitorDiffs { - markdown?: TextChange[]; - html?: TextChange[]; - json?: JsonChange[]; - screenshot?: ImageChange; - links?: SetChange; - images?: SetChange; - summary?: TextChange[]; - branding?: JsonChange[]; -} - -export type MonitorRefs = Partial>; - -export interface WebhookStatus { - sentAt: string; - statusCode: number | null; - error?: string; -} - -export interface MonitorResult { - changed: boolean; - diffs: MonitorDiffs; - refs: MonitorRefs; - webhookStatus?: WebhookStatus; -} - -export interface MonitorResponse { - cronId: string; - scheduleId: string; - interval: string; - status: "active" | "paused"; - config: MonitorCreateRequest; - createdAt: string; - updatedAt: string; -} +export type ContentPageMetadata = z.infer; +export type FetchWarning = z.infer; +export type ScrapeMetadata = z.infer; + +export type ScrapeContentFormat = z.infer; +export type ScrapeCaptureFormat = z.infer; +export type ScrapeFormat = z.infer; +export type ScrapeMarkdownConfig = z.infer; +export type ScrapeHtmlConfig = z.infer; +export type ScrapeSummaryConfig = z.infer; +export type ScrapeFormatEntry = z.infer; +export type ScrapeMarkdownFormatEntry = z.infer; +export type ScrapeHtmlFormatEntry = z.infer; +export type ScrapeLinksFormatEntry = z.infer; +export type ScrapeImagesFormatEntry = z.infer; +export type ScrapeSummaryFormatEntry = z.infer; +export type ScrapeJsonFormatEntry = z.infer; +export type ScrapeBrandingFormatEntry = z.infer; +export type ScrapeScreenshotFormatEntry = z.infer; +export type ScrapeContentFormatEntry = Extract; + +export type BrandingColors = z.infer; +export type BrandingFontEntry = z.infer; +export type BrandingTypography = z.infer; +export type BrandingSpacing = z.infer; +export type BrandingInputComponent = z.infer; +export type BrandingButtonComponent = z.infer; +export type BrandingComponents = z.infer; +export type BrandingImages = z.infer; +export type BrandingPersonality = z.infer; +export type BrandingDesignSystem = z.infer; +export type BrandingButtonReasoning = z.infer; +export type BrandingLogoReasoning = z.infer; +export type BrandingConfidence = z.infer; +export type Branding = z.infer; +export type BrandingMetadata = z.infer; +export type ScrapeFormatError = z.infer; +export type ScrapeScreenshotData = z.infer; +export type ScrapeResultMap = z.infer; + +export type ScrapeFormatResponseMap = { + [K in keyof Required]: NonNullable["data"]; +}; + +export type ScrapeFormatMetadataMap = { + [K in keyof Required]: NonNullable["metadata"]>; +}; + +export type ScrapeResponse = z.infer; + +export type ScrapeEvent = + | { type: "scrape.fetch.started"; url: string } + | { type: "scrape.fetch.completed"; url: string; elapsedMs: number } + | { type: "scrape.process.started"; format: ScrapeFormat } + | { type: "scrape.process.completed"; format: ScrapeFormat; elapsedMs: number } + | { type: "scrape.process.failed"; format: ScrapeFormat; error: string; code: string } + | { type: "scrape.result"; data: ScrapeResponse } + | { type: "scrape.failed"; error: string; code: string } + | { type: "scrape.completed" }; + +// ─── extract ───────────────────────────────────────────────────────────────── + +export type ExtractRequestBase = z.infer; +export type LlmConfig = z.infer; + +export type ExtractResponse = z.infer; + +export type ExtractEvent = + | { type: "extract.fetch.started"; url: string } + | { type: "extract.fetch.completed"; url: string; elapsedMs: number } + | { type: "extract.extraction.started" } + | { type: "extract.extraction.completed"; elapsedMs: number } + | { type: "extract.failed"; error: string } + | { type: "extract.completed" }; + +// ─── search ────────────────────────────────────────────────────────────────── + +export type SearchRequest = z.infer; + +export type SearchResult = z.infer; +export type SearchMetadata = z.infer; +export type SearchResponse = z.infer; + +export type SearchEvent = + | { type: "search.query.started" } + | { + type: "search.query.completed"; + query: string; + prompt: string; + urls: string[]; + totalResults: number; + } + | { type: "search.scrape.started"; url: string; requestId: string } + | { type: "search.scrape.completed"; url: string; requestId: string; data: unknown } + | { type: "search.scrape.failed"; url: string; requestId: string; error: string } + | { type: "search.scrape.done"; total: number; scraped: number } + | { type: "search.merge.started" } + | { type: "search.failed"; error: string } + | { type: "search.completed" }; + +// ─── monitor ───────────────────────────────────────────────────────────────── + +export type MonitorCreateRequest = z.infer; +export type MonitorUpdateRequest = z.infer; +export type MonitorActivityQuery = z.input; + +export type TextChange = z.infer; +export type JsonChange = z.infer; +export type SetChange = z.infer; +export type ImageChange = z.infer; +export type MonitorDiffs = z.infer; +export type MonitorRefs = Partial>; +export type WebhookStatus = z.infer; +export type MonitorResult = z.infer; + +export function countMonitorDiffs(diffs?: Partial): number { + let count = 0; + if (diffs?.markdown) count += diffs.markdown.length; + if (diffs?.html) count += diffs.html.length; + if (diffs?.json) count += diffs.json.length; + if (diffs?.summary) count += diffs.summary.length; + if (diffs?.branding) count += diffs.branding.length; + if (diffs?.links) count += diffs.links.added.length + diffs.links.removed.length; + if (diffs?.images) count += diffs.images.added.length + diffs.images.removed.length; + if (diffs?.screenshot?.changed) count += 1; + return count; +} + +export type MonitorJobPayload = z.infer; + +export type MonitorResponse = z.infer; +export type MonitorTickEntry = z.infer; +export type MonitorTickStatus = MonitorTickEntry["status"]; +export type MonitorActivityResponse = z.infer; + +export type WebhookPayload = + | { + type: "monitor.change.detected"; + data: { + cronId: string; + url: string; + changedAt: string; + changed: boolean; + current: ScrapeResultMap; + previous: ScrapeResultMap | null; + diffs: MonitorDiffs; + }; + } + | { + type: "monitor.test"; + data: { + cronId: string; + url: string; + sentAt: string; + }; + }; + +export type MonitorEvent = + | { type: "monitor.tick.started"; cronId: string; url: string } + | { type: "monitor.tick.completed"; cronId: string; changed: boolean } + | { + type: "monitor.change.detected"; + cronId: string; + url: string; + diffs: MonitorDiffs; + } + | { type: "monitor.tick.failed"; cronId: string; url: string; error: string } + | { type: "monitor.paused"; cronId: string; reason: string } + | { type: "monitor.webhook.completed"; cronId: string; statusCode: number } + | { type: "monitor.webhook.failed"; cronId: string; error: string }; + +// ─── crawl ─────────────────────────────────────────────────────────────────── + +export type CrawlRequest = z.infer; +export type CrawlStatus = z.infer; +export type CrawlPageStatus = z.infer; + +export type CrawlPage = z.infer; +export type CrawlPagesQuery = z.infer; +export type CrawlPagesResponse = z.infer; +export type CrawlResponse = z.infer; + +export type CrawlJobPayload = z.infer; + +export type CrawlEvent = + | { type: "crawl.started"; crawlId: string; url: string } + | { type: "crawl.page.completed"; crawlId: string; page: CrawlPage } + | { type: "crawl.page.skipped"; crawlId: string; page: CrawlPage; reason: string } + | { + type: "crawl.page.failed"; + crawlId: string; + page: CrawlPage; + error: string; + } + | { type: "crawl.progress"; crawlId: string; total: number; finished: number } + | { type: "crawl.paused"; crawlId: string; reason: string } + | { type: "crawl.resumed"; crawlId: string } + | { type: "crawl.completed"; crawlId: string }; + +export type Event = ScrapeEvent | ExtractEvent | SearchEvent | MonitorEvent | CrawlEvent; + +export type EventType = Event["type"]; + +export type EventData = Extract; + +// ─── history ───────────────────────────────────────────────────────────────── + +export type HistoryFilter = z.infer; +export type HistoryService = "scrape" | "extract" | "search" | "monitor" | "crawl"; +export type HistoryStatus = z.infer; + +export type ScrapeHistoryEntry = z.infer; +export type ExtractHistoryEntry = z.infer; +export type SearchHistoryEntry = z.infer; +export type MonitorHistoryEntry = z.infer; +export type CrawlHistoryEntry = z.infer; +export type CrawlResult = z.infer; +export type HistoryEntry = z.infer; -export interface MonitorTickEntry { - id: string; - status: MonitorTickStatus; - createdAt: string; - elapsedMs: number; - changed: boolean; - diffs: MonitorDiffs; - error?: string; -} - -export interface MonitorActivityResponse { - ticks: MonitorTickEntry[]; - nextCursor: string | null; -} - -interface HistoryBase { - id: string; - status: HistoryStatus; - error: unknown; - elapsedMs: number; - createdAt: string; - requestParentId: string | null; -} - -export interface ScrapeHistoryEntry extends HistoryBase { - service: "scrape"; - params: ScrapeRequest; - result: ScrapeResponse; -} - -export interface ExtractHistoryEntry extends HistoryBase { - service: "extract"; - params: ExtractRequest; - result: ExtractResponse; -} - -export interface SearchHistoryEntry extends HistoryBase { - service: "search"; - params: SearchRequest; - result: SearchResponse; +export interface PageResponse { + data: T[]; + pagination: z.infer; } -export interface MonitorHistoryEntry extends HistoryBase { - service: "monitor"; - params: { cronId: string; url: string }; - result: MonitorResult; +export interface CursorPageResponse { + data: T[]; + pagination: z.infer; } -export interface CrawlHistoryEntry extends HistoryBase { - service: "crawl"; - params: { url: string; maxPages: number }; - result: CrawlResult; -} +export type HistoryPage = z.infer; -export type HistoryEntry = - | ScrapeHistoryEntry - | ExtractHistoryEntry - | SearchHistoryEntry - | MonitorHistoryEntry - | CrawlHistoryEntry; - -export interface HistoryPagination { - page: number; - limit: number; - total: number; -} +// ─── credits ───────────────────────────────────────────────────────────────── -export interface PageResponse { - data: T[]; - pagination: HistoryPagination; -} +export type JobsStatus = z.infer; +export type CreditsResponse = z.infer; -export type HistoryPage = PageResponse; +// ─── credit ledger ────────────────────────────────────────────────────────── -export interface JobsStatus { - used: number; - limit: number; +// [NOTE] @Claude single-letter keys to minimise Redis memory per entry — flushed to DB with full names +export interface CreditLedgerEntry { + i: string; + k: string; + a: number; + s: string; + t: number; + r?: string; + ak?: string; } -export interface CreditsJobs { - crawl: JobsStatus; - monitor: JobsStatus; +export interface TopUpInProcessEntry { + i: string; + k: string; + t: number; } +// ─── legacy migration ─────────────────────────────────────────────────────── -export interface CreditsResponse { - remaining: number; - used: number; - plan: string; - jobs: CreditsJobs; +export interface LegacyOnboarding { + jobRole: string; + company: string | null; + companySize: string; + primaryUseCase: string; + source: string; } -export interface HealthResponse { - status: "ok" | "degraded"; - uptime: number; +export interface LegacyUserData { + oldUserId: string; + stripeCustomerId: string | null; + stripeSubscriptionId: string | null; + planId: string; + remainingCredits: number; + onboarding: LegacyOnboarding; } export interface ApiResult { diff --git a/src/url.ts b/src/url.ts new file mode 100644 index 0000000..e3cf816 --- /dev/null +++ b/src/url.ts @@ -0,0 +1,46 @@ +const IPV4_RE = /^(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)$/; +const IPV6_RE = + /^(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$|^::$|^(?:[0-9a-fA-F]{1,4}:){1,7}:$|^:(?::[0-9a-fA-F]{1,4}){1,7}$|^(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}$|^(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}$|^(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}$|^(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}$|^(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}$|^[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}$|^::(?:ffff:)?(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)$/; + +function isIPv4(s: string): boolean { + return IPV4_RE.test(s); +} + +function isIPv6(s: string): boolean { + return IPV6_RE.test(s); +} + +const PRIVATE_HOSTNAME_PATTERNS = [/^localhost$/i, /\.local$/i, /\.internal$/i, /\.localhost$/i]; + +function isPrivateIPv4(ip: string): boolean { + const parts = ip.split(".").map(Number); + if (parts.length !== 4 || parts.some((p) => Number.isNaN(p) || p < 0 || p > 255)) return false; + const [a, b] = parts; + if (a === 127) return true; + if (a === 10) return true; + if (a === 172 && b >= 16 && b <= 31) return true; + if (a === 192 && b === 168) return true; + if (a === 169 && b === 254) return true; + if (a === 0) return true; + return false; +} + +function isPrivateIPv6(ip: string): boolean { + const normalized = ip.replace(/^\[|]$/g, "").toLowerCase(); + if (normalized === "::1") return true; + if (normalized === "::") return true; + if (normalized.startsWith("fe80:")) return true; + if (normalized.startsWith("fc") || normalized.startsWith("fd")) return true; + if (normalized.startsWith("::ffff:")) { + const v4 = normalized.slice(7); + if (isIPv4(v4)) return isPrivateIPv4(v4); + } + return false; +} + +export function isInternal(hostname: string): boolean { + if (PRIVATE_HOSTNAME_PATTERNS.some((r) => r.test(hostname))) return true; + if (isIPv4(hostname)) return isPrivateIPv4(hostname); + if (isIPv6(hostname) || hostname.startsWith("[")) return isPrivateIPv6(hostname); + return false; +} diff --git a/tests/scrapegraphai.test.ts b/tests/scrapegraphai.test.ts index b8496bd..edfe47a 100644 --- a/tests/scrapegraphai.test.ts +++ b/tests/scrapegraphai.test.ts @@ -906,6 +906,35 @@ describe("crawl", () => { expectRequest(0, "GET", "/crawl/crawl-123"); }); + test("pages success", async () => { + const body = { + data: [ + { + url: "https://example.com", + status: "completed", + depth: 0, + parentUrl: null, + links: [], + scrapeRefId: "scrape-123", + title: "Example", + contentType: "text/html", + scrape: { + results: { markdown: { data: ["# Example"] } }, + metadata: { contentType: "text/html" }, + }, + }, + ], + pagination: { limit: 50, nextCursor: null }, + }; + fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body)); + + const res = await sdk.crawl.pages(API_KEY, "crawl-123", { cursor: 0, limit: 50 }); + + expect(res.status).toBe("success"); + expect(res.data).toEqual(body); + expectRequest(0, "GET", "/crawl/crawl-123/pages?cursor=0&limit=50"); + }); + test("stop success", async () => { fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json({ ok: true })); From 95bcae5ab15370ba400a77cd7f973b469bd4c4ac Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Thu, 28 May 2026 10:22:11 +0200 Subject: [PATCH 2/3] added org name in ai-sdk --- packages/ai-sdk/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ai-sdk/package.json b/packages/ai-sdk/package.json index 934a518..fded747 100644 --- a/packages/ai-sdk/package.json +++ b/packages/ai-sdk/package.json @@ -1,5 +1,5 @@ { - "name": "@scrapegraphai/ai-sdk", + "name": "@scrapegraph-ai/ai-sdk", "version": "0.1.0", "description": "Vercel AI SDK tools integration for ScrapeGraphAI.", "type": "module", From 507fb81bfe16653b7bb4e322420e87273760ee3f Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Thu, 28 May 2026 10:27:11 +0200 Subject: [PATCH 3/3] fix: use scrapegraph ai package scope --- packages/ai-sdk/README.md | 20 ++++++++++---------- packages/ai-sdk/examples/crawl-blog.ts | 2 +- packages/ai-sdk/examples/hacker-news.ts | 2 +- packages/ai-sdk/package.json | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/packages/ai-sdk/README.md b/packages/ai-sdk/README.md index 68d749f..76e69c1 100644 --- a/packages/ai-sdk/README.md +++ b/packages/ai-sdk/README.md @@ -1,6 +1,6 @@ # ScrapeGraphAI AI SDK Tools -[![npm version](https://badge.fury.io/js/%40scrapegraphai%2Fai-sdk.svg)](https://www.npmjs.com/package/@scrapegraphai/ai-sdk) +[![npm version](https://badge.fury.io/js/%40scrapegraph-ai%2Fai-sdk.svg)](https://www.npmjs.com/package/@scrapegraph-ai/ai-sdk) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)

@@ -14,9 +14,9 @@ Vercel [AI SDK](https://ai-sdk.dev/docs/introduction) tools for the ScrapeGraphA ## Install ```bash -npm i @scrapegraphai/ai-sdk ai +npm i @scrapegraph-ai/ai-sdk ai # or -bun add @scrapegraphai/ai-sdk ai +bun add @scrapegraph-ai/ai-sdk ai ``` `ai` is a peer dependency. Install the model provider package you use, for example: @@ -44,7 +44,7 @@ Minimal scrape-only setup: ```ts import { openai } from "@ai-sdk/openai"; import { generateText, stepCountIs } from "ai"; -import { scrapeTool } from "@scrapegraphai/ai-sdk"; +import { scrapeTool } from "@scrapegraph-ai/ai-sdk"; const result = await generateText({ model: openai("gpt-5-nano"), @@ -69,7 +69,7 @@ import { monitorTools, scrapeTool, searchTool, -} from "@scrapegraphai/ai-sdk"; +} from "@scrapegraph-ai/ai-sdk"; const result = await generateText({ model: openai("gpt-5-nano"), @@ -102,7 +102,7 @@ const tools = { Scrape a webpage with ScrapeGraphAI. Supports markdown, html, json extraction, links, images, summary, branding, and screenshots. ```ts -import { scrapeTool } from "@scrapegraphai/ai-sdk"; +import { scrapeTool } from "@scrapegraph-ai/ai-sdk"; const tools = { scrape: scrapeTool(), @@ -114,7 +114,7 @@ const tools = { Extract structured JSON from a URL, HTML, or markdown with a natural-language prompt. ```ts -import { extractTool } from "@scrapegraphai/ai-sdk"; +import { extractTool } from "@scrapegraph-ai/ai-sdk"; const tools = { extract: extractTool(), @@ -126,7 +126,7 @@ const tools = { Search the web and optionally extract structured data from search results. ```ts -import { searchTool } from "@scrapegraphai/ai-sdk"; +import { searchTool } from "@scrapegraph-ai/ai-sdk"; const tools = { search: searchTool(), @@ -138,7 +138,7 @@ const tools = { Start, poll, page through, stop, resume, and delete ScrapeGraphAI crawl jobs. ```ts -import { crawlTools } from "@scrapegraphai/ai-sdk"; +import { crawlTools } from "@scrapegraph-ai/ai-sdk"; const tools = { ...crawlTools(), @@ -160,7 +160,7 @@ const tools = { Create, list, update, pause, resume, delete, and fetch activity for ScrapeGraphAI monitors. ```ts -import { monitorTools } from "@scrapegraphai/ai-sdk"; +import { monitorTools } from "@scrapegraph-ai/ai-sdk"; const tools = { ...monitorTools(), diff --git a/packages/ai-sdk/examples/crawl-blog.ts b/packages/ai-sdk/examples/crawl-blog.ts index c6ee200..5642e12 100644 --- a/packages/ai-sdk/examples/crawl-blog.ts +++ b/packages/ai-sdk/examples/crawl-blog.ts @@ -2,7 +2,7 @@ import { openai } from "@ai-sdk/openai"; import { generateText, stepCountIs, type ModelMessage } from "ai"; import { stdin as input, stdout as output } from "node:process"; import { createInterface } from "node:readline/promises"; -import { crawlTools } from "../src/index"; +import { crawlTools } from "@scrapegraph-ai/ai-sdk"; const initialPrompt = "Find 10 https://scrapegraphai.com/ blog posts. Start a crawl, poll its status, fetch crawled pages with getCrawlPages, then summarize what you found."; diff --git a/packages/ai-sdk/examples/hacker-news.ts b/packages/ai-sdk/examples/hacker-news.ts index ac92864..eb87fd8 100644 --- a/packages/ai-sdk/examples/hacker-news.ts +++ b/packages/ai-sdk/examples/hacker-news.ts @@ -1,6 +1,6 @@ import { openai } from "@ai-sdk/openai"; import { generateText, stepCountIs } from "ai"; -import { scrapeTool } from "../src/index"; +import { scrapeTool } from "@scrapegraph-ai/ai-sdk"; const { text } = await generateText({ model: openai("gpt-5-nano"), diff --git a/packages/ai-sdk/package.json b/packages/ai-sdk/package.json index fded747..5e69a9e 100644 --- a/packages/ai-sdk/package.json +++ b/packages/ai-sdk/package.json @@ -1,6 +1,6 @@ { "name": "@scrapegraph-ai/ai-sdk", - "version": "0.1.0", + "version": "0.1.1", "description": "Vercel AI SDK tools integration for ScrapeGraphAI.", "type": "module", "main": "dist/index.js",