diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a303845..039af49 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -13,6 +13,8 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: oven-sh/setup-bun@v2
+ - name: Use local scrapegraph-js package
+   # Literal block scalar: the sed expression contains ": ", which a plain
+   # (unquoted) YAML scalar cannot hold. The pattern matches any version range so
+   # the rewrite survives version bumps; grep fails the step if nothing was rewritten.
+   run: |
+     sed -i -E 's#("scrapegraph-js": )"[^"]*"#\1"file:../.."#' packages/ai-sdk/package.json
+     grep -q '"scrapegraph-js": "file:../.."' packages/ai-sdk/package.json
- run: bun install
- run: bun run test
@@ -22,5 +24,10 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: oven-sh/setup-bun@v2
+ - name: Use local scrapegraph-js package
+   # Literal block scalar: the sed expression contains ": ", which a plain
+   # (unquoted) YAML scalar cannot hold. The pattern matches any version range so
+   # the rewrite survives version bumps; grep fails the step if nothing was rewritten.
+   run: |
+     sed -i -E 's#("scrapegraph-js": )"[^"]*"#\1"file:../.."#' packages/ai-sdk/package.json
+     grep -q '"scrapegraph-js": "file:../.."' packages/ai-sdk/package.json
- run: bun install
+ - run: bun run build
- run: bun run check
+ - run: cd packages/ai-sdk && bun run check
+ - run: cd packages/ai-sdk && bun run build
diff --git a/README.md b/README.md
index bee865e..26140aa 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,16 @@ bun add scrapegraph-js
## Quick Start
+### API key
+
+Log in to the [ScrapeGraphAI dashboard](https://scrapegraphai.com/) to create an API key. The dashboard also shows your request history, usage, credits, and crawl/monitor activity.
+
+Set it in your environment:
+
+```bash
+export SGAI_API_KEY=...
+```
+
```ts
import { ScrapeGraphAI } from "scrapegraph-js";
@@ -140,6 +150,12 @@ const start = await sgai.crawl.start({
// Check status
const status = await sgai.crawl.get(start.data?.id!);
+// Fetch paginated pages with resolved scrape results
+const pages = await sgai.crawl.pages(start.data?.id!, {
+ cursor: 0,
+ limit: 50,
+});
+
// Control
await sgai.crawl.stop(id);
await sgai.crawl.resume(id);
diff --git a/package.json b/package.json
index cd6b353..705e24d 100644
--- a/package.json
+++ b/package.json
@@ -1,16 +1,17 @@
{
"name": "scrapegraph-js",
- "version": "2.1.0",
+ "version": "2.2.0",
"description": "Official JavaScript/TypeScript SDK for the ScrapeGraph AI API — smart web scraping powered by AI",
"type": "module",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"exports": {
".": {
- "import": "./dist/index.js",
- "types": "./dist/index.d.ts"
+ "types": "./dist/index.d.ts",
+ "import": "./dist/index.js"
}
},
+ "workspaces": ["packages/*"],
"scripts": {
"dev": "tsup --watch",
"build": "tsup",
diff --git a/packages/ai-sdk/README.md b/packages/ai-sdk/README.md
new file mode 100644
index 0000000..76e69c1
--- /dev/null
+++ b/packages/ai-sdk/README.md
@@ -0,0 +1,200 @@
+# ScrapeGraphAI AI SDK Tools
+
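+[![npm version](https://img.shields.io/npm/v/@scrapegraph-ai/ai-sdk.svg)](https://www.npmjs.com/package/@scrapegraph-ai/ai-sdk)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)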
+
+
+
+
+
+
+
+Vercel [AI SDK](https://ai-sdk.dev/docs/introduction) tools for the ScrapeGraphAI API.
+
+## Install
+
+```bash
+npm i @scrapegraph-ai/ai-sdk ai
+# or
+bun add @scrapegraph-ai/ai-sdk ai
+```
+
+`ai` is a peer dependency. Install the model provider package you use, for example:
+
+```bash
+npm i @ai-sdk/openai
+# or
+bun add @ai-sdk/openai
+```
+
+## Quick Start
+
+### API key
+
+Log in to the [ScrapeGraphAI dashboard](https://scrapegraphai.com/) to create an API key. The dashboard also shows your request history, usage, credits, and crawl/monitor activity.
+
+Set it in your environment:
+
+```bash
+export SGAI_API_KEY=...
+```
+
+Minimal scrape-only setup:
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { generateText, stepCountIs } from "ai";
+import { scrapeTool } from "@scrapegraph-ai/ai-sdk";
+
+const result = await generateText({
+ model: openai("gpt-5-nano"),
+ prompt: "Find the main headline on https://example.com",
+ tools: {
+ scrape: scrapeTool(),
+ },
+ stopWhen: stepCountIs(5),
+});
+
+console.log(result.text);
+```
+
+Use every ScrapeGraphAI tool group:
+
+```ts
+import { openai } from "@ai-sdk/openai";
+import { generateText, stepCountIs } from "ai";
+import {
+ crawlTools,
+ extractTool,
+ monitorTools,
+ scrapeTool,
+ searchTool,
+} from "@scrapegraph-ai/ai-sdk";
+
+const result = await generateText({
+ model: openai("gpt-5-nano"),
+ prompt: "Search for ScrapeGraphAI docs, scrape the best page, and summarize it.",
+ tools: {
+ scrape: scrapeTool(),
+ extract: extractTool(),
+ search: searchTool(),
+ ...crawlTools(),
+ ...monitorTools(),
+ },
+ stopWhen: stepCountIs(10),
+});
+
+console.log(result.text);
+```
+
+Tools read `SGAI_API_KEY` from the environment by default. You can also pass it explicitly:
+
+```ts
+const tools = {
+ scrape: scrapeTool({ apiKey: process.env.SGAI_API_KEY }),
+};
+```
+
+## Tools
+
+### scrapeTool
+
+Scrape a webpage with ScrapeGraphAI. Supports markdown, html, json extraction, links, images, summary, branding, and screenshots.
+
+```ts
+import { scrapeTool } from "@scrapegraph-ai/ai-sdk";
+
+const tools = {
+ scrape: scrapeTool(),
+};
+```
+
+### extractTool
+
+Extract structured JSON from a URL, HTML, or markdown with a natural-language prompt.
+
+```ts
+import { extractTool } from "@scrapegraph-ai/ai-sdk";
+
+const tools = {
+ extract: extractTool(),
+};
+```
+
+### searchTool
+
+Search the web and optionally extract structured data from search results.
+
+```ts
+import { searchTool } from "@scrapegraph-ai/ai-sdk";
+
+const tools = {
+ search: searchTool(),
+};
+```
+
+### crawlTools
+
+Start, poll, page through, stop, resume, and delete ScrapeGraphAI crawl jobs.
+
+```ts
+import { crawlTools } from "@scrapegraph-ai/ai-sdk";
+
+const tools = {
+ ...crawlTools(),
+};
+```
+
+Crawl page retrieval is paginated. Use `getCrawl` for status, then `getCrawlPages` for pages and resolved scrape results.
+
+```ts
+import { getCrawlPagesTool, getCrawlTool, startCrawlTool } from "@scrapegraph-ai/ai-sdk";
+
+const tools = {
+  startCrawl: startCrawlTool(),
+  getCrawl: getCrawlTool(),
+  getCrawlPages: getCrawlPagesTool(),
+};
+```
+
+### monitorTools
+
+Create, list, update, pause, resume, delete, and fetch activity for ScrapeGraphAI monitors.
+
+```ts
+import { monitorTools } from "@scrapegraph-ai/ai-sdk";
+
+const tools = {
+ ...monitorTools(),
+};
+```
+
+## Examples
+
+| Example | Description |
+|---------|-------------|
+| [`hacker-news.ts`](examples/hacker-news.ts) | Scrape Hacker News with AI SDK tools |
+| [`crawl-blog.ts`](examples/crawl-blog.ts) | Crawl ScrapeGraphAI blog pages, fetch paginated crawl results, and summarize them |
+
+Run an example:
+
+```bash
+OPENAI_API_KEY=... SGAI_API_KEY=... bun examples/crawl-blog.ts
+```
+
+## Environment Variables
+
+| Variable | Description |
+|----------|-------------|
+| `SGAI_API_KEY` | ScrapeGraphAI API key |
+| `OPENAI_API_KEY` | Required by the OpenAI provider examples |
+
+## Development
+
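+The type check resolves `scrapegraph-js` from the repository root's `dist/` output, so build the root package first:
+
+```bash
+# from the repository root
+bun install
+bun run build
+
+# then, from packages/ai-sdk
+bun run check
+bun run build
+```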
+
+## License
+
+MIT - [ScrapeGraphAI](https://scrapegraphai.com)
diff --git a/packages/ai-sdk/examples/crawl-blog.ts b/packages/ai-sdk/examples/crawl-blog.ts
new file mode 100644
index 0000000..5642e12
--- /dev/null
+++ b/packages/ai-sdk/examples/crawl-blog.ts
@@ -0,0 +1,77 @@
+import { openai } from "@ai-sdk/openai";
+import { generateText, stepCountIs, type ModelMessage } from "ai";
+import { stdin as input, stdout as output } from "node:process";
+import { createInterface } from "node:readline/promises";
+import { crawlTools } from "@scrapegraph-ai/ai-sdk";
+
+// First user turn, sent automatically when the script starts.
+const initialPrompt =
+ "Find 10 https://scrapegraphai.com/ blog posts. Start a crawl, poll its status, fetch crawled pages with getCrawlPages, then summarize what you found.";
+// Conversation transcript shared by every turn: run() appends the user prompt and,
+// when generation succeeds, the model's response messages.
+const messages: ModelMessage[] = [];
+// Abort handle for the generation currently in flight; undefined while no run() is active.
+let activeController: AbortController | undefined;
+
+/**
+ * Runs one conversation turn.
+ *
+ * Appends `prompt` to the shared `messages` transcript, calls the model with the crawl
+ * tools enabled (at most 20 steps), logs each step's text, tool calls and tool results,
+ * and on success appends the response messages to the transcript. Errors are logged
+ * rather than rethrown; an abort is reported as "[aborted]".
+ *
+ * @param prompt - The user message for this turn.
+ */
+async function run(prompt: string) {
+  messages.push({ role: "user", content: prompt });
+  const abort = new AbortController();
+  // Publish the controller so the SIGINT handler can cancel this turn.
+  activeController = abort;
+
+  try {
+    const outcome = await generateText({
+      model: openai("gpt-5-nano"),
+      messages,
+      tools: { ...crawlTools() },
+      stopWhen: stepCountIs(20),
+      abortSignal: abort.signal,
+      onStepFinish: (step) => {
+        if (step.text) {
+          console.log(`\n[assistant]\n${step.text}`);
+        }
+
+        step.toolCalls.forEach((call) => {
+          console.log(`\n[tool] ${call.toolName}`);
+          console.log(JSON.stringify(call.input, null, 2));
+        });
+
+        step.toolResults.forEach((produced) => {
+          console.log(`\n[result] ${produced.toolName}`);
+          console.log(JSON.stringify(produced.output, null, 2));
+        });
+      },
+    });
+
+    messages.push(...outcome.response.messages);
+    console.log(`\n${outcome.text}\n`);
+  } catch (error) {
+    if (!abort.signal.aborted) {
+      console.error(error instanceof Error ? error.message : error);
+    } else {
+      console.error("[aborted]");
+    }
+  } finally {
+    // Only clear the slot if it still belongs to this turn.
+    if (activeController === abort) {
+      activeController = undefined;
+    }
+  }
+}
+
+const rl = createInterface({ input, output });
+
+// Ctrl+C: cancel the in-flight generation if there is one, otherwise close the prompt and exit.
+function handleInterrupt() {
+  output.write("\n");
+  if (activeController) {
+    activeController.abort();
+    return;
+  }
+
+  rl.close();
+  process.exit(0);
+}
+
+// When stdin is a TTY, readline reads Ctrl+C as input and emits "SIGINT" on the interface
+// instead of the process receiving the signal, so the handler is registered on both.
+rl.on("SIGINT", handleInterrupt);
+process.on("SIGINT", handleInterrupt);
+
+// Start with the canned prompt, then keep taking follow-up prompts from stdin.
+await run(initialPrompt);
+
+for (;;) {
+  const answer = await rl.question("> ");
+  const prompt = answer.trim();
+
+  // Ignore empty lines and ask again.
+  if (!prompt) {
+    continue;
+  }
+
+  await run(prompt);
+}
diff --git a/packages/ai-sdk/examples/hacker-news.ts b/packages/ai-sdk/examples/hacker-news.ts
new file mode 100644
index 0000000..eb87fd8
--- /dev/null
+++ b/packages/ai-sdk/examples/hacker-news.ts
@@ -0,0 +1,15 @@
+import { openai } from "@ai-sdk/openai";
+import { generateText, stepCountIs } from "ai";
+import { scrapeTool } from "@scrapegraph-ai/ai-sdk";
+
+// Let the model call the scrape tool (at most 3 steps) and print its final answer.
+const summary = await generateText({
+  model: openai("gpt-5-nano"),
+  prompt:
+    "Scrape Hacker News and write a short, concise summary of what people are talking about today.",
+  tools: { scrape: scrapeTool() },
+  stopWhen: stepCountIs(3),
+});
+
+console.log(summary.text);
diff --git a/packages/ai-sdk/package.json b/packages/ai-sdk/package.json
new file mode 100644
index 0000000..5e69a9e
--- /dev/null
+++ b/packages/ai-sdk/package.json
@@ -0,0 +1,43 @@
+{
+ "name": "@scrapegraph-ai/ai-sdk",
+ "version": "0.1.1",
+ "description": "Vercel AI SDK tools integration for ScrapeGraphAI.",
+ "type": "module",
+ "main": "dist/index.js",
+ "types": "dist/index.d.ts",
+ "exports": {
+ ".": {
+ "types": "./dist/index.d.ts",
+ "import": "./dist/index.js"
+ }
+ },
+ "files": ["dist"],
+ "scripts": {
+ "dev": "tsup --watch",
+ "build": "tsup",
+ "check": "tsc --noEmit",
+ "prepublishOnly": "bun run build"
+ },
+ "keywords": ["scrapegraph", "ai-sdk", "tools", "scraping", "extraction"],
+ "author": "ScrapeGraph Team",
+ "license": "MIT",
+ "homepage": "https://scrapegraphai.com",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/ScrapeGraphAI/scrapegraph-sdk.git",
+ "directory": "packages/ai-sdk"
+ },
+ "peerDependencies": {
+ "ai": ">=6"
+ },
+ "dependencies": {
+ "scrapegraph-js": "^2.2.0",
+ "zod": "^4.3.6"
+ },
+ "devDependencies": {
+ "@ai-sdk/openai": "^3.0.65",
+ "ai": "^6.0.191",
+ "tsup": "^8.3.6",
+ "typescript": "^5.8.2"
+ }
+}
diff --git a/packages/ai-sdk/src/index.ts b/packages/ai-sdk/src/index.ts
new file mode 100644
index 0000000..17dd218
--- /dev/null
+++ b/packages/ai-sdk/src/index.ts
@@ -0,0 +1,249 @@
+import { tool } from "ai";
+import {
+ ScrapeGraphAI,
+ type ScrapeGraphAIInput,
+ crawlPagesQuerySchema,
+ crawlRequestSchema,
+ extractRequestBaseSchema,
+ monitorActivityQuerySchema,
+ monitorCreateSchema,
+ monitorUpdateSchema,
+ scrapeRequestSchema,
+ searchRequestSchema,
+} from "scrapegraph-js";
+import { z } from "zod";
+
+export type ScrapeGraphToolOptions = ScrapeGraphAIInput;
+
+// Input for tools that act on one existing crawl or monitor: just its non-empty id.
+const idSchema = z.object({
+ id: z.string().min(1),
+});
+
+// Input for the monitor-activity tool: the monitor id plus optional query params.
+const monitorActivityInputSchema = z.object({
+ id: z.string().min(1),
+ params: monitorActivityQuerySchema.optional(),
+});
+
+// Input for the monitor-update tool: the monitor id plus the (required) update payload.
+const monitorUpdateInputSchema = z.object({
+ id: z.string().min(1),
+ params: monitorUpdateSchema,
+});
+
+// Input for the crawl-pages tool. `.partial()` makes every query field optional and the
+// whole `params` object may be omitted.
+const crawlPagesInputSchema = z.object({
+ id: z.string().min(1),
+ params: crawlPagesQuerySchema.partial().optional(),
+});
+
+/**
+ * Converts a ScrapeGraphAI result envelope into its payload so tool executions either
+ * return data or throw.
+ *
+ * @param result - Envelope with a `status`, a nullable `data` payload and an optional `error` message.
+ * @returns The `data` payload.
+ * @throws Error with `result.error` (or a generic message) when `status` is "error".
+ * @throws Error when `data` is falsy; note this also rejects payloads such as `0`, `""` or `false`.
+ */
+function unwrap<T>(result: { status: "success" | "error"; data: T | null; error?: string }): T {
+  if (result.status === "error") {
+    throw new Error(result.error ?? "ScrapeGraphAI request failed");
+  }
+
+  if (!result.data) {
+    throw new Error("ScrapeGraphAI request returned no data");
+  }
+
+  return result.data;
+}
+
+/**
+ * Builds an AI SDK tool that scrapes a webpage through `scrape` on a ScrapeGraphAI client.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`; each tool creates its own client.
+ * @returns A tool whose input is validated by `scrapeRequestSchema`. Execution throws
+ *   (via `unwrap`) when the request fails or returns no data.
+ */
+export function scrapeTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description:
+      "Scrape a webpage with ScrapeGraphAI. Supports markdown, html, json extraction, links, images, summary, branding, and screenshots.",
+    inputSchema: scrapeRequestSchema,
+    execute: async (request) => {
+      const response = await client.scrape(request);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that extracts structured JSON through `extract` on a ScrapeGraphAI client.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool whose input is validated by `extractRequestBaseSchema`. Execution throws
+ *   (via `unwrap`) when the request fails or returns no data.
+ */
+export function extractTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description:
+      "Extract structured JSON from a URL, HTML, or markdown using ScrapeGraphAI and a natural-language prompt.",
+    inputSchema: extractRequestBaseSchema,
+    execute: async (request) => {
+      const response = await client.extract(request);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that runs a web search through `search` on a ScrapeGraphAI client.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool whose input is validated by `searchRequestSchema`. Execution throws
+ *   (via `unwrap`) when the request fails or returns no data.
+ */
+export function searchTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description:
+      "Search the web with ScrapeGraphAI and optionally extract structured data from the results.",
+    inputSchema: searchRequestSchema,
+    execute: async (request) => {
+      const response = await client.search(request);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that starts a crawl through `crawl.start`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool whose input is validated by `crawlRequestSchema`. Execution throws
+ *   (via `unwrap`) when the request fails or returns no data.
+ */
+export function startCrawlTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    // NOTE(review): the two glob examples at the end of this description look as if a
+    // placeholder path segment was lost — confirm the intended patterns.
+    description:
+      'Start an asynchronous ScrapeGraphAI crawl. Returns a crawl id. Poll getCrawlTool for status, then call getCrawlPagesTool to retrieve paginated pages and scrape results. When the user asks to crawl only a section or path slug, set includePatterns using glob-style URL patterns: "*/" for first-level paths and "**//**" for nested paths.',
+    inputSchema: crawlRequestSchema,
+    execute: async (request) => {
+      const response = await client.crawl.start(request);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that fetches a crawl's status through `crawl.get`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id }`. Execution throws (via `unwrap`) when the request
+ *   fails or returns no data.
+ */
+export function getCrawlTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description:
+      "Get crawl status by crawl id. Use this after startCrawlTool for polling progress; use getCrawlPagesTool to retrieve paginated pages and scrape results.",
+    inputSchema: idSchema,
+    execute: async (input) => {
+      const response = await client.crawl.get(input.id);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that pages through a crawl's results with `crawl.pages`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id, params? }`; `params` is forwarded as given (possibly
+ *   undefined). Execution throws (via `unwrap`) when the request fails or returns no data.
+ */
+export function getCrawlPagesTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description:
+      "Get cursor-paginated crawled pages for a ScrapeGraphAI crawl by crawl id. Returned pages include resolved scrape results when available. Default pagination is cursor 0 and limit 50.",
+    inputSchema: crawlPagesInputSchema,
+    execute: async (input) => {
+      const response = await client.crawl.pages(input.id, input.params);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that stops a crawl through `crawl.stop`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id }`. Execution throws (via `unwrap`) when the request
+ *   fails or returns no data.
+ */
+export function stopCrawlTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description: "Stop a running ScrapeGraphAI crawl by crawl id.",
+    inputSchema: idSchema,
+    execute: async (input) => {
+      const response = await client.crawl.stop(input.id);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that resumes a crawl through `crawl.resume`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id }`. Execution throws (via `unwrap`) when the request
+ *   fails or returns no data.
+ */
+export function resumeCrawlTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description: "Resume a paused ScrapeGraphAI crawl by crawl id.",
+    inputSchema: idSchema,
+    execute: async (input) => {
+      const response = await client.crawl.resume(input.id);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that deletes a crawl through `crawl.delete`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id }`. Execution throws (via `unwrap`) when the request
+ *   fails or returns no data.
+ */
+export function deleteCrawlTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description: "Delete a ScrapeGraphAI crawl by crawl id.",
+    inputSchema: idSchema,
+    execute: async (input) => {
+      const response = await client.crawl.delete(input.id);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Creates every crawl tool with the same options, keyed by the name each tool is
+ * registered under when the result is spread into a `tools` object.
+ *
+ * @param options - Passed unchanged to each individual tool factory.
+ * @returns `{ startCrawl, getCrawl, getCrawlPages, stopCrawl, resumeCrawl, deleteCrawl }`.
+ */
+export function crawlTools(options?: ScrapeGraphToolOptions) {
+  const startCrawl = startCrawlTool(options);
+  const getCrawl = getCrawlTool(options);
+  const getCrawlPages = getCrawlPagesTool(options);
+  const stopCrawl = stopCrawlTool(options);
+  const resumeCrawl = resumeCrawlTool(options);
+  const deleteCrawl = deleteCrawlTool(options);
+
+  return { startCrawl, getCrawl, getCrawlPages, stopCrawl, resumeCrawl, deleteCrawl };
+}
+
+/**
+ * Builds an AI SDK tool that creates a monitor through `monitor.create`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool whose input is validated by `monitorCreateSchema`. Execution throws
+ *   (via `unwrap`) when the request fails or returns no data.
+ */
+export function createMonitorTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description:
+      "Create an asynchronous ScrapeGraphAI monitor for a webpage. Returns a monitor id for status and activity checks.",
+    inputSchema: monitorCreateSchema,
+    execute: async (request) => {
+      const response = await client.monitor.create(request);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that lists monitors through `monitor.list`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool that takes an empty object. Execution throws (via `unwrap`) when the
+ *   request fails or returns no data.
+ */
+export function listMonitorsTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description: "List ScrapeGraphAI monitors.",
+    inputSchema: z.object({}),
+    execute: async () => {
+      const response = await client.monitor.list();
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that fetches one monitor through `monitor.get`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id }`. Execution throws (via `unwrap`) when the request
+ *   fails or returns no data.
+ */
+export function getMonitorTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description: "Get a ScrapeGraphAI monitor by monitor id.",
+    inputSchema: idSchema,
+    execute: async (input) => {
+      const response = await client.monitor.get(input.id);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that updates a monitor through `monitor.update`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id, params }`. Execution throws (via `unwrap`) when the
+ *   request fails or returns no data.
+ */
+export function updateMonitorTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description: "Update a ScrapeGraphAI monitor by monitor id.",
+    inputSchema: monitorUpdateInputSchema,
+    execute: async (input) => {
+      const response = await client.monitor.update(input.id, input.params);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that deletes a monitor through `monitor.delete`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id }`. Execution throws (via `unwrap`) when the request
+ *   fails or returns no data.
+ */
+export function deleteMonitorTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description: "Delete a ScrapeGraphAI monitor by monitor id.",
+    inputSchema: idSchema,
+    execute: async (input) => {
+      const response = await client.monitor.delete(input.id);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that pauses a monitor through `monitor.pause`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id }`. Execution throws (via `unwrap`) when the request
+ *   fails or returns no data.
+ */
+export function pauseMonitorTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description: "Pause a ScrapeGraphAI monitor by monitor id.",
+    inputSchema: idSchema,
+    execute: async (input) => {
+      const response = await client.monitor.pause(input.id);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that resumes a monitor through `monitor.resume`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id }`. Execution throws (via `unwrap`) when the request
+ *   fails or returns no data.
+ */
+export function resumeMonitorTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description: "Resume a paused ScrapeGraphAI monitor by monitor id.",
+    inputSchema: idSchema,
+    execute: async (input) => {
+      const response = await client.monitor.resume(input.id);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Builds an AI SDK tool that fetches a monitor's activity through `monitor.activity`.
+ *
+ * @param options - Passed unchanged to `ScrapeGraphAI(...)`.
+ * @returns A tool taking `{ id, params? }`; `params` is forwarded as given (possibly
+ *   undefined). Execution throws (via `unwrap`) when the request fails or returns no data.
+ */
+export function getMonitorActivityTool(options?: ScrapeGraphToolOptions) {
+  const client = ScrapeGraphAI(options);
+
+  return tool({
+    description:
+      "Get recent activity ticks for a ScrapeGraphAI monitor by monitor id. Use after creating or retrieving a monitor.",
+    inputSchema: monitorActivityInputSchema,
+    execute: async (input) => {
+      const response = await client.monitor.activity(input.id, input.params);
+      return unwrap(response);
+    },
+  });
+}
+
+/**
+ * Creates every monitor tool with the same options, keyed by the name each tool is
+ * registered under when the result is spread into a `tools` object.
+ *
+ * @param options - Passed unchanged to each individual tool factory.
+ * @returns An object with `createMonitor`, `listMonitors`, `getMonitor`, `updateMonitor`,
+ *   `deleteMonitor`, `pauseMonitor`, `resumeMonitor` and `getMonitorActivity`.
+ */
+export function monitorTools(options?: ScrapeGraphToolOptions) {
+  const createMonitor = createMonitorTool(options);
+  const listMonitors = listMonitorsTool(options);
+  const getMonitor = getMonitorTool(options);
+  const updateMonitor = updateMonitorTool(options);
+  const deleteMonitor = deleteMonitorTool(options);
+  const pauseMonitor = pauseMonitorTool(options);
+  const resumeMonitor = resumeMonitorTool(options);
+  const getMonitorActivity = getMonitorActivityTool(options);
+
+  return {
+    createMonitor,
+    listMonitors,
+    getMonitor,
+    updateMonitor,
+    deleteMonitor,
+    pauseMonitor,
+    resumeMonitor,
+    getMonitorActivity,
+  };
+}
diff --git a/packages/ai-sdk/tsconfig.json b/packages/ai-sdk/tsconfig.json
new file mode 100644
index 0000000..229717c
--- /dev/null
+++ b/packages/ai-sdk/tsconfig.json
@@ -0,0 +1,12 @@
+{
+ "extends": "../../tsconfig.json",
+ "compilerOptions": {
+ "rootDir": "src",
+ "outDir": "dist",
+ // Resolve the "scrapegraph-js" types from the repository root's build output, so the
+ // root package has to be built before this package is type-checked.
+ "paths": {
+ "scrapegraph-js": ["../../dist/index.d.ts"]
+ }
+ },
+ "include": ["src"],
+ "exclude": ["node_modules", "dist", "tests"]
+}
diff --git a/packages/ai-sdk/tsup.config.ts b/packages/ai-sdk/tsup.config.ts
new file mode 100644
index 0000000..337fac5
--- /dev/null
+++ b/packages/ai-sdk/tsup.config.ts
@@ -0,0 +1,10 @@
+import { defineConfig } from "tsup";
+
+// Bundle src/index.ts as an ESM build for Node 22 into dist/, emitting type
+// declarations and clearing the output directory first.
+export default defineConfig({
+  entry: ["src/index.ts"],
+  outDir: "dist",
+  format: ["esm"],
+  target: "node22",
+  dts: true,
+  clean: true,
+});
diff --git a/src/index.ts b/src/index.ts
index aa69918..2c8bd8e 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -12,113 +12,6 @@ export {
monitor,
} from "./scrapegraphai.js";
-export type {
- ApiResult,
- Branding,
- BrandingColors,
- BrandingFontEntry,
- BrandingImages,
- BrandingMetadata,
- BrandingPersonality,
- BrandingTypography,
- ChunkerMetadata,
- ContentPageMetadata,
- CrawlHistoryEntry,
- CrawlPage,
- CrawlPageStatus,
- CrawlRequest,
- CrawlResponse,
- CrawlResult,
- CrawlStatus,
- CreditsJobs,
- CreditsResponse,
- ExtractHistoryEntry,
- ExtractRequest,
- ExtractResponse,
- FetchConfig,
- FetchContentType,
- FetchMode,
- FetchWarning,
- FetchWarningReason,
- FormatConfig,
- FormatError,
- FormatMetadataMap,
- FormatResponseMap,
- FormatType,
- HealthResponse,
- HistoryEntry,
- HistoryFilter,
- HistoryPage,
- HistoryPagination,
- HistoryStatus,
- HtmlMode,
- ImageChange,
- ImageContentType,
- JobsStatus,
- JsonChange,
- MarkdownFormatConfig,
- HtmlFormatConfig,
- ScreenshotFormatConfig,
- JsonFormatConfig,
- LinksFormatConfig,
- ImagesFormatConfig,
- SummaryFormatConfig,
- BrandingFormatConfig,
- MockConfig,
- MonitorActivityRequest,
- MonitorActivityResponse,
- MonitorCreateRequest,
- MonitorDiffs,
- MonitorHistoryEntry,
- MonitorRefs,
- MonitorResponse,
- MonitorResult,
- MonitorTickEntry,
- MonitorTickStatus,
- MonitorUpdateRequest,
- PageResponse,
- ScrapeHistoryEntry,
- ScrapeMetadata,
- ScrapeRequest,
- ScrapeResponse,
- ScrapeResultMap,
- ScreenshotData,
- SearchHistoryEntry,
- SearchMetadata,
- SearchRequest,
- SearchResponse,
- SearchResult,
- Service,
- SetChange,
- TextChange,
- TimeRange,
- TokenUsage,
- WebhookStatus,
-} from "./types.js";
+export type * from "./types.js";
-export {
- brandingFormatConfigSchema,
- crawlRequestSchema,
- extractRequestSchema,
- fetchConfigSchema,
- fetchContentTypeSchema,
- fetchModeSchema,
- formatConfigSchema,
- historyFilterSchema,
- htmlFormatConfigSchema,
- htmlModeSchema,
- imagesFormatConfigSchema,
- jsonFormatConfigSchema,
- linksFormatConfigSchema,
- markdownFormatConfigSchema,
- mockConfigSchema,
- monitorActivityRequestSchema,
- monitorCreateRequestSchema,
- monitorUpdateRequestSchema,
- scrapeRequestSchema,
- screenshotFormatConfigSchema,
- searchRequestSchema,
- serviceSchema,
- summaryFormatConfigSchema,
- timeRangeSchema,
-} from "./schemas.js";
+export * from "./schemas.js";
+
+// Deprecated aliases for schema exports that were listed by name in 2.1.0 and renamed in
+// schemas.ts, kept so existing imports keep resolving in this minor release. Explicitly
+// named exports take precedence over `export *`, so these cannot clash with it.
+// `mockConfigSchema` and `timeRangeSchema` were inlined and have no standalone
+// counterpart to alias.
+export {
+  scrapeBrandingFormatSchema as brandingFormatConfigSchema,
+  scrapeFormatEntrySchema as formatConfigSchema,
+  scrapeHtmlFormatSchema as htmlFormatConfigSchema,
+  scrapeImagesFormatSchema as imagesFormatConfigSchema,
+  scrapeJsonFormatSchema as jsonFormatConfigSchema,
+  scrapeLinksFormatSchema as linksFormatConfigSchema,
+  scrapeMarkdownFormatSchema as markdownFormatConfigSchema,
+  monitorActivityQuerySchema as monitorActivityRequestSchema,
+  monitorCreateSchema as monitorCreateRequestSchema,
+  monitorUpdateSchema as monitorUpdateRequestSchema,
+  scrapeScreenshotFormatSchema as screenshotFormatConfigSchema,
+  serviceEnumSchema as serviceSchema,
+  scrapeSummaryFormatSchema as summaryFormatConfigSchema,
+} from "./schemas.js";
diff --git a/src/models.ts b/src/models.ts
new file mode 100644
index 0000000..2a63ee1
--- /dev/null
+++ b/src/models.ts
@@ -0,0 +1,13 @@
+/**
+ * Model identifiers accepted wherever a `ModelName` is expected.
+ * Declared `as const` so each entry keeps its literal type and the tuple can be passed
+ * to `z.enum(...)`.
+ */
+export const MODEL_NAMES = [
+ "gpt-4o-mini",
+ "gpt-4o-mini-2024-07-18",
+ "llama-3.3-70b-versatile",
+ "llama-3.1-8b-instant",
+ "mixtral-8x7b-32768",
+ "mistral-small-2501",
+ "gpt-oss-120b",
+ "openai/gpt-oss-120b",
+ "claude-haiku-4-5-20251001",
+] as const;
+
+/** Union of the string literals listed in `MODEL_NAMES`. */
+export type ModelName = (typeof MODEL_NAMES)[number];
diff --git a/src/schemas.ts b/src/schemas.ts
index 13f9ae3..6a273c9 100644
--- a/src/schemas.ts
+++ b/src/schemas.ts
@@ -1,25 +1,15 @@
-import { z } from "zod";
+import { z } from "zod/v4";
+import { MODEL_NAMES } from "./models.js";
+import * as url from "./url.js";
-export const serviceSchema = z.enum(["scrape", "extract", "search", "monitor", "crawl"]);
+// shared sub-schemas composed into route request schemas below
+// Names of the API services; used as the optional `service` filter in `historyFilterSchema`.
+export const serviceEnumSchema = z.enum(["scrape", "extract", "search", "monitor", "crawl"]);
+// Two-value outcome enum; not referenced in the part of this file visible in this change.
+export const statusEnumSchema = z.enum(["completed", "failed"]);
export const htmlModeSchema = z.enum(["normal", "reader", "prune"]);
-
-export const fetchModeSchema = z.enum(["auto", "fast", "js"]);
-
-export const timeRangeSchema = z.enum([
- "past_hour",
- "past_24_hours",
- "past_week",
- "past_month",
- "past_year",
-]);
-
-export const crawlStatusSchema = z.enum(["running", "completed", "failed", "paused", "deleted"]);
-
-export const crawlPageStatusSchema = z.enum(["completed", "failed", "skipped"]);
-
export const fetchContentTypeSchema = z.enum([
"text/html",
+ "application/json",
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
@@ -39,10 +29,37 @@ export const fetchContentTypeSchema = z.enum([
"text/plain",
"application/x-latex",
]);
-
export const userPromptSchema = z.string().min(1).max(10_000);
-export const urlSchema = z.string().url();
+// Hostname shaped like a DNS name: 1–253 characters (an optional trailing dot is allowed),
+// made of at least two dot-separated labels, each 1–63 letters/digits/hyphens that neither
+// starts nor ends with a hyphen. Case-insensitive. Single-label hosts such as "localhost"
+// do not match.
+const PUBLIC_DOMAIN_RE =
+ /^(?=.{1,253}\.?$)(?:[a-z\d](?:[a-z\d-]{0,61}[a-z\d])?\.)+[a-z\d](?:[a-z\d-]{0,61}[a-z\d])?\.?$/i;
+// Dotted-quad IPv4 literal: four decimal octets in 0–255, without leading zeros.
+const IPV4_RE = /^(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)$/;
+
+/**
+ * Validates and normalizes a target URL.
+ *
+ * The input is trimmed, `https://` is prepended when it has no `scheme://` prefix, and the
+ * result must pass `z.url()`. The refinement then requires:
+ * - an `http:` or `https:` protocol;
+ * - a hostname that matches `PUBLIC_DOMAIN_RE` or `IPV4_RE`, contains ":" (presumably an
+ *   IPv6 literal — confirm), or is `localhost` while `NODE_ENV` is "development";
+ * - outside development, a hostname that `url.isInternal` does not flag.
+ *
+ * Every refinement failure reports "Private or internal URLs are not allowed".
+ */
+export const urlSchema = z
+  .string()
+  .trim()
+  .transform((val) => (/^[a-z][a-z\d+.-]*:\/\//i.test(val) ? val : `https://${val}`))
+  .pipe(z.url())
+  .check(
+    z.refine((val) => {
+      // Read NODE_ENV defensively: where `process` is not defined, an unguarded read
+      // inside the try below throws, the catch returns false, and every URL is rejected.
+      // Evaluated per validation so changes to NODE_ENV are still picked up.
+      let isDevelopment = false;
+      try {
+        isDevelopment = process.env.NODE_ENV === "development";
+      } catch {
+        isDevelopment = false;
+      }
+
+      try {
+        const { protocol, hostname } = new URL(val);
+        if (protocol !== "http:" && protocol !== "https:") return false;
+        if (
+          !PUBLIC_DOMAIN_RE.test(hostname) &&
+          !IPV4_RE.test(hostname) &&
+          !hostname.includes(":") &&
+          !(isDevelopment && hostname === "localhost")
+        ) {
+          return false;
+        }
+        if (isDevelopment) return true;
+        return !url.isInternal(hostname);
+      } catch {
+        return false;
+      }
+    }, "Private or internal URLs are not allowed"),
+  );
export const paginationSchema = z.object({
page: z.coerce.number().int().positive().default(1),
@@ -53,6 +70,8 @@ export const uuidParamSchema = z.object({
id: z.string().regex(/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i),
});
+export const fetchModeSchema = z.enum(["auto", "fast", "js"]);
+
export const FETCH_CONFIG_DEFAULTS = {
mode: "auto",
stealth: false,
@@ -61,14 +80,6 @@ export const FETCH_CONFIG_DEFAULTS = {
scrolls: 0,
} as const;
-export const mockConfigSchema = z.object({
- minKb: z.number().int().min(1).max(1000).default(1),
- maxKb: z.number().int().min(1).max(1000).default(5),
- minSleep: z.number().int().min(0).max(30000).default(5),
- maxSleep: z.number().int().min(0).max(30000).default(15),
- writeToBucket: z.boolean().default(false),
-});
-
export const fetchConfigSchema = z.object({
mode: fetchModeSchema.default(FETCH_CONFIG_DEFAULTS.mode),
stealth: z.boolean().default(FETCH_CONFIG_DEFAULTS.stealth),
@@ -82,65 +93,122 @@ export const fetchConfigSchema = z.object({
.transform((v) => v.toLowerCase())
.optional(),
scrolls: z.number().int().min(0).max(100).default(FETCH_CONFIG_DEFAULTS.scrolls),
- mock: z.union([z.boolean(), mockConfigSchema]).default(false),
+ mock: z
+ .union([
+ z.boolean(),
+ z.object({
+ minKb: z.number().int().min(1).max(1000).default(1),
+ maxKb: z.number().int().min(1).max(1000).default(5),
+ minSleep: z.number().int().min(0).max(30000).default(5),
+ maxSleep: z.number().int().min(0).max(30000).default(15),
+ writeToBucket: z.boolean().default(false),
+ }),
+ ])
+ .default(false),
+});
+
+// Chunking options, both optional: `size` is an integer of at least 2048 or the literal
+// "dynamic"; `overlap` is an integer from 0 to 512.
+export const chunkerSchema = z.object({
+ size: z.union([z.number().int().min(2048), z.literal("dynamic")]).optional(),
+ overlap: z.number().int().min(0).max(512).optional(),
+});
+
+// LLM options: an optional `model` restricted to MODEL_NAMES, `temperature` in [0, 1]
+// (default 0), `maxTokens` in [1, 16384] (default 16384) and optional chunker settings.
+// In the visible part of this file it is only mentioned in commented-out fields.
+export const llmConfigSchema = z.object({
+ model: z.enum(MODEL_NAMES).optional(),
+ temperature: z.number().min(0).max(1).default(0),
+ maxTokens: z.number().int().min(1).max(16384).default(16384),
+ chunker: chunkerSchema.optional(),
+});
+// route request schemas
+
export const historyFilterSchema = z.object({
page: z.coerce.number().int().positive().default(1),
limit: z.coerce.number().int().min(1).max(100).default(20),
- service: serviceSchema.optional(),
+ service: serviceEnumSchema.optional(),
});
-export const markdownFormatConfigSchema = z.object({
- type: z.literal("markdown"),
+// Format names that yield page content.
+export const scrapeContentFormatSchema = z.enum([
+ "markdown",
+ "html",
+ "links",
+ "images",
+ "summary",
+ "json",
+ "branding",
+]);
+// Format names that yield a capture of the page; currently only "screenshot".
+export const scrapeCaptureFormatSchema = z.enum(["screenshot"]);
+// Every format name: the content formats followed by the capture formats, built from the
+// two enums' `options` so the lists cannot drift apart.
+export const scrapeFormatSchema = z.enum([
+ ...scrapeContentFormatSchema.options,
+ ...scrapeCaptureFormatSchema.options,
+]);
+
+export const markdownConfigSchema = z.object({
mode: htmlModeSchema.default("normal"),
});
-export const htmlFormatConfigSchema = z.object({
- type: z.literal("html"),
+export const htmlConfigSchema = z.object({
mode: htmlModeSchema.default("normal"),
});
-export const screenshotFormatConfigSchema = z.object({
- type: z.literal("screenshot"),
+export const screenshotConfigSchema = z.object({
fullPage: z.boolean().default(false),
width: z.number().int().min(320).max(3840).default(1440),
height: z.number().int().min(200).max(2160).default(900),
quality: z.number().int().min(1).max(100).default(80),
});
-export const jsonFormatConfigSchema = z.object({
- type: z.literal("json"),
- prompt: userPromptSchema,
+export const scrapeJsonConfigSchema = z.object({
+ prompt: z.string().max(10_000).default(""),
schema: z.record(z.string(), z.unknown()).optional(),
+ // llmConfig: llmConfigSchema.optional(),
mode: htmlModeSchema.default("normal"),
});
-export const linksFormatConfigSchema = z.object({
+export const scrapeSummaryConfigSchema = z.object({
+ // llmConfig: llmConfigSchema.optional(),
+});
+
+export const scrapeMarkdownFormatSchema = markdownConfigSchema.extend({
+ type: z.literal("markdown"),
+});
+
+export const scrapeHtmlFormatSchema = htmlConfigSchema.extend({
+ type: z.literal("html"),
+});
+
+export const scrapeScreenshotFormatSchema = screenshotConfigSchema.extend({
+ type: z.literal("screenshot"),
+});
+
+export const scrapeJsonFormatSchema = scrapeJsonConfigSchema.extend({
+ type: z.literal("json"),
+});
+
+export const scrapeLinksFormatSchema = z.object({
type: z.literal("links"),
});
-export const imagesFormatConfigSchema = z.object({
+export const scrapeImagesFormatSchema = z.object({
type: z.literal("images"),
});
-export const summaryFormatConfigSchema = z.object({
+export const scrapeSummaryFormatSchema = scrapeSummaryConfigSchema.extend({
type: z.literal("summary"),
});
-export const brandingFormatConfigSchema = z.object({
+export const scrapeBrandingFormatSchema = z.object({
type: z.literal("branding"),
});
-export const formatConfigSchema = z.discriminatedUnion("type", [
- markdownFormatConfigSchema,
- htmlFormatConfigSchema,
- screenshotFormatConfigSchema,
- jsonFormatConfigSchema,
- linksFormatConfigSchema,
- imagesFormatConfigSchema,
- summaryFormatConfigSchema,
- brandingFormatConfigSchema,
+export const scrapeFormatEntrySchema = z.discriminatedUnion("type", [
+ scrapeMarkdownFormatSchema,
+ scrapeHtmlFormatSchema,
+ scrapeScreenshotFormatSchema,
+ scrapeJsonFormatSchema,
+ scrapeLinksFormatSchema,
+ scrapeImagesFormatSchema,
+ scrapeSummaryFormatSchema,
+ scrapeBrandingFormatSchema,
]);
export const scrapeRequestSchema = z.object({
@@ -148,15 +216,15 @@ export const scrapeRequestSchema = z.object({
contentType: fetchContentTypeSchema.optional(),
fetchConfig: fetchConfigSchema.optional(),
formats: z
- .array(formatConfigSchema)
- .min(1)
+ .array(scrapeFormatEntrySchema)
+ .min(1, { message: "Select at least one format" })
.refine((formats) => new Set(formats.map((format) => format.type)).size === formats.length, {
message: "duplicate format types not allowed",
})
.default([{ type: "markdown", mode: "normal" }]),
});
-export const extractRequestSchema = z
+export const extractRequestBaseSchema = z
.object({
url: urlSchema.optional(),
html: z.string().optional(),
@@ -181,18 +249,68 @@ export const searchRequestSchema = z
prompt: userPromptSchema.optional(),
schema: z.record(z.string(), z.unknown()).optional(),
locationGeoCode: z.string().max(10).optional(),
- timeRange: timeRangeSchema.optional(),
+ timeRange: z
+ .enum(["past_hour", "past_24_hours", "past_week", "past_month", "past_year"])
+ .optional(),
})
.refine((d) => !d.schema || d.prompt, {
message: "schema requires prompt",
+ path: ["prompt"],
});
-export const monitorCreateRequestSchema = z.object({
+// ─── response schemas ───────────────────────────────────────────────────────
+
+// Response carrying a single, email-formatted `email` field.
+export const validateResponseSchema = z.object({
+ email: z.email(),
+});
+
+// Acknowledgement response: `ok` must be the literal `true`.
+export const okResponseSchema = z.object({
+ ok: z.literal(true),
+});
+
+// Health report: overall `status`, a non-negative integer `uptime` (unit not stated
+// here) and an optional per-dependency breakdown for redis and db.
+export const healthResponseSchema = z.object({
+ status: z.enum(["ok", "degraded"]),
+ uptime: z.number().int().nonnegative(),
+ services: z
+ .object({
+ redis: z.enum(["ok", "down"]),
+ db: z.enum(["ok", "down"]),
+ })
+ .optional(),
+});
+
+// Token counts, both non-negative integers.
+export const tokenUsageSchema = z.object({
+ promptTokens: z.number().int().nonnegative(),
+ completionTokens: z.number().int().nonnegative(),
+});
+
+// Chunking metadata: one entry per chunk, each with a non-negative integer `size`.
+export const chunkerMetadataSchema = z.object({
+ chunks: z.array(z.object({ size: z.number().int().nonnegative() })),
+});
+
+// Usage against a quota for one job type: `used` out of `limit`.
+export const jobsStatusSchema = z.object({
+ used: z.number().int().nonnegative(),
+ limit: z.number().int().nonnegative(),
+});
+
+// Credit balance: remaining and used credits (integers, sign not constrained), the plan
+// name, and job usage for crawl and monitor.
+export const creditsResponseSchema = z.object({
+ remaining: z.number().int(),
+ used: z.number().int(),
+ plan: z.string(),
+ jobs: z.object({
+ crawl: jobsStatusSchema,
+ monitor: jobsStatusSchema,
+ }),
+});
+
+// ─── monitor schemas ────────────────────────────────────────────────────────
+
+export const monitorCreateSchema = z.object({
url: urlSchema,
name: z.string().max(200).optional(),
formats: z
- .array(formatConfigSchema)
- .min(1)
+ .array(scrapeFormatEntrySchema)
+ .min(1, { message: "Select at least one format" })
.refine((formats) => new Set(formats.map((f) => f.type)).size === formats.length, {
message: "duplicate format types not allowed",
})
@@ -202,11 +320,11 @@ export const monitorCreateRequestSchema = z.object({
fetchConfig: fetchConfigSchema.optional(),
});
-export const monitorUpdateRequestSchema = z
+export const monitorUpdateSchema = z
.object({
name: z.string().max(200).optional(),
formats: z
- .array(formatConfigSchema)
+ .array(scrapeFormatEntrySchema)
.min(1)
.refine((formats) => new Set(formats.map((f) => f.type)).size === formats.length, {
message: "duplicate format types not allowed",
@@ -218,16 +336,167 @@ export const monitorUpdateRequestSchema = z
})
.partial();
-export const monitorActivityRequestSchema = z.object({
+export const monitorActivityQuerySchema = z.object({ // query parameters accepted when listing monitor activity
+ cursor: z.iso.datetime({ offset: true, local: true }).optional(), // ISO 8601 datetime; the options additionally accept UTC offsets and offset-less local times
limit: z.coerce.number().int().min(1).max(100).default(20),
- cursor: z.string().optional(),
});
+// ─── history response schemas ───────────────────────────────────────────────
+
+export const historyStatusSchema = z.enum(["completed", "failed", "running", "paused", "deleted"]);
+
+const historyBase = { // shared fields, spread into each per-service history entry schema
+ id: z.string(),
+ status: historyStatusSchema,
+ error: z.unknown(), // any value is accepted; its shape is not validated
+ elapsedMs: z.number(),
+ createdAt: z.iso.datetime(),
+ requestParentId: z.string().nullable(), // null is allowed
+};
+
+export const paginationInfoSchema = z.object({
+ page: z.number().int(),
+ limit: z.number().int(),
+ total: z.number().int(),
+});
+
+export const cursorPaginationInfoSchema = z.object({
+ limit: z.number().int(),
+ nextCursor: z.string().nullable(),
+});
+
+export function pageResponseSchema<T extends z.ZodType>(itemSchema: T) { // builds a page envelope schema around `itemSchema`
+ return z.object({
+ data: z.array(itemSchema), // the items, each validated by `itemSchema`
+ pagination: paginationInfoSchema, // page / limit / total
+ });
+}
+
+export function cursorPageResponseSchema<T extends z.ZodType>(itemSchema: T) { // builds a cursor-paginated envelope schema around `itemSchema`
+ return z.object({
+ data: z.array(itemSchema), // the items, each validated by `itemSchema`
+ pagination: cursorPaginationInfoSchema, // limit / nextCursor (null is allowed for nextCursor)
+ });
+}
+
+// ─── extract / search response schemas ──────────────────────────────────────
+
+export const extractResponseSchema = z.object({
+ raw: z.string().nullable(),
+ json: z.record(z.string(), z.unknown()).nullable(),
+ usage: tokenUsageSchema,
+ metadata: z.object({
+ chunker: chunkerMetadataSchema,
+ fetch: z.object({ provider: z.string().optional() }).optional(),
+ }),
+});
+
+export const searchResultSchema = z.object({
+ url: z.string(),
+ title: z.string(),
+ content: z.string(),
+ provider: z.string().optional(),
+});
+
+export const searchMetadataSchema = z.object({
+ search: z.object({ provider: z.string().optional() }),
+ pages: z.object({ requested: z.number().int(), scraped: z.number().int() }),
+ chunker: chunkerMetadataSchema.optional(),
+});
+
+export const searchResponseSchema = z.object({
+ results: z.array(searchResultSchema),
+ json: z.record(z.string(), z.unknown()).nullable().optional(),
+ raw: z.string().nullable().optional(),
+ usage: tokenUsageSchema.optional(),
+ metadata: searchMetadataSchema,
+});
+
+// ─── monitor response schemas ───────────────────────────────────────────────
+
+export const textChangeSchema = z.object({
+ type: z.enum(["added", "removed"]),
+ line: z.number().int(),
+ content: z.string(),
+});
+
+export const jsonChangeSchema = z.object({
+ path: z.string(),
+ old: z.unknown(),
+ new: z.unknown(),
+});
+
+export const setChangeSchema = z.object({
+ added: z.array(z.string()),
+ removed: z.array(z.string()),
+});
+
+export const imageChangeSchema = z.object({
+ size: z.number(),
+ changed: z.number(),
+ mask: z.string().optional(),
+});
+
+export const monitorDiffsSchema = z.object({ // per-format diffs; every key is optional
+ markdown: z.array(textChangeSchema).optional(), // added/removed line entries
+ html: z.array(textChangeSchema).optional(), // added/removed line entries
+ json: z.array(jsonChangeSchema).optional(), // path with old/new values
+ screenshot: imageChangeSchema.optional(), // size / changed numbers plus an optional mask string
+ links: setChangeSchema.optional(), // added/removed string lists
+ images: setChangeSchema.optional(), // added/removed string lists
+ summary: z.array(textChangeSchema).optional(), // added/removed line entries
+ branding: z.array(jsonChangeSchema).optional(), // path with old/new values
+});
+
+export const webhookStatusSchema = z.object({
+ sentAt: z.iso.datetime(),
+ statusCode: z.number().int().nullable(),
+ error: z.string().optional(),
+});
+
+export const monitorResultSchema = z.object({
+ changed: z.boolean(),
+ diffs: monitorDiffsSchema,
+ refs: z.record(z.string(), z.string()),
+ webhookStatus: webhookStatusSchema.optional(),
+});
+
+export const monitorResponseSchema = z.object({
+ cronId: z.string(),
+ scheduleId: z.string(),
+ interval: z.string(),
+ status: z.enum(["active", "paused"]),
+ config: monitorCreateSchema,
+ createdAt: z.iso.datetime(),
+ updatedAt: z.iso.datetime(),
+});
+
+export const monitorTickEntrySchema = z.object({
+ id: z.string(),
+ status: z.enum(["completed", "failed", "paused", "running"]),
+ createdAt: z.iso.datetime(),
+ elapsedMs: z.number(),
+ changed: z.boolean(),
+ diffs: monitorDiffsSchema,
+ error: z.string().optional(),
+});
+
+export const monitorActivityResponseSchema = z.object({
+ ticks: z.array(monitorTickEntrySchema),
+ nextCursor: z.string().nullable(),
+});
+
+// ─── crawl schemas ─────────────────────────────────────────────────────────
+
+export const crawlStatusSchema = z.enum(["running", "completed", "failed", "paused", "deleted"]);
+
+export const crawlPageStatusSchema = z.enum(["completed", "failed", "skipped"]);
+
export const crawlRequestSchema = z.object({
url: urlSchema,
formats: z
- .array(formatConfigSchema)
- .min(1)
+ .array(scrapeFormatEntrySchema)
+ .min(1, { message: "Select at least one format" })
.refine((formats) => new Set(formats.map((f) => f.type)).size === formats.length, {
message: "duplicate format types not allowed",
})
@@ -236,8 +505,387 @@ export const crawlRequestSchema = z.object({
maxPages: z.coerce.number().int().min(1).max(1000).default(50),
maxLinksPerPage: z.coerce.number().int().min(1).default(10),
allowExternal: z.boolean().default(false),
- includePatterns: z.array(z.string()).optional(),
- excludePatterns: z.array(z.string()).optional(),
+ includePatterns: z
+ .array(z.string())
+ .optional()
+ .describe(
+ 'Glob-style URL patterns to include. Use "*/" for first-level paths and "**//**" for nested paths.',
+ ),
+ excludePatterns: z
+ .array(z.string())
+ .optional()
+ .describe(
+ 'Glob-style URL patterns to exclude. Use "*/" for first-level paths and "**//**" for nested paths.',
+ ),
contentTypes: z.array(fetchContentTypeSchema).optional(),
fetchConfig: fetchConfigSchema.optional(),
});
+
+export const crawlPagesQuerySchema = z.object({ // pagination query for a crawl's pages
+ cursor: z.coerce.number().int().min(0).default(0), // coerced to an integer >= 0; defaults to 0
+ limit: z.coerce.number().int().min(1).max(100).default(50), // coerced to an integer in 1..100; defaults to 50
+});
+
+// ─── scrape response schemas ────────────────────────────────────────────────
+
+export const fetchWarningSchema = z.object({
+ reason: z.enum(["too_short", "empty", "bot_blocked", "spa_shell", "soft_404"]),
+ provider: z.string().optional(),
+});
+
+export const contentPageMetadataSchema = z.object({
+ index: z.number().int(),
+ images: z.array(
+ z.object({
+ id: z.string(),
+ topLeftX: z.number(),
+ topLeftY: z.number(),
+ bottomRightX: z.number(),
+ bottomRightY: z.number(),
+ }),
+ ),
+ tables: z.array(z.object({ id: z.string(), content: z.string(), format: z.string() })),
+ hyperlinks: z.array(z.string()),
+ dimensions: z.object({ dpi: z.number(), height: z.number(), width: z.number() }),
+});
+
+export const scrapeMetadataSchema = z.object({
+ provider: z.string().optional(),
+ contentType: z.string(),
+ elapsedMs: z.number().optional(),
+ warnings: z.array(fetchWarningSchema).optional(),
+ ocr: z
+ .object({
+ model: z.string(),
+ pagesProcessed: z.number().int(),
+ pages: z.array(contentPageMetadataSchema),
+ })
+ .optional(),
+});
+
+export const brandingColorsSchema = z.object({
+ primary: z.string(),
+ accent: z.string(),
+ background: z.string(),
+ textPrimary: z.string(),
+ link: z.string(),
+});
+
+export const brandingFontEntrySchema = z.object({
+ family: z.string(),
+ role: z.enum(["heading", "body"]),
+});
+
+export const brandingTypographySchema = z.object({
+ fontFamilies: z.object({ primary: z.string(), heading: z.string() }),
+ fontStacks: z.object({
+ heading: z.array(z.string()),
+ body: z.array(z.string()),
+ paragraph: z.array(z.string()),
+ }),
+ fontSizes: z.record(z.string(), z.string()),
+});
+
+export const brandingSpacingSchema = z.object({
+ baseUnit: z.number(),
+ borderRadius: z.string(),
+});
+
+export const brandingInputComponentSchema = z.object({
+ borderColor: z.string(),
+ borderRadius: z.string(),
+});
+
+export const brandingButtonComponentSchema = z.object({
+ background: z.string(),
+ textColor: z.string(),
+ borderRadius: z.string(),
+ shadow: z.string(),
+});
+
+export const brandingComponentsSchema = z.object({
+ input: brandingInputComponentSchema,
+ buttonPrimary: brandingButtonComponentSchema,
+ buttonSecondary: brandingButtonComponentSchema,
+});
+
+export const brandingImagesSchema = z.object({
+ logo: z.string(),
+ favicon: z.string(),
+ ogImage: z.string(),
+});
+
+export const brandingPersonalitySchema = z.object({
+ tone: z.string(),
+ energy: z.enum(["high", "medium", "low"]),
+ targetAudience: z.string(),
+});
+
+export const brandingDesignSystemSchema = z.object({
+ framework: z.string().nullable(),
+ componentLibrary: z.string().nullable(),
+});
+
+export const brandingButtonPickSchema = z.object({
+ index: z.number().int(),
+ text: z.string(),
+ reasoning: z.string(),
+});
+
+export const brandingButtonReasoningSchema = z.object({
+ primary: brandingButtonPickSchema,
+ secondary: brandingButtonPickSchema,
+ confidence: z.number(),
+});
+
+export const brandingLogoReasoningSchema = z.object({
+ selectedIndex: z.number().int(),
+ reasoning: z.string(),
+ confidence: z.number(),
+});
+
+export const brandingConfidenceSchema = z.object({
+ colors: z.number(),
+ buttons: z.number(),
+ logo: z.number(),
+ fonts: z.number(),
+ components: z.number(),
+ overall: z.number(),
+});
+
+export const brandingSchema = z.object({
+ colorScheme: z.enum(["light", "dark"]),
+ fonts: z.array(brandingFontEntrySchema),
+ colors: brandingColorsSchema,
+ typography: brandingTypographySchema,
+ spacing: brandingSpacingSchema,
+ components: brandingComponentsSchema,
+ images: brandingImagesSchema,
+ frameworkHints: z.array(z.string()),
+ buttonReasoning: brandingButtonReasoningSchema,
+ logoReasoning: brandingLogoReasoningSchema,
+ personality: brandingPersonalitySchema,
+ designSystem: brandingDesignSystemSchema,
+ confidence: brandingConfidenceSchema,
+});
+
+export const brandingMetadataSchema = z.object({
+ title: z.string(),
+ description: z.string(),
+ favicon: z.string(),
+ language: z.string(),
+ themeColor: z.string(),
+ ogTitle: z.string(),
+ ogDescription: z.string(),
+ ogImage: z.string(),
+ ogUrl: z.string(),
+});
+
+export const scrapeScreenshotDataSchema = z.object({
+ url: z.string(),
+ width: z.number().int(),
+ height: z.number().int(),
+});
+
+export const scrapeFormatErrorSchema = z.object({
+ code: z.string(),
+ error: z.string(),
+});
+
+const emptyObj = z.object({});
+
+export const scrapeResultSectionSchemas = { // one schema per result section; each pairs `data` with optional `metadata`
+ markdown: z.object({ data: z.array(z.string()), metadata: emptyObj.optional() }),
+ html: z.object({ data: z.array(z.string()), metadata: emptyObj.optional() }),
+ links: z.object({
+ data: z.array(z.string()),
+ metadata: z.object({ count: z.number().int() }).optional(),
+ }),
+ images: z.object({
+ data: z.array(z.string()),
+ metadata: z.object({ count: z.number().int() }).optional(),
+ }),
+ summary: z.object({
+ data: z.string(), // a single string, unlike the array-valued sections above
+ metadata: z.object({ chunker: chunkerMetadataSchema.optional() }).optional(),
+ }),
+ json: z.object({
+ data: z.unknown(), // not validated here
+ metadata: z
+ .object({
+ chunker: chunkerMetadataSchema, // required here, whereas `summary` marks it optional
+ raw: z.string().nullable().optional(),
+ })
+ .optional(),
+ }),
+ branding: z.object({
+ data: brandingSchema,
+ metadata: z.object({ branding: brandingMetadataSchema }).optional(),
+ }),
+ screenshot: z.object({
+ data: scrapeScreenshotDataSchema,
+ metadata: z
+ .object({
+ contentType: z.string(),
+ provider: z.string().optional(),
+ })
+ .optional(),
+ }),
+} as const; // kept as a plain object so sections can also be validated one at a time
+
+export const scrapeResultMapSchema = z.object(scrapeResultSectionSchemas).partial();
+
+export const scrapeResponseSchema = z.object({ // scrape response: per-section results, fetch metadata, optional per-format errors
+ results: scrapeResultMapSchema, // every section is optional
+ metadata: scrapeMetadataSchema,
+ errors: z.partialRecord(scrapeFormatSchema, scrapeFormatErrorSchema).optional(), // partialRecord: only failed formats need a key (zod v4 z.record with enum keys would demand all of them)
+});
+
+// [NOTE] @Claude legacy cached/historic scrape responses can predate schema changes
+// (e.g., the branding pipeline rework). This sanitizer drops `results.*` sections that
+// no longer match the current schema so consumers receive a structurally valid response
+// instead of crashing on missing fields. Returns the dropped section names for logging.
+export function sanitizeScrapeResponse(raw: unknown): {
+ data: unknown;
+ dropped: string[];
+} {
+ const parsed = scrapeResponseSchema.safeParse(raw);
+ if (parsed.success) return { data: parsed.data, dropped: [] }; // fast path: already valid
+ if (!raw || typeof raw !== "object") return { data: raw, dropped: [] }; // nothing to salvage; hand the input back unchanged
+
+ const obj = { ...(raw as Record<string, unknown>) }; // shallow copy so the caller's object is not mutated
+ const rawResults = obj.results;
+ if (!rawResults || typeof rawResults !== "object") return { data: raw, dropped: [] };
+
+ const cleanResults: Record<string, unknown> = {};
+ const dropped: string[] = [];
+ for (const [key, sectionSchema] of Object.entries(scrapeResultSectionSchemas)) {
+ const value = (rawResults as Record<string, unknown>)[key];
+ if (value === undefined) continue; // section absent from the payload
+ const check = sectionSchema.safeParse(value);
+ if (check.success) cleanResults[key] = check.data;
+ else dropped.push(key); // section does not match the current schema
+ }
+ obj.results = cleanResults;
+
+ const reparsed = scrapeResponseSchema.safeParse(obj);
+ return reparsed.success ? { data: reparsed.data, dropped } : { data: obj, dropped }; // if it still fails, return the partially cleaned object as-is
+}
+
+// ─── crawl response schemas ─────────────────────────────────────────────────
+
+export const crawlPageSchema = z.object({ // one page entry within a crawl
+ url: z.string(),
+ status: crawlPageStatusSchema, // "completed" | "failed" | "skipped"
+ depth: z.number().int(),
+ parentUrl: z.string().nullable(), // null is allowed
+ links: z.array(z.string()),
+ scrapeRefId: z.string(),
+ title: z.string(),
+ contentType: z.string(),
+ screenshotUrl: z.string().optional(),
+ reason: z.string().optional(),
+ error: z.string().optional(),
+ scrape: scrapeResponseSchema.optional(), // full scrape response for the page, when present
+});
+
+export const crawlResultSchema = z.object({
+ status: crawlStatusSchema,
+ reason: z.string().optional(),
+ total: z.number().int(),
+ finished: z.number().int(),
+ pages: z.array(crawlPageSchema),
+});
+
+export const crawlResponseSchema = crawlResultSchema.extend({
+ id: z.string(),
+});
+
+export const crawlPagesResponseSchema = cursorPageResponseSchema(crawlPageSchema);
+
+// ─── job payload schemas (internal endpoints) ───────────────────────────────
+
+export const crawlJobPayloadSchema = z.object({
+ crawlId: z.string(),
+ urls: z.array(z.string()),
+ depth: z.number().int(),
+ parentUrl: z.string().nullable(),
+ config: crawlRequestSchema,
+ userId: z.string(),
+ keyId: z.string().nullable(),
+});
+
+export const monitorJobPayloadSchema = z.object({
+ cronId: z.string(),
+ prevId: z.string().nullable(),
+ userId: z.string(),
+ keyId: z.string().nullable(),
+ config: monitorCreateSchema,
+});
+
+// ─── history entry schemas (discriminated union by service) ─────────────────
+
+export const scrapeHistoryEntrySchema = z.object({
+ ...historyBase,
+ service: z.literal("scrape"),
+ params: scrapeRequestSchema,
+ result: scrapeResponseSchema,
+});
+
+export const extractHistoryEntrySchema = z.object({
+ ...historyBase,
+ service: z.literal("extract"),
+ params: extractRequestBaseSchema,
+ result: extractResponseSchema,
+});
+
+export const searchHistoryEntrySchema = z.object({
+ ...historyBase,
+ service: z.literal("search"),
+ params: searchRequestSchema,
+ result: searchResponseSchema,
+});
+
+export const monitorHistoryEntrySchema = z.object({
+ ...historyBase,
+ service: z.literal("monitor"),
+ params: z.object({ cronId: z.string(), url: z.string() }),
+ result: monitorResultSchema,
+});
+
+export const crawlHistoryEntrySchema = z.object({
+ ...historyBase,
+ service: z.literal("crawl"),
+ params: z.object({ url: z.string(), maxPages: z.number().int() }),
+ result: crawlResultSchema,
+});
+
+export const historyEntrySchema = z.discriminatedUnion("service", [ // the variant is chosen by the literal `service` field
+ scrapeHistoryEntrySchema, // service: "scrape"
+ extractHistoryEntrySchema, // service: "extract"
+ searchHistoryEntrySchema, // service: "search"
+ monitorHistoryEntrySchema, // service: "monitor"
+ crawlHistoryEntrySchema, // service: "crawl"
+]);
+
+export const historyPageSchema = pageResponseSchema(historyEntrySchema);
+
+// [NOTE] @Claude runtime history route returns raw DB rows whose JSONB `params`/`result` columns
+// cannot be narrowed at the edge. This loose schema documents the real wire shape (and tolerates
+// a "processing" status for entries still buffered in Redis). SDK consumers that need strong
+// per-service typing should parse against `historyEntrySchema` themselves.
+export const historyRuntimeEntrySchema = z
+ .object({
+ id: z.string(),
+ service: z.string(), // any string, not restricted to the literals used by historyEntrySchema
+ status: z.enum([...historyStatusSchema.options, "processing"]), // the regular statuses plus "processing"
+ error: z.any().optional(),
+ elapsedMs: z.number().nullable().optional(),
+ createdAt: z.string().optional(), // plain string; not checked as an ISO datetime here
+ requestParentId: z.string().nullable().optional(),
+ params: z.any().optional(), // not validated
+ result: z.any().optional(), // not validated
+ })
+ .loose(); // unrecognized keys are kept rather than stripped
+
+export const historyRuntimePageSchema = pageResponseSchema(historyRuntimeEntrySchema);
diff --git a/src/scrapegraphai.ts b/src/scrapegraphai.ts
index b022e98..440ed1f 100644
--- a/src/scrapegraphai.ts
+++ b/src/scrapegraphai.ts
@@ -1,16 +1,18 @@
import { env } from "./env.js";
import type {
ApiResult,
+ CrawlPagesQuery,
+ CrawlPagesResponse,
CrawlRequest,
CrawlResponse,
CreditsResponse,
- ExtractRequest,
+ ExtractRequestBase,
ExtractResponse,
HealthResponse,
HistoryEntry,
HistoryFilter,
HistoryPage,
- MonitorActivityRequest,
+ MonitorActivityQuery,
MonitorActivityResponse,
MonitorCreateRequest,
MonitorResponse,
@@ -122,7 +124,7 @@ export async function scrape(
export async function extract(
apiKey: string,
- params: ExtractRequest,
+ params: ExtractRequestBase,
): Promise> {
try {
const { data, elapsedMs } = await request("POST", "/extract", apiKey, params);
@@ -207,6 +209,24 @@ export const crawl = {
}
},
+ async pages( // fetches one page of a crawl's pages via GET /crawl/{id}/pages
+ apiKey: string,
+ id: string,
+ params?: Partial<CrawlPagesQuery>,
+ ): Promise<ApiResult<CrawlPagesResponse>> {
+ try {
+ const qs = new URLSearchParams();
+ if (params?.cursor !== undefined) qs.set("cursor", String(params.cursor)); // explicit undefined check so cursor 0 is still sent
+ if (params?.limit !== undefined) qs.set("limit", String(params.limit));
+ const query = qs.toString();
+ const path = query ? `/crawl/${id}/pages?${query}` : `/crawl/${id}/pages`; // omit the "?" when no parameters were given
+ const { data, elapsedMs } = await request<CrawlPagesResponse>("GET", path, apiKey);
+ return ok(data, elapsedMs);
+ } catch (err) {
+ return fail(err); // errors are returned as a failed result instead of being thrown
+ }
+ },
+
async stop(apiKey: string, id: string): Promise> {
try {
const { data, elapsedMs } = await request<{ ok: boolean }>(
@@ -336,7 +356,7 @@ export const monitor = {
async activity(
apiKey: string,
id: string,
- params?: MonitorActivityRequest,
+ params?: MonitorActivityQuery,
): Promise> {
try {
const qs = new URLSearchParams();
@@ -366,7 +386,7 @@ export function ScrapeGraphAI(opts?: ScrapeGraphAIInput) {
const key = resolveApiKey(opts);
return {
scrape: (params: ScrapeRequest) => scrape(key, params),
- extract: (params: ExtractRequest) => extract(key, params),
+ extract: (params: ExtractRequestBase) => extract(key, params),
search: (params: SearchRequest) => search(key, params),
credits: () => getCredits(key),
healthy: () => checkHealth(key),
@@ -377,6 +397,7 @@ export function ScrapeGraphAI(opts?: ScrapeGraphAIInput) {
crawl: {
start: (params: CrawlRequest) => crawl.start(key, params),
get: (id: string) => crawl.get(key, id),
+ pages: (id: string, params?: Partial<CrawlPagesQuery>) => crawl.pages(key, id, params), // binds the resolved API key
stop: (id: string) => crawl.stop(key, id),
resume: (id: string) => crawl.resume(key, id),
delete: (id: string) => crawl.delete(key, id),
@@ -389,7 +410,7 @@ export function ScrapeGraphAI(opts?: ScrapeGraphAIInput) {
delete: (id: string) => monitor.delete(key, id),
pause: (id: string) => monitor.pause(key, id),
resume: (id: string) => monitor.resume(key, id),
- activity: (id: string, params?: MonitorActivityRequest) => monitor.activity(key, id, params),
+ activity: (id: string, params?: MonitorActivityQuery) => monitor.activity(key, id, params),
},
};
}
diff --git a/src/types.ts b/src/types.ts
index 5f9111e..d2fd8ea 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1,419 +1,416 @@
-import type { z } from "zod";
+import type { z } from "zod/v4";
import type {
+ brandingButtonComponentSchema,
+ brandingButtonReasoningSchema,
+ brandingColorsSchema,
+ brandingComponentsSchema,
+ brandingConfidenceSchema,
+ brandingDesignSystemSchema,
+ brandingFontEntrySchema,
+ brandingImagesSchema,
+ brandingInputComponentSchema,
+ brandingLogoReasoningSchema,
+ brandingMetadataSchema,
+ brandingPersonalitySchema,
+ brandingSchema,
+ brandingSpacingSchema,
+ brandingTypographySchema,
+ chunkerMetadataSchema,
+ contentPageMetadataSchema,
+ crawlHistoryEntrySchema,
+ crawlJobPayloadSchema,
+ crawlPageSchema,
+ crawlPageStatusSchema,
+ crawlPagesQuerySchema,
+ crawlPagesResponseSchema,
crawlRequestSchema,
- extractRequestSchema,
+ crawlResponseSchema,
+ crawlResultSchema,
+ crawlStatusSchema,
+ creditsResponseSchema,
+ cursorPaginationInfoSchema,
+ extractHistoryEntrySchema,
+ extractRequestBaseSchema,
+ extractResponseSchema,
fetchConfigSchema,
fetchContentTypeSchema,
fetchModeSchema,
- formatConfigSchema,
+ fetchWarningSchema,
+ healthResponseSchema,
+ historyEntrySchema,
historyFilterSchema,
- htmlModeSchema,
- mockConfigSchema,
- monitorActivityRequestSchema,
- monitorCreateRequestSchema,
- monitorUpdateRequestSchema,
+ historyPageSchema,
+ historyStatusSchema,
+ htmlConfigSchema,
+ imageChangeSchema,
+ jobsStatusSchema,
+ jsonChangeSchema,
+ llmConfigSchema,
+ markdownConfigSchema,
+ monitorActivityQuerySchema,
+ monitorActivityResponseSchema,
+ monitorCreateSchema,
+ monitorDiffsSchema,
+ monitorHistoryEntrySchema,
+ monitorJobPayloadSchema,
+ monitorResponseSchema,
+ monitorResultSchema,
+ monitorTickEntrySchema,
+ monitorUpdateSchema,
+ paginationInfoSchema,
+ scrapeBrandingFormatSchema,
+ scrapeCaptureFormatSchema,
+ scrapeContentFormatSchema,
+ scrapeFormatEntrySchema,
+ scrapeFormatErrorSchema,
+ scrapeFormatSchema,
+ scrapeHistoryEntrySchema,
+ scrapeHtmlFormatSchema,
+ scrapeImagesFormatSchema,
+ scrapeJsonFormatSchema,
+ scrapeLinksFormatSchema,
+ scrapeMarkdownFormatSchema,
+ scrapeMetadataSchema,
scrapeRequestSchema,
+ scrapeResponseSchema,
+ scrapeResultMapSchema,
+ scrapeScreenshotDataSchema,
+ scrapeScreenshotFormatSchema,
+ scrapeSummaryConfigSchema,
+ scrapeSummaryFormatSchema,
+ searchHistoryEntrySchema,
+ searchMetadataSchema,
searchRequestSchema,
- timeRangeSchema,
+ searchResponseSchema,
+ searchResultSchema,
+ setChangeSchema,
+ textChangeSchema,
+ tokenUsageSchema,
+ validateResponseSchema,
+ webhookStatusSchema,
} from "./schemas.js";
-export type Service = "scrape" | "extract" | "search" | "monitor" | "crawl";
-export type HtmlMode = z.infer;
-export type FetchMode = z.infer;
-export type TimeRange = z.infer;
-export type CrawlStatus = "running" | "completed" | "failed" | "paused" | "deleted";
-export type CrawlPageStatus = "completed" | "failed" | "skipped";
-export type HistoryStatus = "completed" | "failed" | "running" | "paused" | "deleted";
-export type MonitorTickStatus = "completed" | "failed" | "paused" | "running";
-export type FetchContentType = z.infer;
-
-export type MockConfig = z.input;
-export type FetchConfig = z.input;
-
-export type MarkdownFormatConfig = z.input & { type: "markdown" };
-export type HtmlFormatConfig = z.input & { type: "html" };
-export type ScreenshotFormatConfig = z.input & { type: "screenshot" };
-export type JsonFormatConfig = z.input & { type: "json" };
-export type LinksFormatConfig = z.input & { type: "links" };
-export type ImagesFormatConfig = z.input & { type: "images" };
-export type SummaryFormatConfig = z.input & { type: "summary" };
-export type BrandingFormatConfig = z.input & { type: "branding" };
-export type FormatConfig = z.input;
-
-export type FormatType =
- | "markdown"
- | "html"
- | "links"
- | "images"
- | "summary"
- | "json"
- | "branding"
- | "screenshot";
-
-export type ScrapeRequest = z.input;
-export type ExtractRequest = z.input;
-export type SearchRequest = z.input;
-export type CrawlRequest = z.input;
-export type MonitorCreateRequest = z.input;
-export type MonitorUpdateRequest = z.input;
-export type MonitorActivityRequest = z.input;
-export type HistoryFilter = z.input;
-
-export interface TokenUsage {
- promptTokens: number;
- completionTokens: number;
-}
-
-export interface ChunkerMetadata {
- chunks: { size: number }[];
-}
-
-export type FetchWarningReason = "too_short" | "empty" | "bot_blocked" | "spa_shell" | "soft_404";
-
-export interface FetchWarning {
- reason: FetchWarningReason;
- provider?: string;
-}
-
-export interface ContentPageMetadata {
- index: number;
- images: Array<{
- id: string;
- topLeftX: number;
- topLeftY: number;
- bottomRightX: number;
- bottomRightY: number;
- }>;
- tables: Array<{ id: string; content: string; format: string }>;
- hyperlinks: string[];
- dimensions: { dpi: number; height: number; width: number };
-}
+// ─── generic / config ────────────────────────────────────────────────────────
-export interface ScrapeMetadata {
- provider?: string;
- contentType: string;
- elapsedMs?: number;
- warnings?: FetchWarning[];
- ocr?: {
- model: string;
- pagesProcessed: number;
- pages: ContentPageMetadata[];
- };
-}
+export type { ModelName } from "./models.js";
+export type UserRole = "user" | "admin";
-export interface BrandingColors {
- primary: string;
- accent: string;
- background: string;
- textPrimary: string;
- link: string;
-}
-
-export interface BrandingFontEntry {
- family: string;
- fallback: string;
-}
+export type ErrorType =
+ | "auth_missing_key"
+ | "internal"
+ | "monitor_tick_failed"
+ | "not_found"
+ | "upstream_failed"
+ | "validation";
-export interface BrandingTypography {
- primary: BrandingFontEntry;
- heading: BrandingFontEntry;
- mono: BrandingFontEntry;
- sizes: { h1: string; h2: string; body: string };
+export interface Error {
+ type: ErrorType;
+ message: string;
+ details?: unknown;
}
-export interface BrandingImages {
- logo: string;
- favicon: string;
- ogImage: string;
+export interface ErrorResponse {
+ error: Error;
}
-export interface BrandingPersonality {
- tone: string;
- energy: "high" | "medium" | "low";
- targetAudience: string;
-}
+export type RateLimitKind = "work" | "poll";
-export interface Branding {
- colorScheme: "light" | "dark";
- colors: BrandingColors;
- typography: BrandingTypography;
- images: BrandingImages;
- spacing: { baseUnit: number; borderRadius: string };
- frameworkHints: string[];
- personality: BrandingPersonality;
- confidence: number;
+export interface RateLimitConfig {
+ work: number;
+ poll: number;
}
-export interface BrandingMetadata {
- title: string;
- description: string;
- favicon: string;
- language: string;
- themeColor: string;
- ogTitle: string;
- ogDescription: string;
- ogImage: string;
- ogUrl: string;
+export interface ServiceConfig {
+ rateLimit: RateLimitConfig;
+ maxJobs?: number;
}
-export interface ScreenshotData {
- url: string;
- width: number;
- height: number;
-}
+export type ServicesConfig = Record; // NOTE(review): Record's type arguments appear to be missing here — restore them (value type presumably ServiceConfig; key type not visible) before applying
-export interface FormatError {
- code: string;
- error: string;
-}
+export type TokenUsage = z.infer<typeof tokenUsageSchema>; // types in this group are derived from their zod schemas
+export type ChunkerMetadata = z.infer<typeof chunkerMetadataSchema>;
+export type ValidateResponse = z.infer<typeof validateResponseSchema>;
+export type HealthResponse = z.infer<typeof healthResponseSchema>;
-export interface FormatResponseMap {
- markdown: string[];
- html: string[];
- links: string[];
- images: string[];
- summary: string;
- json: Record;
- branding: Branding;
- screenshot: ScreenshotData;
-}
+// ─── scrape ──────────────────────────────────────────────────────────────────
+export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>; // z.infer yields the parsed (output) shape of the schema
+export type FetchConfig = z.infer<typeof fetchConfigSchema>;
+export type FetchMode = z.infer<typeof fetchModeSchema>;
+export type FetchContentType = z.infer<typeof fetchContentTypeSchema>;
export type ImageContentType = Extract;
-export interface FormatMetadataMap {
- markdown: Record;
- html: Record;
- links: { count: number };
- images: { count: number };
- summary: { chunker?: ChunkerMetadata };
- json: { chunker: ChunkerMetadata; raw?: string | null };
- branding: { branding: BrandingMetadata };
- screenshot: { contentType: ImageContentType; provider?: string };
-}
-
-export type ScrapeResultMap = Partial<{
- [K in FormatType]: {
- data: FormatResponseMap[K];
- metadata?: FormatMetadataMap[K];
- };
-}>;
-
-export interface ScrapeResponse {
- results: ScrapeResultMap;
- metadata: ScrapeMetadata;
- errors?: Partial<{ [K in FormatType]: FormatError }>;
-}
-
-export interface ExtractResponse {
- raw: string | null;
- json: Record | null;
- usage: TokenUsage;
- metadata: {
- chunker: ChunkerMetadata;
- fetch?: { provider?: string };
- };
-}
-
-export interface SearchResult {
- url: string;
- title: string;
- content: string;
- provider?: string;
-}
-
-export interface SearchMetadata {
- search: { provider?: string };
- pages: { requested: number; scraped: number };
- chunker?: ChunkerMetadata;
-}
-
-export interface SearchResponse {
- results: SearchResult[];
- json?: Record | null;
- raw?: string | null;
- usage?: TokenUsage;
- metadata: SearchMetadata;
-}
-
-export interface CrawlPage {
- url: string;
- status: CrawlPageStatus;
- depth: number;
- parentUrl: string | null;
- links: string[];
- scrapeRefId: string;
- title: string;
- contentType: string;
- screenshotUrl?: string;
- reason?: string;
- error?: string;
-}
-
-export interface CrawlResult {
- status: CrawlStatus;
- reason?: string;
- total: number;
- finished: number;
- pages: CrawlPage[];
-}
-
-export interface CrawlResponse extends CrawlResult {
- id: string;
-}
-
-export interface TextChange {
- type: "added" | "removed";
- line: number;
- content: string;
-}
-
-export interface JsonChange {
- path: string;
- old: unknown;
- new: unknown;
-}
-
-export interface SetChange {
- added: string[];
- removed: string[];
-}
-
-export interface ImageChange {
- size: number;
- changed: number;
- mask?: string;
-}
-
-export interface MonitorDiffs {
- markdown?: TextChange[];
- html?: TextChange[];
- json?: JsonChange[];
- screenshot?: ImageChange;
- links?: SetChange;
- images?: SetChange;
- summary?: TextChange[];
- branding?: JsonChange[];
-}
-
-export type MonitorRefs = Partial>;
-
-export interface WebhookStatus {
- sentAt: string;
- statusCode: number | null;
- error?: string;
-}
-
-export interface MonitorResult {
- changed: boolean;
- diffs: MonitorDiffs;
- refs: MonitorRefs;
- webhookStatus?: WebhookStatus;
-}
-
-export interface MonitorResponse {
- cronId: string;
- scheduleId: string;
- interval: string;
- status: "active" | "paused";
- config: MonitorCreateRequest;
- createdAt: string;
- updatedAt: string;
-}
+export type ContentPageMetadata = z.infer;
+export type FetchWarning = z.infer;
+export type ScrapeMetadata = z.infer;
+
+export type ScrapeContentFormat = z.infer;
+export type ScrapeCaptureFormat = z.infer;
+export type ScrapeFormat = z.infer;
+export type ScrapeMarkdownConfig = z.infer;
+export type ScrapeHtmlConfig = z.infer;
+export type ScrapeSummaryConfig = z.infer;
+export type ScrapeFormatEntry = z.infer;
+export type ScrapeMarkdownFormatEntry = z.infer;
+export type ScrapeHtmlFormatEntry = z.infer;
+export type ScrapeLinksFormatEntry = z.infer;
+export type ScrapeImagesFormatEntry = z.infer;
+export type ScrapeSummaryFormatEntry = z.infer;
+export type ScrapeJsonFormatEntry = z.infer;
+export type ScrapeBrandingFormatEntry = z.infer;
+export type ScrapeScreenshotFormatEntry = z.infer;
+export type ScrapeContentFormatEntry = Extract;
+
+export type BrandingColors = z.infer;
+export type BrandingFontEntry = z.infer;
+export type BrandingTypography = z.infer;
+export type BrandingSpacing = z.infer;
+export type BrandingInputComponent = z.infer;
+export type BrandingButtonComponent = z.infer;
+export type BrandingComponents = z.infer;
+export type BrandingImages = z.infer;
+export type BrandingPersonality = z.infer;
+export type BrandingDesignSystem = z.infer;
+export type BrandingButtonReasoning = z.infer;
+export type BrandingLogoReasoning = z.infer;
+export type BrandingConfidence = z.infer;
+export type Branding = z.infer;
+export type BrandingMetadata = z.infer;
+export type ScrapeFormatError = z.infer;
+export type ScrapeScreenshotData = z.infer;
+export type ScrapeResultMap = z.infer;
+
+export type ScrapeFormatResponseMap = {
+ [K in keyof Required]: NonNullable["data"];
+};
+
+export type ScrapeFormatMetadataMap = {
+ [K in keyof Required]: NonNullable["metadata"]>;
+};
+
+export type ScrapeResponse = z.infer;
+
+/**
+ * Union of `scrape.*` event shapes, discriminated by the `type` string literal.
+ *
+ * - fetch events carry the `url`; the completed variant adds `elapsedMs`
+ * - per-format process events carry the `format`; the failed variant adds `error` and `code`
+ * - `scrape.result` carries the whole `ScrapeResponse` in `data`
+ * - `scrape.failed` carries `error` and `code`; `scrape.completed` has no payload
+ *
+ * NOTE(review): `elapsedMs` is presumably milliseconds (per the name) — confirm against the emitter.
+ */
+export type ScrapeEvent =
+ | { type: "scrape.fetch.started"; url: string }
+ | { type: "scrape.fetch.completed"; url: string; elapsedMs: number }
+ | { type: "scrape.process.started"; format: ScrapeFormat }
+ | { type: "scrape.process.completed"; format: ScrapeFormat; elapsedMs: number }
+ | { type: "scrape.process.failed"; format: ScrapeFormat; error: string; code: string }
+ | { type: "scrape.result"; data: ScrapeResponse }
+ | { type: "scrape.failed"; error: string; code: string }
+ | { type: "scrape.completed" };
+
+// ─── extract ─────────────────────────────────────────────────────────────────
+
+export type ExtractRequestBase = z.infer;
+export type LlmConfig = z.infer;
+
+export type ExtractResponse = z.infer;
+
+/**
+ * Union of `extract.*` event shapes, discriminated by the `type` string literal.
+ *
+ * Fetch events carry the `url`; both `*.completed` step events carry `elapsedMs`;
+ * `extract.failed` carries only an `error` string (no `code`, unlike `scrape.failed`).
+ */
+export type ExtractEvent =
+ | { type: "extract.fetch.started"; url: string }
+ | { type: "extract.fetch.completed"; url: string; elapsedMs: number }
+ | { type: "extract.extraction.started" }
+ | { type: "extract.extraction.completed"; elapsedMs: number }
+ | { type: "extract.failed"; error: string }
+ | { type: "extract.completed" };
+
+// ─── search ──────────────────────────────────────────────────────────────────
+
+export type SearchRequest = z.infer;
+
+export type SearchResult = z.infer;
+export type SearchMetadata = z.infer;
+export type SearchResponse = z.infer;
+
+/**
+ * Union of `search.*` event shapes, discriminated by the `type` string literal.
+ *
+ * - `search.query.completed` carries the `query`, `prompt`, result `urls` and `totalResults`
+ * - per-URL scrape events carry `url` and `requestId`; the completed variant adds untyped
+ *   `data`, the failed variant adds `error`
+ * - `search.scrape.done` carries the `total` and `scraped` counts
+ * - `search.failed` carries an `error` string; the remaining variants have no payload
+ */
+export type SearchEvent =
+ | { type: "search.query.started" }
+ | {
+ type: "search.query.completed";
+ query: string;
+ prompt: string;
+ urls: string[];
+ totalResults: number;
+ }
+ | { type: "search.scrape.started"; url: string; requestId: string }
+ | { type: "search.scrape.completed"; url: string; requestId: string; data: unknown }
+ | { type: "search.scrape.failed"; url: string; requestId: string; error: string }
+ | { type: "search.scrape.done"; total: number; scraped: number }
+ | { type: "search.merge.started" }
+ | { type: "search.failed"; error: string }
+ | { type: "search.completed" };
+
+// ─── monitor ─────────────────────────────────────────────────────────────────
+
+export type MonitorCreateRequest = z.infer;
+export type MonitorUpdateRequest = z.infer;
+export type MonitorActivityQuery = z.input;
+
+export type TextChange = z.infer;
+export type JsonChange = z.infer;
+export type SetChange = z.infer;
+export type ImageChange = z.infer;
+export type MonitorDiffs = z.infer;
+export type MonitorRefs = Partial>;
+export type WebhookStatus = z.infer;
+export type MonitorResult = z.infer;
+
+/**
+ * Totals the individual changes recorded in a monitor diff object.
+ *
+ * List-shaped diffs (`markdown`, `html`, `json`, `summary`, `branding`) contribute their
+ * length, set-shaped diffs (`links`, `images`) contribute `added` plus `removed`, and a
+ * screenshot with a truthy `changed` value contributes exactly one.
+ *
+ * @param diffs Diff object; may be omitted, and any of its keys may be absent.
+ * @returns The summed change count, `0` when nothing is present.
+ */
+export function countMonitorDiffs(diffs?: Partial): number {
+  if (!diffs) return 0;
+
+  const listTotals = [diffs.markdown, diffs.html, diffs.json, diffs.summary, diffs.branding].map(
+    (entries) => (entries ? entries.length : 0),
+  );
+  const setTotals = [diffs.links, diffs.images].map((set) =>
+    set ? set.added.length + set.removed.length : 0,
+  );
+  // A changed screenshot counts once, regardless of how large `changed` is.
+  const screenshotTotal = diffs.screenshot?.changed ? 1 : 0;
+
+  return [...listTotals, ...setTotals].reduce((sum, n) => sum + n, screenshotTotal);
+}
+
+export type MonitorJobPayload = z.infer;
+
+export type MonitorResponse = z.infer;
+export type MonitorTickEntry = z.infer;
+export type MonitorTickStatus = MonitorTickEntry["status"];
+export type MonitorActivityResponse = z.infer;
+
+/**
+ * Union of webhook body shapes, discriminated by `type`; the fields sit under `data`.
+ *
+ * - `monitor.change.detected`: identifies the monitor (`cronId`, `url`), and carries the
+ *   `current` result map, the `previous` one (nullable) and the computed `diffs`
+ * - `monitor.test`: carries only `cronId`, `url` and `sentAt`
+ *
+ * NOTE(review): `changedAt` / `sentAt` are plain strings; the timestamp format is not visible here.
+ */
+export type WebhookPayload =
+ | {
+ type: "monitor.change.detected";
+ data: {
+ cronId: string;
+ url: string;
+ changedAt: string;
+ changed: boolean;
+ current: ScrapeResultMap;
+ previous: ScrapeResultMap | null;
+ diffs: MonitorDiffs;
+ };
+ }
+ | {
+ type: "monitor.test";
+ data: {
+ cronId: string;
+ url: string;
+ sentAt: string;
+ };
+ };
+
+/**
+ * Union of `monitor.*` event shapes, discriminated by the `type` string literal.
+ * Every variant carries the `cronId`.
+ *
+ * Note that `"monitor.change.detected"` is also a `WebhookPayload` discriminant, but the
+ * shape differs: here the fields are top-level, there they are nested under `data`.
+ */
+export type MonitorEvent =
+ | { type: "monitor.tick.started"; cronId: string; url: string }
+ | { type: "monitor.tick.completed"; cronId: string; changed: boolean }
+ | {
+ type: "monitor.change.detected";
+ cronId: string;
+ url: string;
+ diffs: MonitorDiffs;
+ }
+ | { type: "monitor.tick.failed"; cronId: string; url: string; error: string }
+ | { type: "monitor.paused"; cronId: string; reason: string }
+ | { type: "monitor.webhook.completed"; cronId: string; statusCode: number }
+ | { type: "monitor.webhook.failed"; cronId: string; error: string };
+
+// ─── crawl ───────────────────────────────────────────────────────────────────
+
+export type CrawlRequest = z.infer;
+export type CrawlStatus = z.infer;
+export type CrawlPageStatus = z.infer;
+
+export type CrawlPage = z.infer;
+export type CrawlPagesQuery = z.infer;
+export type CrawlPagesResponse = z.infer;
+export type CrawlResponse = z.infer;
+
+export type CrawlJobPayload = z.infer;
+
+/**
+ * Union of `crawl.*` event shapes, discriminated by the `type` string literal.
+ * Every variant carries the `crawlId`.
+ *
+ * Per-page events carry the full `CrawlPage`; the skipped variant adds a `reason`, the
+ * failed variant an `error`. `crawl.progress` carries the `total` and `finished` counts.
+ */
+export type CrawlEvent =
+ | { type: "crawl.started"; crawlId: string; url: string }
+ | { type: "crawl.page.completed"; crawlId: string; page: CrawlPage }
+ | { type: "crawl.page.skipped"; crawlId: string; page: CrawlPage; reason: string }
+ | {
+ type: "crawl.page.failed";
+ crawlId: string;
+ page: CrawlPage;
+ error: string;
+ }
+ | { type: "crawl.progress"; crawlId: string; total: number; finished: number }
+ | { type: "crawl.paused"; crawlId: string; reason: string }
+ | { type: "crawl.resumed"; crawlId: string }
+ | { type: "crawl.completed"; crawlId: string };
+
+/**
+ * Every event shape of the five services, as one union.
+ *
+ * NOTE(review): inside this module the name shadows the global DOM `Event` type.
+ */
+export type Event = ScrapeEvent | ExtractEvent | SearchEvent | MonitorEvent | CrawlEvent;
+
+/** Union of all `type` string literals that appear in `Event`. */
+export type EventType = Event["type"];
+
+export type EventData = Extract;
+
+// ─── history ─────────────────────────────────────────────────────────────────
+
+export type HistoryFilter = z.infer;
+export type HistoryService = "scrape" | "extract" | "search" | "monitor" | "crawl";
+export type HistoryStatus = z.infer;
+
+export type ScrapeHistoryEntry = z.infer;
+export type ExtractHistoryEntry = z.infer;
+export type SearchHistoryEntry = z.infer;
+export type MonitorHistoryEntry = z.infer;
+export type CrawlHistoryEntry = z.infer;
+export type CrawlResult = z.infer;
+export type HistoryEntry = z.infer;
-export interface MonitorTickEntry {
- id: string;
- status: MonitorTickStatus;
- createdAt: string;
- elapsedMs: number;
- changed: boolean;
- diffs: MonitorDiffs;
- error?: string;
-}
-
-export interface MonitorActivityResponse {
- ticks: MonitorTickEntry[];
- nextCursor: string | null;
-}
-
-interface HistoryBase {
- id: string;
- status: HistoryStatus;
- error: unknown;
- elapsedMs: number;
- createdAt: string;
- requestParentId: string | null;
-}
-
-export interface ScrapeHistoryEntry extends HistoryBase {
- service: "scrape";
- params: ScrapeRequest;
- result: ScrapeResponse;
-}
-
-export interface ExtractHistoryEntry extends HistoryBase {
- service: "extract";
- params: ExtractRequest;
- result: ExtractResponse;
-}
-
-export interface SearchHistoryEntry extends HistoryBase {
- service: "search";
- params: SearchRequest;
- result: SearchResponse;
+export interface PageResponse {
+ data: T[];
+ pagination: z.infer;
}
-export interface MonitorHistoryEntry extends HistoryBase {
- service: "monitor";
- params: { cronId: string; url: string };
- result: MonitorResult;
+export interface CursorPageResponse {
+ data: T[];
+ pagination: z.infer;
}
-export interface CrawlHistoryEntry extends HistoryBase {
- service: "crawl";
- params: { url: string; maxPages: number };
- result: CrawlResult;
-}
+export type HistoryPage = z.infer;
-export type HistoryEntry =
- | ScrapeHistoryEntry
- | ExtractHistoryEntry
- | SearchHistoryEntry
- | MonitorHistoryEntry
- | CrawlHistoryEntry;
-
-export interface HistoryPagination {
- page: number;
- limit: number;
- total: number;
-}
+// ─── credits ─────────────────────────────────────────────────────────────────
-export interface PageResponse {
- data: T[];
- pagination: HistoryPagination;
-}
+export type JobsStatus = z.infer;
+export type CreditsResponse = z.infer;
-export type HistoryPage = PageResponse;
+// ─── credit ledger ──────────────────────────────────────────────────────────
-export interface JobsStatus {
- used: number;
- limit: number;
+// [NOTE] @Claude single-letter keys to minimise Redis memory per entry — flushed to DB with full names
+/**
+ * Compact credit-ledger record: `i`, `k`, `s` are strings, `a` and `t` are numbers,
+ * `r` and `ak` are optional strings.
+ *
+ * NOTE(review): the mapping from each single-letter key to its full DB column name is
+ * not visible in this file — spell it out here or link to the flush code.
+ */
+export interface CreditLedgerEntry {
+ i: string;
+ k: string;
+ a: number;
+ s: string;
+ t: number;
+ r?: string;
+ ak?: string;
}
-export interface CreditsJobs {
- crawl: JobsStatus;
- monitor: JobsStatus;
+/**
+ * Compact record with two string keys (`i`, `k`) and one numeric key (`t`).
+ *
+ * NOTE(review): the keys presumably mean the same as the identically named keys of
+ * `CreditLedgerEntry` — confirm and document the full names.
+ */
+export interface TopUpInProcessEntry {
+ i: string;
+ k: string;
+ t: number;
}
+// ─── legacy migration ───────────────────────────────────────────────────────
-export interface CreditsResponse {
- remaining: number;
- used: number;
- plan: string;
- jobs: CreditsJobs;
+/**
+ * Onboarding answers carried over for a legacy user. All fields are free-form strings;
+ * `company` is the only nullable one.
+ */
+export interface LegacyOnboarding {
+ jobRole: string;
+ company: string | null;
+ companySize: string;
+ primaryUseCase: string;
+ source: string;
}
-export interface HealthResponse {
- status: "ok" | "degraded";
- uptime: number;
+/**
+ * Snapshot of a legacy user: the old user id, Stripe customer/subscription ids (each may
+ * be `null`), the plan id, the remaining credit balance, and the onboarding answers.
+ */
+export interface LegacyUserData {
+ oldUserId: string;
+ stripeCustomerId: string | null;
+ stripeSubscriptionId: string | null;
+ planId: string;
+ remainingCredits: number;
+ onboarding: LegacyOnboarding;
}
export interface ApiResult {
diff --git a/src/url.ts b/src/url.ts
new file mode 100644
index 0000000..e3cf816
--- /dev/null
+++ b/src/url.ts
@@ -0,0 +1,46 @@
+// Matches a complete dotted-quad IPv4 literal: exactly four decimal octets in 0–255,
+// no leading zeros on multi-digit octets, and nothing before or after the address.
+const IPV4_RE = /^(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)$/;
+// Matches a complete, unbracketed IPv6 literal. Each `|` alternative is anchored on its own
+// and covers one textual shape: all eight groups, bare `::`, a trailing `::`, a leading `::`,
+// a single `::` between groups, and `::` / `::ffff:` followed by a dotted-quad IPv4 tail.
+// NOTE(review): zone ids (`%eth0`) and `[...]`-wrapped input do not match; coverage of every
+// legal compressed form has not been verified here.
+const IPV6_RE =
+ /^(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$|^::$|^(?:[0-9a-fA-F]{1,4}:){1,7}:$|^:(?::[0-9a-fA-F]{1,4}){1,7}$|^(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}$|^(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}$|^(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}$|^(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}$|^(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}$|^[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}$|^::(?:ffff:)?(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)$/;
+
+/** Tells whether `s`, taken as a whole, is accepted by `IPV4_RE`. */
+function isIPv4(s: string): boolean {
+  const hit = IPV4_RE.exec(s);
+  return hit !== null;
+}
+
+/** Tells whether `s`, taken as a whole, is accepted by `IPV6_RE` (brackets are not stripped here). */
+function isIPv6(s: string): boolean {
+  const hit = IPV6_RE.exec(s);
+  return hit !== null;
+}
+
+// Hostnames that `isInternal` treats as internal without looking at any address:
+// exactly `localhost`, or any name ending in `.local`, `.internal` or `.localhost`
+// (all case-insensitive).
+const PRIVATE_HOSTNAME_PATTERNS = [/^localhost$/i, /\.local$/i, /\.internal$/i, /\.localhost$/i];
+
+/**
+ * Reports whether a dotted-quad IPv4 string lies in a range that is not publicly routable.
+ *
+ * Ranges treated as private: `0.0.0.0/8`, `10.0.0.0/8`, `100.64.0.0/10` (shared address
+ * space, RFC 6598), `127.0.0.0/8`, `169.254.0.0/16`, `172.16.0.0/12`, `192.168.0.0/16`.
+ *
+ * @param ip Candidate address text.
+ * @returns `true` for an address in one of the ranges above; `false` otherwise, including
+ *   when `ip` does not split into four numeric parts within 0–255.
+ */
+function isPrivateIPv4(ip: string): boolean {
+  const octets = ip.split(".").map(Number);
+  if (octets.length !== 4 || octets.some((o) => Number.isNaN(o) || o < 0 || o > 255)) return false;
+  const [a, b] = octets;
+  // Whole /8 blocks: "this network", RFC 1918 class A, loopback.
+  if (a === 0 || a === 10 || a === 127) return true;
+  // Carrier-grade NAT / shared address space; cloud providers also place internal
+  // services here, so it must not be reachable through a public-URL check.
+  if (a === 100 && b >= 64 && b <= 127) return true;
+  if (a === 169 && b === 254) return true;
+  if (a === 172 && b >= 16 && b <= 31) return true;
+  if (a === 192 && b === 168) return true;
+  return false;
+}
+
+/**
+ * Reports whether an IPv6 literal is loopback, unspecified, link-local, unique-local, or
+ * an IPv4-mapped address whose embedded IPv4 address is private.
+ *
+ * @param ip IPv6 text, bare or wrapped in `[...]`; compared case-insensitively.
+ * @returns `true` for `::1`, `::`, the `fe80::/10` block, anything starting with `fc`/`fd`,
+ *   and `::ffff:<ipv4>` (dotted or hex form) when `isPrivateIPv4` accepts the IPv4 part;
+ *   `false` otherwise.
+ */
+function isPrivateIPv6(ip: string): boolean {
+  const normalized = ip.replace(/^\[|]$/g, "").toLowerCase();
+  if (normalized === "::1" || normalized === "::") return true;
+  // Link-local is fe80::/10, i.e. a first group of fe80 through febf — not only "fe80:".
+  if (/^fe[89ab][0-9a-f]:/.test(normalized)) return true;
+  if (normalized.startsWith("fc") || normalized.startsWith("fd")) return true;
+  if (normalized.startsWith("::ffff:")) {
+    const tail = normalized.slice(7);
+    if (isIPv4(tail)) return isPrivateIPv4(tail);
+    // WHATWG URL serializes an IPv4-mapped host as two hex groups
+    // ("[::ffff:127.0.0.1]" becomes "[::ffff:7f00:1]"), so decode that form as well;
+    // otherwise a mapped loopback/private address passes as public.
+    const groups = /^([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(tail);
+    if (groups) {
+      const [, highHex = "", lowHex = ""] = groups;
+      const high = Number.parseInt(highHex, 16);
+      const low = Number.parseInt(lowHex, 16);
+      return isPrivateIPv4(`${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`);
+    }
+  }
+  return false;
+}
+
+/**
+ * Decides from the text of `hostname` alone whether it points at an internal destination.
+ * No DNS lookup is made: a name that is neither an IP literal nor matched by
+ * `PRIVATE_HOSTNAME_PATTERNS` yields `false`.
+ *
+ * @param hostname Host part of a URL; IPv6 literals may be bare or wrapped in `[...]`.
+ * @returns `true` when the name matches an internal hostname pattern or is a private
+ *   IPv4/IPv6 literal.
+ */
+export function isInternal(hostname: string): boolean {
+  // "localhost." is the fully-qualified spelling of "localhost"; drop one trailing dot so
+  // it cannot slip past the `$`-anchored suffix patterns.
+  const host = hostname.endsWith(".") ? hostname.slice(0, -1) : hostname;
+  if (PRIVATE_HOSTNAME_PATTERNS.some((r) => r.test(host))) return true;
+  if (isIPv4(host)) return isPrivateIPv4(host);
+  // A leading "[" marks a bracketed IPv6 literal, which IPV6_RE itself does not accept.
+  if (isIPv6(host) || host.startsWith("[")) return isPrivateIPv6(host);
+  return false;
+}
diff --git a/tests/scrapegraphai.test.ts b/tests/scrapegraphai.test.ts
index b8496bd..edfe47a 100644
--- a/tests/scrapegraphai.test.ts
+++ b/tests/scrapegraphai.test.ts
@@ -906,6 +906,35 @@ describe("crawl", () => {
expectRequest(0, "GET", "/crawl/crawl-123");
});
+ // Checks that `sdk.crawl.pages` issues a GET to `/crawl/<id>/pages` with `cursor` and
+ // `limit` as query parameters and hands the response body back unchanged in `res.data`.
+ test("pages success", async () => {
+ // Canned response: one crawled page with an inline `scrape` result, plus pagination.
+ const body = {
+ data: [
+ {
+ url: "https://example.com",
+ status: "completed",
+ depth: 0,
+ parentUrl: null,
+ links: [],
+ scrapeRefId: "scrape-123",
+ title: "Example",
+ contentType: "text/html",
+ scrape: {
+ results: { markdown: { data: ["# Example"] } },
+ metadata: { contentType: "text/html" },
+ },
+ },
+ ],
+ pagination: { limit: 50, nextCursor: null },
+ };
+ // Stub global fetch for exactly one call; `json` is a helper defined elsewhere in this
+ // test file (presumably wraps the payload in a JSON response — confirm).
+ fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json(body));
+
+ const res = await sdk.crawl.pages(API_KEY, "crawl-123", { cursor: 0, limit: 50 });
+
+ expect(res.status).toBe("success");
+ expect(res.data).toEqual(body);
+ // `expectRequest` is defined elsewhere; here it is given call index 0, the HTTP method
+ // and the expected path including the query string.
+ expectRequest(0, "GET", "/crawl/crawl-123/pages?cursor=0&limit=50");
+ });
+
test("stop success", async () => {
fetchSpy = spyOn(globalThis, "fetch").mockResolvedValueOnce(json({ ok: true }));