From 9d4eba1f154953e401982da7eff85686293b9a48 Mon Sep 17 00:00:00 2001 From: MrAliHasan Date: Sat, 21 Feb 2026 03:17:15 +0500 Subject: [PATCH 1/7] feat: add OpenAI Batch API support for SmartScraperMultiGraph (#1036) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SmartScraperMultiBatchGraph that uses the OpenAI Batch API for LLM calls, providing ~50% cost savings when real-time results aren't needed. Key features: - SmartScraperMultiBatchGraph: 3-phase pipeline (fetch/parse → batch submit → merge) that separates HTML fetching from LLM generation - BatchGenerateAnswerNode: collects prompts from all URLs and submits them as a single OpenAI Batch API request - utils/batch_api.py: helpers for creating, polling, and retrieving batch results with doc_id → URL mapping - Per-document error handling: partial failures don't break the batch - Configurable polling interval and max wait time - OpenAI-only validation (rejects non-OpenAI providers gracefully) - Results sorted by custom_id for consistent ordering - 18 unit tests with 100% pass rate Usage: graph = SmartScraperMultiBatchGraph( prompt='Extract key points', source=['https://url1.com', 'https://url2.com'], config={'llm': {'model': 'openai/gpt-4o-mini'}} ) result = graph.run() Closes #1036 --- scrapegraphai/graphs/__init__.py | 2 + .../graphs/smart_scraper_multi_batch_graph.py | 216 ++++++++++ scrapegraphai/nodes/__init__.py | 2 + .../nodes/batch_generate_answer_node.py | 253 +++++++++++ scrapegraphai/utils/batch_api.py | 316 ++++++++++++++ tests/test_batch_api.py | 403 ++++++++++++++++++ 6 files changed, 1192 insertions(+) create mode 100644 scrapegraphai/graphs/smart_scraper_multi_batch_graph.py create mode 100644 scrapegraphai/nodes/batch_generate_answer_node.py create mode 100644 scrapegraphai/utils/batch_api.py create mode 100644 tests/test_batch_api.py diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py index 527c6e20..b18d719c 100644 --- 
"""
SmartScraperMultiBatchGraph Module

A scraping pipeline that uses the OpenAI Batch API for LLM calls,
providing ~50% cost savings compared to real-time API calls.
"""

from copy import deepcopy
from typing import List, Optional, Type

from pydantic import BaseModel

from ..nodes import FetchNode, GraphIteratorNode, ParseNode
from ..nodes.batch_generate_answer_node import BatchGenerateAnswerNode
from ..nodes.merge_answers_node import MergeAnswersNode
from ..utils.copy import safe_deepcopy
from .abstract_graph import AbstractGraph
from .base_graph import BaseGraph


class _FetchParseOnlyGraph(AbstractGraph):
    """Internal graph that only fetches and parses a source (no LLM generation).

    This is used to separate the fetch/parse phase from the LLM generation
    phase, allowing all LLM calls to be batched together.
    """

    def __init__(
        self,
        prompt: str,
        source: str,
        config: dict,
        schema: Optional[Type[BaseModel]] = None,
    ):
        super().__init__(prompt, config, source, schema)
        # HTTP(S) sources are fetched remotely; anything else is treated as
        # a local directory/file path.
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """Build the two-node fetch -> parse pipeline (no LLM node).

        Returns:
            BaseGraph: fetch_node -> parse_node, producing 'parsed_doc'.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
            node_config={
                "llm_model": self.llm_model,
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
                "browser_base": self.config.get("browser_base"),
                "scrape_do": self.config.get("scrape_do"),
                "storage_state": self.config.get("storage_state"),
            },
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "llm_model": self.llm_model,
                "chunk_size": self.model_token,
            },
        )

        return BaseGraph(
            nodes=[fetch_node, parse_node],
            edges=[(fetch_node, parse_node)],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__,
        )

    def run(self) -> str:
        """Fetch and parse the configured source.

        Returns:
            str: The parsed document content, or "" if parsing produced
            nothing. NOTE(review): ParseNode may emit a list of chunks
            rather than a plain string — downstream consumers handle both.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
        return self.final_state.get("parsed_doc", "")


class SmartScraperMultiBatchGraph(AbstractGraph):
    """A scraping pipeline that uses the OpenAI Batch API for cost savings.

    Similar to SmartScraperMultiGraph, but instead of making individual
    LLM calls per URL, it:
    1. Fetches and parses all URLs concurrently
    2. Collects all prompts and submits them as a single OpenAI Batch
    3. Polls for batch completion
    4. Merges all results into a final answer

    This provides ~50% cost savings on OpenAI API calls at the expense
    of higher latency (up to 24 hours for batch completion).

    Attributes:
        prompt (str): The user prompt for scraping.
        source (List[str]): List of URLs to scrape.
        config (dict): Configuration including 'llm' and optional
            'batch_api' settings.
        schema (Optional[BaseModel]): Optional Pydantic schema for
            structured output.

    Config options under 'batch_api':
        poll_interval (int): Seconds between batch status checks (default: 30).
        max_wait_time (int): Maximum wait time in seconds (default: 86400 = 24h).
        model (str): Override model for batch requests (optional).
        temperature (float): Temperature for batch requests (default: 0.0).

    Example:
        >>> graph = SmartScraperMultiBatchGraph(
        ...     prompt="Extract the main topic and key points",
        ...     source=[
        ...         "https://example.com/page1",
        ...         "https://example.com/page2",
        ...     ],
        ...     config={
        ...         "llm": {"model": "openai/gpt-4o-mini"},
        ...         "batch_api": {
        ...             "poll_interval": 30,
        ...             "max_wait_time": 3600,
        ...         },
        ...     }
        ... )
        >>> result = graph.run()
    """

    def __init__(
        self,
        prompt: str,
        source: List[str],
        config: dict,
        schema: Optional[Type[BaseModel]] = None,
    ):
        self.copy_config = safe_deepcopy(config)
        self.copy_schema = deepcopy(schema)
        self.batch_config = config.get("batch_api", {})

        # Validate that the model is OpenAI-based. Models given without a
        # provider prefix (no "/") yield provider == "" and are accepted —
        # i.e. a bare model name is assumed to be an OpenAI model.
        model_str = config.get("llm", {}).get("model", "")
        provider = model_str.split("/")[0] if "/" in model_str else ""
        if provider and provider != "openai":
            raise ValueError(
                f"SmartScraperMultiBatchGraph only supports OpenAI models. "
                f"Got provider '{provider}'. "
                f"Use SmartScraperMultiGraph for other providers."
            )

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """Creates the graph of nodes for the batch scraping pipeline.

        The graph has three stages:
        1. GraphIteratorNode runs _FetchParseOnlyGraph per URL (concurrent)
        2. BatchGenerateAnswerNode submits all prompts via the Batch API
        3. MergeAnswersNode combines the per-document results

        Returns:
            BaseGraph: A graph instance representing the batch scraping
            workflow.
        """
        # Stage 1: Fetch and parse all URLs concurrently
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & urls",
            output=["parsed_docs"],
            node_config={
                "graph_instance": _FetchParseOnlyGraph,
                "scraper_config": self.copy_config,
            },
            schema=self.copy_schema,
        )

        # Stage 2: Submit all prompts to the OpenAI Batch API
        batch_generate_node = BatchGenerateAnswerNode(
            input="user_prompt & parsed_docs",
            output=["results"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.copy_schema,
                "batch_config": self.batch_config,
            },
        )

        # Stage 3: Merge all per-document results into one answer
        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.copy_schema,
            },
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                batch_generate_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, batch_generate_node),
                (batch_generate_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__,
        )

    def run(self) -> str:
        """Executes the full batch scraping pipeline.

        This will:
        1. Fetch and parse all URLs concurrently
        2. Submit all LLM prompts as an OpenAI Batch
        3. Poll until the batch completes (may take minutes to hours)
        4. Merge results into a final answer

        Returns:
            str: The merged answer from all scraped URLs.
        """
        inputs = {"user_prompt": self.prompt, "urls": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
        return self.final_state.get("answer", "No answer found.")
class BatchGenerateAnswerNode(BaseNode):
    """A node that generates answers using the OpenAI Batch API.

    Instead of making individual LLM calls for each document,
    this node collects all prompts and submits them as a single
    batch request for ~50% cost savings (at the expense of latency).

    Attributes:
        llm_model: The language model configuration (must be OpenAI).
        verbose (bool): Whether to show progress information.

    Args:
        input (str): Boolean expression defining the input keys needed.
        output (List[str]): List of output keys to be updated in the state.
        node_config (Optional[dict]): Configuration dictionary containing:
            - llm_model: The LLM model configuration.
            - schema: Optional Pydantic schema for structured output.
            - additional_info: Optional additional prompt context.
            - batch_config: Optional dict with batch-specific settings:
                - poll_interval: Seconds between status checks (default: 30).
                - max_wait_time: Maximum wait in seconds (default: 86400).
                - model: Override model for batch (optional).
                - temperature: Override temperature (default: 0.0).
        node_name (str): The unique identifier for this node.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "BatchGenerateAnswer",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        self.llm_model = node_config["llm_model"]
        self.verbose = node_config.get("verbose", False)
        self.additional_info = node_config.get("additional_info")
        self.is_md_scraper = node_config.get("is_md_scraper", True)
        self.schema = node_config.get("schema")

        # Batch-specific configuration (defaults documented in the class
        # docstring above).
        batch_config = node_config.get("batch_config", {})
        self.poll_interval = batch_config.get("poll_interval", 30)
        self.max_wait_time = batch_config.get("max_wait_time", 86_400)
        self.batch_model = batch_config.get("model")
        self.batch_temperature = batch_config.get("temperature", 0.0)

    def _get_model_name(self) -> str:
        """Extract the OpenAI model name from the LLM configuration.

        The explicit batch_config override takes precedence; otherwise the
        name is read off the LangChain model instance.

        Returns:
            The model name string (e.g., 'gpt-4o-mini').

        Raises:
            ValueError: If no model name can be determined.
        """
        if self.batch_model:
            return self.batch_model

        # Try to extract model name from the LangChain model instance
        # (attribute name differs between langchain wrapper versions).
        if hasattr(self.llm_model, "model_name"):
            return self.llm_model.model_name
        if hasattr(self.llm_model, "model"):
            return self.llm_model.model

        raise ValueError(
            "Could not determine model name from llm_model. "
            "Please specify 'model' in batch_config."
        )

    def _get_format_instructions(self) -> str:
        """Get format instructions based on the schema configuration.

        Returns schema-derived instructions when a Pydantic schema is set,
        otherwise a generic JSON-object instruction.
        """
        if self.schema is not None:
            output_parser = get_pydantic_output_parser(self.schema)
            return output_parser.get_format_instructions()
        return (
            "You must respond with a JSON object. Your response should be "
            "formatted as a valid JSON with a 'content' field containing "
            'your analysis. For example:\n'
            '{"content": "your analysis here"}'
        )

    def _build_prompt_text(
        self,
        user_prompt: str,
        content: str,
        format_instructions: str,
    ) -> str:
        """Build the full prompt text for a single document.

        Args:
            user_prompt: The user's question/prompt.
            content: The scraped document content.
            format_instructions: JSON output format instructions.

        Returns:
            The formatted prompt string.
        """
        template = (
            TEMPLATE_NO_CHUNKS_MD
            if self.is_md_scraper
            else TEMPLATE_NO_CHUNKS
        )

        # Any extra context is prepended to the base template.
        if self.additional_info:
            template = self.additional_info + template

        prompt = PromptTemplate(
            template=template,
            input_variables=["content", "question"],
            partial_variables={"format_instructions": format_instructions},
        )
        return prompt.format(content=content, question=user_prompt)

    def execute(self, state: dict) -> dict:
        """Execute the batch generation node.

        Takes multiple parsed documents and a user prompt, builds prompts
        for each document, and submits them as a single OpenAI Batch API
        request.

        Args:
            state (dict): Must contain:
                - user_prompt: The user's question.
                - parsed_docs: List of parsed document contents.
                - urls: List of source URLs (for result mapping).

        Returns:
            dict: Updated state with 'results' key containing
            a list of answers (one per document) and a
            'doc_id_to_url' mapping for traceability.

        Raises:
            ValueError: If no parsed documents are present in the state.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        user_prompt = state.get("user_prompt", "")
        parsed_docs = state.get("parsed_docs", [])
        urls = state.get("urls", [])

        if not parsed_docs:
            raise ValueError("No parsed documents found in state")

        model_name = self._get_model_name()
        format_instructions = self._get_format_instructions()

        # Build batch requests with doc_id → URL mapping
        batch_requests = []
        doc_id_to_url = {}

        for i, doc in enumerate(parsed_docs):
            custom_id = f"doc_{i:04d}"
            doc_id_to_url[custom_id] = urls[i] if i < len(urls) else f"doc_{i}"

            # ParseNode may emit a list of chunks. Join them so the prompt
            # contains the document text itself — str(doc) on a multi-chunk
            # list would embed the Python repr (brackets/quotes) instead.
            if isinstance(doc, list):
                content = "\n\n".join(str(chunk) for chunk in doc)
            else:
                content = str(doc)

            prompt_text = self._build_prompt_text(
                user_prompt, content, format_instructions
            )

            batch_requests.append(BatchRequest(
                custom_id=custom_id,
                model=model_name,
                messages=[{"role": "user", "content": prompt_text}],
                temperature=self.batch_temperature,
                response_format={"type": "json_object"},
            ))

        self.logger.info(
            f"Submitting {len(batch_requests)} requests to "
            f"OpenAI Batch API (model: {model_name})..."
        )

        # Submit batch. Import locally so the module loads even when the
        # optional openai package is not installed.
        from openai import OpenAI

        client = OpenAI()
        batch_id = create_batch(
            client,
            batch_requests,
            description=f"ScrapeGraphAI: {user_prompt[:100]}",
        )

        self.logger.info(f"Batch submitted: {batch_id}")
        # Expose the batch id early so callers can recover it even if
        # polling below times out.
        state["batch_id"] = batch_id

        # Poll until complete (may take minutes to hours)
        batch_info = poll_batch_until_complete(
            client,
            batch_id,
            poll_interval=self.poll_interval,
            max_wait_time=self.max_wait_time,
        )

        # Retrieve results
        results = retrieve_batch_results(client, batch_info)

        # Parse results back into answers; results are already ordered by
        # custom_id, which matches URL order.
        answers = []
        for result in results:
            if result.error:
                self.logger.warning(
                    f"Request {result.custom_id} "
                    f"(URL: {doc_id_to_url.get(result.custom_id, 'unknown')}) "
                    f"failed: {result.error}"
                )
                # Per-document failures are recorded, not raised, so one
                # bad document does not break the whole batch.
                answers.append({"error": result.error})
                continue

            try:
                parsed = json.loads(result.content)
                answers.append(parsed)
            except (json.JSONDecodeError, TypeError):
                # If not valid JSON, wrap the raw content
                answers.append({"content": result.content})

        self.logger.info(
            f"Batch complete: {len(answers)} answers retrieved "
            f"({sum(1 for a in answers if 'error' not in a)} succeeded)"
        )

        state.update({
            self.output[0]: answers,
            "doc_id_to_url": doc_id_to_url,
        })
        return state
"""
OpenAI Batch API utility functions.

Provides helpers for creating, polling, and retrieving results
from the OpenAI Batch API, enabling ~50% cost savings on LLM calls
when real-time responses are not needed.

Reference: https://platform.openai.com/docs/guides/batch
"""

import io
import json
import logging
import time
from dataclasses import dataclass
from typing import Dict, List, Optional

try:
    from openai import OpenAI
except ImportError:  # pragma: no cover
    # openai is an optional dependency: importing this module must not fail
    # when it is absent — the client is only needed at call time, and the
    # annotations below degrade gracefully.
    OpenAI = None  # type: ignore[assignment, misc]

logger = logging.getLogger(__name__)

# OpenAI Batch API limits
MAX_REQUESTS_PER_BATCH = 50_000
DEFAULT_POLL_INTERVAL = 30  # seconds
DEFAULT_MAX_WAIT_TIME = 86_400  # 24 hours


@dataclass
class BatchRequest:
    """A single request within a batch submission."""

    # Unique identifier for mapping responses back to requests.
    custom_id: str
    # The OpenAI model to use (e.g., 'gpt-4o-mini').
    model: str
    # The chat messages for this request.
    messages: List[Dict[str, str]]
    # Sampling temperature.
    temperature: float = 0.0
    # Maximum tokens in the response; omitted from the payload when None.
    max_tokens: Optional[int] = None
    # Optional response format (e.g., {"type": "json_object"}).
    response_format: Optional[Dict[str, str]] = None

    def to_jsonl_line(self) -> str:
        """Convert to a JSONL line for the Batch API input file."""
        body = {
            "model": self.model,
            "messages": self.messages,
            "temperature": self.temperature,
        }
        if self.max_tokens is not None:
            body["max_tokens"] = self.max_tokens
        if self.response_format is not None:
            body["response_format"] = self.response_format

        return json.dumps({
            "custom_id": self.custom_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": body,
        })


@dataclass
class BatchResult:
    """The result of a single request within a completed batch."""

    # The custom ID that was provided in the request.
    custom_id: str
    # The response content from the LLM (None when the request failed).
    content: Optional[str] = None
    # Error message if this individual request failed.
    error: Optional[str] = None
    # Token usage for this request, as reported by the API.
    usage: Optional[Dict[str, int]] = None


@dataclass
class BatchJobInfo:
    """Status information about a batch job."""

    # The OpenAI batch ID.
    batch_id: str
    # Current status: validating, in_progress, completed, failed, expired, etc.
    status: str
    # Total number of requests in the batch.
    total_requests: int = 0
    # Number of completed requests.
    completed_requests: int = 0
    # Number of failed requests.
    failed_requests: int = 0
    # ID of the output file when batch completes.
    output_file_id: Optional[str] = None
    # ID of the error file if there are errors.
    error_file_id: Optional[str] = None


def create_batch(
    client: OpenAI,
    requests: List[BatchRequest],
    description: str = "ScrapeGraphAI batch scraping job",
) -> str:
    """Create and submit an OpenAI Batch API job.

    Args:
        client: An initialized OpenAI client.
        requests: List of BatchRequest objects to submit.
        description: Human-readable description for the batch.

    Returns:
        The batch ID for tracking the job.

    Raises:
        ValueError: If the number of requests exceeds the API limit.
    """
    if len(requests) > MAX_REQUESTS_PER_BATCH:
        raise ValueError(
            f"Batch size {len(requests)} exceeds the maximum of "
            f"{MAX_REQUESTS_PER_BATCH}. Split into multiple batches."
        )

    # Build JSONL content: one request per line, per the Batch API spec.
    jsonl_content = "\n".join(req.to_jsonl_line() for req in requests)

    logger.info(
        f"Uploading batch input file with {len(requests)} requests..."
    )

    # Upload the input file (purpose="batch" is required by the API).
    input_file = client.files.create(
        file=io.BytesIO(jsonl_content.encode("utf-8")),
        purpose="batch",
    )

    logger.info(f"Input file uploaded: {input_file.id}")

    # Create the batch against the uploaded file.
    batch = client.batches.create(
        input_file_id=input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": description},
    )

    logger.info(
        f"Batch created: {batch.id} (status: {batch.status})"
    )

    return batch.id


def get_batch_status(client: OpenAI, batch_id: str) -> BatchJobInfo:
    """Get the current status of a batch job.

    Args:
        client: An initialized OpenAI client.
        batch_id: The batch ID returned by create_batch.

    Returns:
        BatchJobInfo with the current status and counts.
    """
    batch = client.batches.retrieve(batch_id)

    # request_counts may be None while the batch is still validating.
    return BatchJobInfo(
        batch_id=batch.id,
        status=batch.status,
        total_requests=batch.request_counts.total if batch.request_counts else 0,
        completed_requests=batch.request_counts.completed if batch.request_counts else 0,
        failed_requests=batch.request_counts.failed if batch.request_counts else 0,
        output_file_id=batch.output_file_id,
        error_file_id=batch.error_file_id,
    )


def poll_batch_until_complete(
    client: OpenAI,
    batch_id: str,
    poll_interval: int = DEFAULT_POLL_INTERVAL,
    max_wait_time: int = DEFAULT_MAX_WAIT_TIME,
) -> BatchJobInfo:
    """Poll a batch job until it completes, fails, or times out.

    Args:
        client: An initialized OpenAI client.
        batch_id: The batch ID to poll.
        poll_interval: Seconds between status checks.
        max_wait_time: Maximum seconds to wait before giving up.

    Returns:
        Final BatchJobInfo when the batch reaches a terminal state.

    Raises:
        TimeoutError: If max_wait_time is exceeded.
        RuntimeError: If the batch fails, expires, or is cancelled.
    """
    terminal_states = {"completed", "failed", "expired", "cancelled"}
    start_time = time.time()

    logger.info(
        f"Polling batch {batch_id} every {poll_interval}s "
        f"(max wait: {max_wait_time}s)..."
    )

    while True:
        elapsed = time.time() - start_time
        if elapsed > max_wait_time:
            raise TimeoutError(
                f"Batch {batch_id} did not complete within "
                f"{max_wait_time}s (last status check at {elapsed:.0f}s)"
            )

        info = get_batch_status(client, batch_id)

        logger.info(
            f"Batch {batch_id}: {info.status} "
            f"({info.completed_requests}/{info.total_requests} done, "
            f"{info.failed_requests} failed)"
        )

        if info.status in terminal_states:
            if info.status == "failed":
                raise RuntimeError(
                    f"Batch {batch_id} failed. "
                    f"Error file: {info.error_file_id}"
                )
            if info.status in {"expired", "cancelled"}:
                raise RuntimeError(
                    f"Batch {batch_id} was {info.status}."
                )
            # Only "completed" remains.
            return info

        time.sleep(poll_interval)


def retrieve_batch_results(
    client: OpenAI,
    batch_info: BatchJobInfo,
) -> List[BatchResult]:
    """Retrieve and parse results from a completed batch.

    Args:
        client: An initialized OpenAI client.
        batch_info: A BatchJobInfo from a completed batch.

    Returns:
        List of BatchResult objects, one per request,
        ordered by their custom_id.

    Raises:
        ValueError: If the batch has no output file to download.
    """
    if not batch_info.output_file_id:
        raise ValueError(
            f"Batch {batch_info.batch_id} has no output file. "
            f"Status: {batch_info.status}"
        )

    logger.info(f"Downloading results from {batch_info.output_file_id}...")

    output_content = client.files.content(batch_info.output_file_id).text
    results = []

    for line in output_content.strip().split("\n"):
        if not line:
            continue

        response_data = json.loads(line)
        custom_id = response_data["custom_id"]

        # Per-line error objects mark individual request failures.
        error = response_data.get("error")
        if error:
            results.append(BatchResult(
                custom_id=custom_id,
                error=json.dumps(error),
            ))
            continue

        body = response_data.get("response", {}).get("body", {})
        choices = body.get("choices", [])

        if choices:
            content = choices[0].get("message", {}).get("content", "")
            usage = body.get("usage")
            results.append(BatchResult(
                custom_id=custom_id,
                content=content,
                usage=usage,
            ))
        else:
            results.append(BatchResult(
                custom_id=custom_id,
                error="No choices returned in response",
            ))

    # Sort by custom_id to maintain order. The Batch API does not guarantee
    # output ordering matches input ordering.
    results.sort(key=lambda r: r.custom_id)

    logger.info(
        f"Retrieved {len(results)} results "
        f"({sum(1 for r in results if r.error is None)} succeeded, "
        f"{sum(1 for r in results if r.error is not None)} failed)"
    )

    return results
b/tests/test_batch_api.py @@ -0,0 +1,403 @@ +""" +Tests for the OpenAI Batch API integration. + +Tests cover: +- batch_api.py utility functions +- BatchGenerateAnswerNode +- SmartScraperMultiBatchGraph initialization and validation +""" + +import json + +import pytest + +from scrapegraphai.utils.batch_api import ( + BatchJobInfo, + BatchRequest, + BatchResult, + retrieve_batch_results, +) + + +# ─── BatchRequest Tests ─── + + +class TestBatchRequest: + """Tests for the BatchRequest dataclass.""" + + def test_to_jsonl_line_basic(self): + """Test basic JSONL line generation.""" + req = BatchRequest( + custom_id="doc_0000", + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Hello"}], + ) + line = req.to_jsonl_line() + data = json.loads(line) + + assert data["custom_id"] == "doc_0000" + assert data["method"] == "POST" + assert data["url"] == "/v1/chat/completions" + assert data["body"]["model"] == "gpt-4o-mini" + assert data["body"]["messages"] == [{"role": "user", "content": "Hello"}] + assert data["body"]["temperature"] == 0.0 + + def test_to_jsonl_line_with_max_tokens(self): + """Test JSONL line with max_tokens specified.""" + req = BatchRequest( + custom_id="doc_0001", + model="gpt-4o", + messages=[{"role": "user", "content": "Test"}], + max_tokens=500, + ) + data = json.loads(req.to_jsonl_line()) + assert data["body"]["max_tokens"] == 500 + + def test_to_jsonl_line_with_response_format(self): + """Test JSONL line with response_format specified.""" + req = BatchRequest( + custom_id="doc_0002", + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Extract"}], + response_format={"type": "json_object"}, + ) + data = json.loads(req.to_jsonl_line()) + assert data["body"]["response_format"] == {"type": "json_object"} + + def test_to_jsonl_line_without_optional_fields(self): + """Test that optional fields are excluded when None.""" + req = BatchRequest( + custom_id="doc_0003", + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Test"}], 
+ ) + data = json.loads(req.to_jsonl_line()) + assert "max_tokens" not in data["body"] + assert "response_format" not in data["body"] + + def test_to_jsonl_line_custom_temperature(self): + """Test custom temperature in JSONL output.""" + req = BatchRequest( + custom_id="doc_0004", + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Test"}], + temperature=0.7, + ) + data = json.loads(req.to_jsonl_line()) + assert data["body"]["temperature"] == 0.7 + + +# ─── BatchResult Tests ─── + + +class TestBatchResult: + """Tests for the BatchResult dataclass.""" + + def test_successful_result(self): + """Test creating a successful batch result.""" + result = BatchResult( + custom_id="doc_0000", + content='{"key": "value"}', + usage={"prompt_tokens": 100, "completion_tokens": 50}, + ) + assert result.custom_id == "doc_0000" + assert result.content == '{"key": "value"}' + assert result.error is None + assert result.usage["prompt_tokens"] == 100 + + def test_failed_result(self): + """Test creating a failed batch result.""" + result = BatchResult( + custom_id="doc_0001", + error="Rate limit exceeded", + ) + assert result.custom_id == "doc_0001" + assert result.content is None + assert result.error == "Rate limit exceeded" + + +# ─── BatchJobInfo Tests ─── + + +class TestBatchJobInfo: + """Tests for the BatchJobInfo dataclass.""" + + def test_completed_batch(self): + """Test a completed batch job info.""" + info = BatchJobInfo( + batch_id="batch_123", + status="completed", + total_requests=10, + completed_requests=10, + failed_requests=0, + output_file_id="file-abc", + ) + assert info.status == "completed" + assert info.total_requests == 10 + assert info.failed_requests == 0 + + def test_in_progress_batch(self): + """Test an in-progress batch job info.""" + info = BatchJobInfo( + batch_id="batch_456", + status="in_progress", + total_requests=100, + completed_requests=42, + failed_requests=1, + ) + assert info.status == "in_progress" + assert info.completed_requests == 
42 + assert info.output_file_id is None + + +# ─── retrieve_batch_results Tests ─── + + +class TestRetrieveBatchResults: + """Tests for result retrieval and parsing.""" + + def test_retrieve_no_output_file(self): + """Test that retrieval fails when no output file is available.""" + info = BatchJobInfo( + batch_id="batch_789", + status="failed", + output_file_id=None, + ) + + class DummyClient: + pass + + with pytest.raises(ValueError, match="no output file"): + retrieve_batch_results(DummyClient(), info) + + def test_results_sorted_by_custom_id(self): + """Test that results are sorted by custom_id for consistent ordering.""" + # Simulate results out of order + jsonl_output = "\n".join([ + json.dumps({ + "custom_id": "doc_0002", + "response": { + "body": { + "choices": [{"message": {"content": '{"val": "c"}'}}], + "usage": {"prompt_tokens": 10, "completion_tokens": 5}, + } + }, + }), + json.dumps({ + "custom_id": "doc_0000", + "response": { + "body": { + "choices": [{"message": {"content": '{"val": "a"}'}}], + "usage": {"prompt_tokens": 10, "completion_tokens": 5}, + } + }, + }), + json.dumps({ + "custom_id": "doc_0001", + "response": { + "body": { + "choices": [{"message": {"content": '{"val": "b"}'}}], + "usage": {"prompt_tokens": 10, "completion_tokens": 5}, + } + }, + }), + ]) + + class DummyFileContent: + text = jsonl_output + + class DummyFiles: + def content(self, file_id): + return DummyFileContent() + + class DummyClient: + files = DummyFiles() + + info = BatchJobInfo( + batch_id="batch_sorted", + status="completed", + output_file_id="file-sorted", + ) + + results = retrieve_batch_results(DummyClient(), info) + + assert len(results) == 3 + assert results[0].custom_id == "doc_0000" + assert results[1].custom_id == "doc_0001" + assert results[2].custom_id == "doc_0002" + assert results[0].content == '{"val": "a"}' + + def test_handles_partial_failures(self): + """Test that partial failures in batch results are handled correctly.""" + jsonl_output = 
"\n".join([ + json.dumps({ + "custom_id": "doc_0000", + "response": { + "body": { + "choices": [{"message": {"content": '{"result": "ok"}'}}], + } + }, + }), + json.dumps({ + "custom_id": "doc_0001", + "error": {"code": "rate_limit", "message": "Too many requests"}, + }), + ]) + + class DummyFileContent: + text = jsonl_output + + class DummyFiles: + def content(self, file_id): + return DummyFileContent() + + class DummyClient: + files = DummyFiles() + + info = BatchJobInfo( + batch_id="batch_partial", + status="completed", + output_file_id="file-partial", + ) + + results = retrieve_batch_results(DummyClient(), info) + + assert len(results) == 2 + # doc_0000 succeeded + assert results[0].content == '{"result": "ok"}' + assert results[0].error is None + # doc_0001 failed + assert results[1].error is not None + assert results[1].content is None + + +# ─── SmartScraperMultiBatchGraph Validation Tests ─── + + +class TestSmartScraperMultiBatchGraphValidation: + """Tests for SmartScraperMultiBatchGraph initialization validation.""" + + def test_rejects_non_openai_provider(self): + """Test that non-OpenAI providers are rejected.""" + from scrapegraphai.graphs.smart_scraper_multi_batch_graph import ( + SmartScraperMultiBatchGraph, + ) + + with pytest.raises(ValueError, match="only supports OpenAI"): + SmartScraperMultiBatchGraph( + prompt="Test prompt", + source=["https://example.com"], + config={"llm": {"model": "anthropic/claude-3"}}, + ) + + def test_rejects_groq_provider(self): + """Test that Groq provider is rejected.""" + from scrapegraphai.graphs.smart_scraper_multi_batch_graph import ( + SmartScraperMultiBatchGraph, + ) + + with pytest.raises(ValueError, match="only supports OpenAI"): + SmartScraperMultiBatchGraph( + prompt="Test", + source=["https://example.com"], + config={"llm": {"model": "groq/llama-3"}}, + ) + + +# ─── BatchGenerateAnswerNode Tests ─── + + +class TestBatchGenerateAnswerNode: + """Tests for the BatchGenerateAnswerNode.""" + + def 
test_empty_parsed_docs_raises(self): + """Test that empty parsed_docs raises ValueError.""" + from scrapegraphai.nodes.batch_generate_answer_node import ( + BatchGenerateAnswerNode, + ) + + class DummyLLM: + model_name = "gpt-4o-mini" + + node = BatchGenerateAnswerNode( + input="user_prompt & parsed_docs", + output=["results"], + node_config={ + "llm_model": DummyLLM(), + "batch_config": {}, + }, + ) + + class DummyLogger: + def info(self, msg): + pass + def error(self, msg): + pass + def warning(self, msg): + pass + + node.logger = DummyLogger() + node.get_input_keys = lambda state: ["user_prompt", "parsed_docs"] + + with pytest.raises(ValueError, match="No parsed documents"): + node.execute({ + "user_prompt": "Test", + "parsed_docs": [], + "urls": [], + }) + + def test_model_name_extraction(self): + """Test model name is correctly extracted from LLM instance.""" + from scrapegraphai.nodes.batch_generate_answer_node import ( + BatchGenerateAnswerNode, + ) + + class DummyLLM: + model_name = "gpt-4o-mini" + + node = BatchGenerateAnswerNode( + input="user_prompt & parsed_docs", + output=["results"], + node_config={"llm_model": DummyLLM(), "batch_config": {}}, + ) + + assert node._get_model_name() == "gpt-4o-mini" + + def test_batch_model_override(self): + """Test that batch_config model overrides the LLM model name.""" + from scrapegraphai.nodes.batch_generate_answer_node import ( + BatchGenerateAnswerNode, + ) + + class DummyLLM: + model_name = "gpt-4o-mini" + + node = BatchGenerateAnswerNode( + input="user_prompt & parsed_docs", + output=["results"], + node_config={ + "llm_model": DummyLLM(), + "batch_config": {"model": "gpt-4o"}, + }, + ) + + assert node._get_model_name() == "gpt-4o" + + def test_format_instructions_without_schema(self): + """Test default format instructions when no schema is provided.""" + from scrapegraphai.nodes.batch_generate_answer_node import ( + BatchGenerateAnswerNode, + ) + + class DummyLLM: + model_name = "gpt-4o-mini" + + node = 
BatchGenerateAnswerNode( + input="user_prompt & parsed_docs", + output=["results"], + node_config={"llm_model": DummyLLM(), "batch_config": {}}, + ) + + instructions = node._get_format_instructions() + assert "JSON" in instructions + assert "content" in instructions From 54d147309dc7a1ce1b191c3e4feb927ee3ff4392 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Tue, 24 Feb 2026 23:29:34 +0000 Subject: [PATCH 2/7] ci(release): 1.60.0-beta.2 [skip ci] ## [1.60.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.60.0-beta.1...v1.60.0-beta.2) (2026-02-24) ### Features * add OpenAI Batch API support for SmartScraperMultiGraph ([#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036)) ([9d4eba1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9d4eba1f154953e401982da7eff85686293b9a48)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd2c01cd..2d94322c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.60.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.60.0-beta.1...v1.60.0-beta.2) (2026-02-24) + + +### Features + +* add OpenAI Batch API support for SmartScraperMultiGraph ([#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036)) ([9d4eba1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9d4eba1f154953e401982da7eff85686293b9a48)) + ## [1.60.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.59.0...v1.60.0-beta.1) (2025-06-24) diff --git a/pyproject.toml b/pyproject.toml index f5ff5572..2f9dfa1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.60.0b1" +version = "1.60.0b2" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ From 536e5adcde179a12ec146bd8a10cbf654e0eeeaa Mon Sep 17 00:00:00 2001 From: khadyottakale Date: Wed, 4 Mar 2026 13:38:00 +0530 Subject: [PATCH 3/7] fix: update broken test imports to match current API - Replace removed ScrapeGraph with SmartScraperGraph in scrape_graph_test.py - Replace renamed convert_to_csv/convert_to_json with export_to_csv/export_to_json in xml_scraper_openai_test.py --- tests/graphs/scrape_graph_test.py | 19 ++++++++++--------- tests/graphs/xml_scraper_openai_test.py | 6 +++--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/graphs/scrape_graph_test.py b/tests/graphs/scrape_graph_test.py index 272fc238..89c5464b 100644 --- a/tests/graphs/scrape_graph_test.py +++ b/tests/graphs/scrape_graph_test.py @@ -1,5 +1,5 @@ """ -Module for testing the scrape graph class +Module for testing the smart scraper graph class """ import os @@ -7,7 +7,7 @@ import pytest from dotenv import load_dotenv -from scrapegraphai.graphs import ScrapeGraph +from scrapegraphai.graphs import SmartScraperGraph load_dotenv() @@ -19,7 +19,7 @@ def graph_config(): return { "llm": { "api_key": openai_key, - "model": "openai/gpt-3.5-turbo", + "model": "openai/gpt-4o", }, "verbose": True, "headless": False, @@ -28,26 +28,27 @@ def graph_config(): def test_scraping_pipeline(graph_config): """Start of the scraping pipeline""" - scrape_graph = ScrapeGraph( + smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their descriptions", source="https://perinim.github.io/projects/", config=graph_config, ) - result = scrape_graph.run() + result = smart_scraper_graph.run() assert result is not None - assert isinstance(result, list) def test_get_execution_info(graph_config): """Get the execution info""" - scrape_graph = ScrapeGraph( + smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their descriptions", source="https://perinim.github.io/projects/", config=graph_config, ) - scrape_graph.run() + 
smart_scraper_graph.run() - graph_exec_info = scrape_graph.get_execution_info() + graph_exec_info = smart_scraper_graph.get_execution_info() assert graph_exec_info is not None diff --git a/tests/graphs/xml_scraper_openai_test.py b/tests/graphs/xml_scraper_openai_test.py index cb2b4aa3..65bc240f 100644 --- a/tests/graphs/xml_scraper_openai_test.py +++ b/tests/graphs/xml_scraper_openai_test.py @@ -8,7 +8,7 @@ from dotenv import load_dotenv from scrapegraphai.graphs import XMLScraperGraph -from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info +from scrapegraphai.utils import export_to_csv, export_to_json, prettify_exec_info load_dotenv() @@ -96,8 +96,8 @@ def test_xml_scraper_save_results(graph_config: dict, xml_content: str): result = xml_scraper_graph.run() # Save to csv and json - convert_to_csv(result, "result") - convert_to_json(result, "result") + export_to_csv(result, "result") + export_to_json(result, "result") assert os.path.exists("result.csv") assert os.path.exists("result.json") From 637c696da77da1bb916a4ece03bd66fea50be47e Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 15 Mar 2026 04:58:11 +0000 Subject: [PATCH 4/7] ci(release): 1.60.0-beta.3 [skip ci] ## [1.60.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.60.0-beta.2...v1.60.0-beta.3) (2026-03-15) ### Bug Fixes * update broken test imports to match current API ([536e5ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/536e5adcde179a12ec146bd8a10cbf654e0eeeaa)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d94322c..e0094710 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.60.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.60.0-beta.2...v1.60.0-beta.3) (2026-03-15) + + +### Bug Fixes + +* update broken test imports to match current API 
([536e5ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/536e5adcde179a12ec146bd8a10cbf654e0eeeaa)) + ## [1.60.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.60.0-beta.1...v1.60.0-beta.2) (2026-02-24) diff --git a/pyproject.toml b/pyproject.toml index 2f9dfa1a..5d288759 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.60.0b2" +version = "1.60.0b3" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From 35ec272b6f267a164fac67a26787396db548e2a7 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Tue, 7 Apr 2026 06:33:47 +0000 Subject: [PATCH 5/7] ci(release): 1.76.0-beta.1 [skip ci] ## [1.76.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.75.1...v1.76.0-beta.1) (2026-04-07) ### Features * add OpenAI Batch API support for SmartScraperMultiGraph ([#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036)) ([9d4eba1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9d4eba1f154953e401982da7eff85686293b9a48)) ### Bug Fixes * update broken test imports to match current API ([536e5ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/536e5adcde179a12ec146bd8a10cbf654e0eeeaa)) ### CI * **release:** 1.60.0-beta.2 [skip ci] ([54d1473](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/54d147309dc7a1ce1b191c3e4feb927ee3ff4392)), closes [#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036) * **release:** 1.60.0-beta.3 [skip ci] ([637c696](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/637c696da77da1bb916a4ece03bd66fea50be47e)) * reduce GitHub Actions costs by ~85% on PRs ([403080a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/403080ad82c2097b111d3472cc0c6d4ee709c6fe)) --- CHANGELOG.md | 19 +++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 088a43a9..920b2ca7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,22 @@ +## [1.76.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.75.1...v1.76.0-beta.1) (2026-04-07) + + +### Features + +* add OpenAI Batch API support for SmartScraperMultiGraph ([#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036)) ([9d4eba1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9d4eba1f154953e401982da7eff85686293b9a48)) + + +### Bug Fixes + +* update broken test imports to match current API ([536e5ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/536e5adcde179a12ec146bd8a10cbf654e0eeeaa)) + + +### CI + +* **release:** 1.60.0-beta.2 [skip ci] ([54d1473](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/54d147309dc7a1ce1b191c3e4feb927ee3ff4392)), closes [#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036) +* **release:** 1.60.0-beta.3 [skip ci] ([637c696](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/637c696da77da1bb916a4ece03bd66fea50be47e)) +* reduce GitHub Actions costs by ~85% on PRs ([403080a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/403080ad82c2097b111d3472cc0c6d4ee709c6fe)) + ## [1.75.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.75.0...v1.75.1) (2026-03-24) diff --git a/pyproject.toml b/pyproject.toml index 6537bbcf..b09a7bf7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.75.1" +version = "1.76.0b1" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." 
authors = [ From a2ea9eb45f1406aac054e057c19a6bbf806fc38a Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 19 Apr 2026 08:04:28 +0000 Subject: [PATCH 6/7] ci(release): 2.1.0-beta.1 [skip ci] ## [2.1.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v2.0.0...v2.1.0-beta.1) (2026-04-19) ### Features * add OpenAI Batch API support for SmartScraperMultiGraph ([#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036)) ([9d4eba1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9d4eba1f154953e401982da7eff85686293b9a48)) ### Bug Fixes * update broken test imports to match current API ([536e5ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/536e5adcde179a12ec146bd8a10cbf654e0eeeaa)) ### CI * **release:** 1.60.0-beta.2 [skip ci] ([54d1473](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/54d147309dc7a1ce1b191c3e4feb927ee3ff4392)), closes [#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036) * **release:** 1.60.0-beta.3 [skip ci] ([637c696](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/637c696da77da1bb916a4ece03bd66fea50be47e)) * **release:** 1.76.0-beta.1 [skip ci] ([35ec272](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/35ec272b6f267a164fac67a26787396db548e2a7)), closes [#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036) [#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036) --- CHANGELOG.md | 19 +++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59c99f64..b1055216 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,22 @@ +## [2.1.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v2.0.0...v2.1.0-beta.1) (2026-04-19) + + +### Features + +* add OpenAI Batch API support for SmartScraperMultiGraph ([#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036)) 
([9d4eba1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9d4eba1f154953e401982da7eff85686293b9a48)) + + +### Bug Fixes + +* update broken test imports to match current API ([536e5ad](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/536e5adcde179a12ec146bd8a10cbf654e0eeeaa)) + + +### CI + +* **release:** 1.60.0-beta.2 [skip ci] ([54d1473](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/54d147309dc7a1ce1b191c3e4feb927ee3ff4392)), closes [#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036) +* **release:** 1.60.0-beta.3 [skip ci] ([637c696](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/637c696da77da1bb916a4ece03bd66fea50be47e)) +* **release:** 1.76.0-beta.1 [skip ci] ([35ec272](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/35ec272b6f267a164fac67a26787396db548e2a7)), closes [#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036) [#1036](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/1036) + ## [2.0.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.76.0...v2.0.0) (2026-04-19) diff --git a/pyproject.toml b/pyproject.toml index a929d3bd..c3f3e62d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "2.0.0" +version = "2.1.0b1" description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines." authors = [ From 9a329428a8420792680965e3f9f3020cef62cd58 Mon Sep 17 00:00:00 2001 From: Gabriele Maria Bellavia Date: Wed, 13 May 2026 14:17:49 +0200 Subject: [PATCH 7/7] Add Italian README translation and fix outdated links (#1070) --- README.md | 19 ++-- docs/italian.md | 241 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+), 9 deletions(-) create mode 100644 docs/italian.md diff --git a/README.md b/README.md index d16bdf3e..469db9d6 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,14 @@

-[English](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/README.md) | [中文](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/chinese.md) | [日本語](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/japanese.md) -| [한국어](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/korean.md) -| [Русский](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/russian.md) | [Türkçe](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/turkish.md) +[English](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/README.md) | [中文](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/chinese.md) | [日本語](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/japanese.md) +| [한국어](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/korean.md) +| [Русский](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/russian.md) | [Türkçe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/turkish.md) | [Deutsch](https://www.readme-i18n.com/ScrapeGraphAI/Scrapegraph-ai?lang=de) | [Español](https://www.readme-i18n.com/ScrapeGraphAI/Scrapegraph-ai?lang=es) | [français](https://www.readme-i18n.com/ScrapeGraphAI/Scrapegraph-ai?lang=fr) -| [Português](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/docs/portuguese.md) +| [Português](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/portuguese.md) +| [Italiano](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/italian.md) [![PyPI Downloads](https://static.pepy.tech/personalized-badge/scrapegraphai?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/scrapegraphai) @@ -24,7 +25,7 @@ [![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX)

-VinciGit00%2FScrapegraph-ai | Trendshift +ScrapeGraphAI%2FScrapegraph-ai | Trendshift

[ScrapeGraphAI](https://scrapegraphai.com) is a *web scraping* python library that uses LLM and direct graph logic to create scraping pipelines for websites and local documents (XML, HTML, JSON, Markdown, etc.). @@ -163,7 +164,7 @@ Check out also the Docusaurus [here](https://docs-oss.scrapegraphai.com/). Feel free to contribute and join our Discord server to discuss with us improvements and give us suggestions! -Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/CONTRIBUTING.md). +Please see the [contributing guidelines](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/CONTRIBUTING.md). [![My Skills](https://skillicons.dev/icons?i=discord)](https://discord.gg/uJN7TYcpNa) [![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/) @@ -187,7 +188,7 @@ The Official API Documentation can be found [here](https://docs.scrapegraphai.co We collect anonymous usage metrics to enhance our package's quality and user experience. The data helps us prioritize improvements and ensure compatibility. If you wish to opt-out, set the environment variable SCRAPEGRAPHAI_TELEMETRY_ENABLED=false. For more information, please refer to the documentation [here](https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/telemetry.html). 
## ❤️ Contributors -[![Contributors](https://contrib.rocks/image?repo=VinciGit00/Scrapegraph-ai)](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors) +[![Contributors](https://contrib.rocks/image?repo=ScrapeGraphAI/Scrapegraph-ai)](https://github.com/ScrapeGraphAI/Scrapegraph-ai/graphs/contributors) ## 🎓 Citations If you have used our library for research purposes please quote us with the following reference: @@ -196,7 +197,7 @@ If you have used our library for research purposes please quote us with the foll author = {Lorenzo Padoan, Marco Vinciguerra}, title = {Scrapegraph-ai}, year = {2024}, - url = {https://github.com/VinciGit00/Scrapegraph-ai}, + url = {https://github.com/ScrapeGraphAI/Scrapegraph-ai}, note = {A Python library for scraping leveraging large language models} } ``` @@ -209,7 +210,7 @@ If you have used our library for research purposes please quote us with the foll ## 📜 License -ScrapeGraphAI is licensed under the MIT License. See the [LICENSE](https://github.com/VinciGit00/Scrapegraph-ai/blob/main/LICENSE) file for more information. +ScrapeGraphAI is licensed under the MIT License. See the [LICENSE](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/LICENSE) file for more information. ## Acknowledgements diff --git a/docs/italian.md b/docs/italian.md new file mode 100644 index 00000000..eb0644c1 --- /dev/null +++ b/docs/italian.md @@ -0,0 +1,241 @@ +## 🚀 **Cerchi un modo ancora più veloce e semplice per fare scraping su larga scala (con sole 5 righe di codice)?** Scopri la nostra versione potenziata su [**ScrapeGraphAI.com**](https://scrapegraphai.com/?utm_source=github&utm_medium=readme&utm_campaign=oss_cta&utm_content=top_banner)! 🚀 + +--- + +# 🕷️ ScrapeGraphAI: You Only Scrape Once + +

+ + ScrapeGraphAI + +

+ +[English](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/README.md) | [中文](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/chinese.md) | [日本語](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/japanese.md) +| [한국어](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/korean.md) +| [Русский](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/russian.md) | [Türkçe](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/turkish.md) +| [Deutsch](https://www.readme-i18n.com/ScrapeGraphAI/Scrapegraph-ai?lang=de) +| [Español](https://www.readme-i18n.com/ScrapeGraphAI/Scrapegraph-ai?lang=es) +| [français](https://www.readme-i18n.com/ScrapeGraphAI/Scrapegraph-ai?lang=fr) +| [Português](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/portuguese.md) +| [Italiano](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/docs/italian.md) + +[![PyPI Downloads](https://static.pepy.tech/personalized-badge/scrapegraphai?period=total&units=INTERNATIONAL_SYSTEM&left_color=BLACK&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/scrapegraphai) +[![linting: pylint](https://img.shields.io/badge/linting-pylint-yellowgreen?style=for-the-badge)](https://github.com/pylint-dev/pylint) +[![Pylint](https://img.shields.io/github/actions/workflow/status/ScrapeGraphAI/Scrapegraph-ai/code-quality.yml?label=Pylint&logo=github&style=for-the-badge)](https://github.com/ScrapeGraphAI/Scrapegraph-ai/actions/workflows/code-quality.yml) +[![CodeQL](https://img.shields.io/github/actions/workflow/status/ScrapeGraphAI/Scrapegraph-ai/codeql.yml?label=CodeQL&logo=github&style=for-the-badge)](https://github.com/ScrapeGraphAI/Scrapegraph-ai/actions/workflows/codeql.yml) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=for-the-badge)](https://opensource.org/licenses/MIT) +[![](https://dcbadge.vercel.app/api/server/gkxQDAjfeX)](https://discord.gg/gkxQDAjfeX) + +[![API 
Banner](https://raw.githubusercontent.com/ScrapeGraphAI/Scrapegraph-ai/main/docs/assets/api_banner.png)](https://scrapegraphai.com/?utm_source=github&utm_medium=readme&utm_campaign=api_banner&utm_content=api_banner_image) + +

+ScrapeGraphAI%2FScrapegraph-ai | Trendshift +

+ +[ScrapeGraphAI](https://scrapegraphai.com) è una libreria Python per il *web scraping* che utilizza LLM e logica basata sui grafi per creare pipeline di scraping per siti web e documenti locali (XML, HTML, JSON, Markdown, ecc.). + +Indica semplicemente quali informazioni vuoi estrarre e la libreria lo farà per te! + +

+ ScrapeGraphAI Hero +

+ +## 🚀 Integrazioni + +ScrapeGraphAI offre integrazioni con i framework e gli strumenti più diffusi per potenziare le tue capacità di scraping. Che tu stia sviluppando in Python o Node.js, usando framework LLM o piattaforme no-code, offriamo un'ampia gamma di opzioni di integrazione. + +Puoi trovare ulteriori informazioni al seguente [link](https://scrapegraphai.com) + +**Integrazioni**: +- **API**: [Documentazione](https://docs.scrapegraphai.com/introduction) +- **SDK**: [Python](https://docs.scrapegraphai.com/sdks/python), [Node](https://docs.scrapegraphai.com/sdks/javascript) +- **Framework LLM**: [Langchain](https://docs.scrapegraphai.com/integrations/langchain), [Llama Index](https://docs.scrapegraphai.com/integrations/llamaindex), [Crew.ai](https://docs.scrapegraphai.com/integrations/crewai), [Agno](https://docs.scrapegraphai.com/integrations/agno), [CamelAI](https://github.com/camel-ai/camel) +- **Framework Low-code**: [Pipedream](https://pipedream.com/apps/scrapegraphai), [Bubble](https://bubble.io/plugin/scrapegraphai-1745408893195x213542371433906180), [Zapier](https://zapier.com/apps/scrapegraphai/integrations), [n8n](http://localhost:5001/dashboard), [Dify](https://dify.ai), [Toolhouse](https://app.toolhouse.ai/mcp-servers/scrapegraph_smartscraper) +- **Server MCP**: [Link](https://smithery.ai/server/@ScrapeGraphAI/scrapegraph-mcp) + +## 🚀 Installazione rapida + +La pagina di riferimento per scrapegraph-ai è disponibile sulla pagina ufficiale di PyPI: [pypi](https://pypi.org/project/scrapegraphai/). + +```bash +pip install scrapegraphai + +# IMPORTANTE (per il recupero del contenuto dei siti web) +playwright install +``` + +**Nota**: si consiglia di installare la libreria in un ambiente virtuale per evitare conflitti con altre librerie 🐱 + +## 💻 Utilizzo + +Esistono diverse pipeline di scraping predefinite che possono essere utilizzate per estrarre informazioni da un sito web (o da un file locale). 
+ +La più comune è `SmartScraperGraph`, che estrae informazioni da una singola pagina dato un prompt dell'utente e un URL sorgente. + +```python +from scrapegraphai.graphs import SmartScraperGraph + +# Definisci la configurazione per la pipeline di scraping +graph_config = { + "llm": { + "model": "ollama/llama3.2", + "model_tokens": 8192, + "format": "json", + }, + "verbose": True, + "headless": False, +} + +# Crea l'istanza di SmartScraperGraph +smart_scraper_graph = SmartScraperGraph( + prompt="Estrai informazioni utili dalla pagina web, inclusa una descrizione di cosa fa l'azienda, i fondatori e i link ai social media", + source="https://scrapegraphai.com/", + config=graph_config +) + +# Esegui la pipeline +result = smart_scraper_graph.run() + +import json +print(json.dumps(result, indent=4)) +``` + +> [!NOTE] +> Per OpenAI e altri modelli è sufficiente modificare la configurazione llm! +> ```python +> graph_config = { +> "llm": { +> "api_key": "LA_TUA_OPENAI_API_KEY", +> "model": "openai/gpt-4o-mini", +> }, +> "verbose": True, +> "headless": False, +> } +> ``` + +L'output sarà un dizionario simile al seguente: + +```python +{ + "description": "ScrapeGraphAI transforms websites into clean, organized data for AI agents and data analytics. 
It offers an AI-powered API for effortless and cost-effective data extraction.", + "founders": [ + { + "name": "", + "role": "Founder & Technical Lead", + "linkedin": "https://www.linkedin.com/in/perinim/" + }, + { + "name": "Marco Vinciguerra", + "role": "Founder & Software Engineer", + "linkedin": "https://www.linkedin.com/in/marco-vinciguerra-7ba365242/" + }, + { + "name": "Lorenzo Padoan", + "role": "Founder & Product Engineer", + "linkedin": "https://www.linkedin.com/in/lorenzo-padoan-4521a2154/" + } + ], + "social_media_links": { + "linkedin": "https://www.linkedin.com/company/101881123", + "twitter": "https://x.com/scrapegraphai", + "github": "https://github.com/ScrapeGraphAI/Scrapegraph-ai" + } +} +``` + +Esistono altre pipeline che possono essere utilizzate per estrarre informazioni da più pagine, generare script Python o persino generare file audio. + +| Nome Pipeline | Descrizione | +|-------------------------|------------------------------------------------------------------------------------------------------------------| +| SmartScraperGraph | Scraper di singole pagine che richiede solo un prompt utente e una sorgente. | +| SearchGraph | Scraper multi-pagina che estrae informazioni dai primi n risultati di un motore di ricerca. | +| SpeechGraph | Scraper di singole pagine che estrae informazioni da un sito web e genera un file audio. | +| ScriptCreatorGraph | Scraper di singole pagine che estrae informazioni da un sito web e genera uno script Python. | +| SmartScraperMultiGraph | Scraper multi-pagina che estrae informazioni da più pagine dato un singolo prompt e una lista di sorgenti. | +| ScriptCreatorMultiGraph | Scraper multi-pagina che genera uno script Python per estrarre informazioni da più pagine e sorgenti. | + +Per ciascuno di questi grafi esiste una versione multi, che consente di effettuare chiamate all'LLM in parallelo. 
+ +È possibile utilizzare diversi LLM tramite API, come **OpenAI**, **Groq**, **Azure**, **Gemini**, **MiniMax** e altri, oppure modelli locali tramite **Ollama**. + +Ricordati di avere [Ollama](https://ollama.com/) installato e di scaricare i modelli con il comando **ollama pull**, se desideri utilizzare modelli locali. + +## 📖 Documentazione + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sEZBonBMGP44CtO6GQTwAlL0BGJXjtfd?usp=sharing) + +La documentazione di ScrapeGraphAI è disponibile [qui](https://scrapegraph-ai.readthedocs.io/en/latest/). +Consulta anche il Docusaurus [qui](https://docs-oss.scrapegraphai.com/). + +## 🤝 Vuoi contribuire? + +Sentiti libero di contribuire e unisciti al nostro server Discord per discutere con noi su cosa migliorare e darci suggerimenti! + +Consulta le [linee guida per i contributi](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/CONTRIBUTING.md). + +[![My Skills](https://skillicons.dev/icons?i=discord)](https://discord.gg/uJN7TYcpNa) +[![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/) +[![My Skills](https://skillicons.dev/icons?i=twitter)](https://twitter.com/scrapegraphai) + +## 🔗 API e SDK di ScrapeGraph + +Se stai cercando una soluzione rapida per integrare ScrapeGraph nel tuo sistema, scopri la nostra potente API [qui!](https://dashboard.scrapegraphai.com/login) + +[![API Banner](https://raw.githubusercontent.com/ScrapeGraphAI/Scrapegraph-ai/main/docs/assets/api_banner.png)](https://dashboard.scrapegraphai.com/login) + +Offriamo gli SDK sia in Python che in Node.js, per una facile integrazione nei tuoi progetti. 
Scoprili di seguito:
+
+| SDK | Linguaggio | Link GitHub |
+|------------|------------|-----------------------------------------------------------------------------|
+| Python SDK | Python | [scrapegraph-py](https://github.com/ScrapeGraphAI/scrapegraph-sdk/tree/main/scrapegraph-py) |
+| Node.js SDK | Node.js | [scrapegraph-js](https://github.com/ScrapeGraphAI/scrapegraph-sdk/tree/main/scrapegraph-js) |
+
+La documentazione ufficiale dell'API è disponibile [qui](https://docs.scrapegraphai.com/).
+
+## 🔥 Benchmark
+
+Secondo il [benchmark di Firecrawl](https://github.com/firecrawl/scrape-evals/pull/3), ScrapeGraph è il miglior fetcher sul mercato!
+
+![Istogramma dei risultati del benchmark](assets/histogram.png)
+
+## 📈 Telemetria
+
+Raccogliamo metriche di utilizzo anonimizzate per migliorare la qualità e la user experience del nostro pacchetto. I dati ci aiutano a stabilire le priorità e a garantire la compatibilità. Se desideri disattivare la telemetria, imposta la variabile d'ambiente `SCRAPEGRAPHAI_TELEMETRY_ENABLED=false`. Per ulteriori informazioni, consulta la documentazione [qui](https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/telemetry.html).
+ +## ❤️ Collaboratori + +[![Contributors](https://contrib.rocks/image?repo=ScrapeGraphAI/Scrapegraph-ai)](https://github.com/ScrapeGraphAI/Scrapegraph-ai/graphs/contributors) + +## 🎓 Citazioni + +Se hai utilizzato la nostra libreria per scopi di ricerca, citaci con il seguente riferimento: + +```text + @misc{scrapegraph-ai, + author = {Lorenzo Padoan, Marco Vinciguerra}, + title = {Scrapegraph-ai}, + year = {2024}, + url = {https://github.com/ScrapeGraphAI/Scrapegraph-ai}, + note = {A Python library for scraping leveraging large language models} + } +``` + +## Autori + +| | Contatti | +|--------------------|----------------------| +| Marco Vinciguerra | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/marco-vinciguerra-7ba365242/) | +| Lorenzo Padoan | [![Linkedin Badge](https://img.shields.io/badge/-Linkedin-blue?style=flat&logo=Linkedin&logoColor=white)](https://www.linkedin.com/in/lorenzo-padoan-4521a2154/) | + +## 📜 Licenza + +ScrapeGraphAI è rilasciato sotto la Licenza MIT. Consulta il file [LICENSE](https://github.com/ScrapeGraphAI/Scrapegraph-ai/blob/main/LICENSE) per ulteriori informazioni. + +## Ringraziamenti + +- Ringraziamo tutti i collaboratori del progetto e la comunità open-source per il loro supporto. +- ScrapeGraphAI è destinato esclusivamente a scopi di esplorazione dei dati e ricerca. Non siamo responsabili per eventuali usi impropri della libreria. + +Fatto con il ❤️ da [ScrapeGraph AI](https://scrapegraphai.com) + +[Scarf tracking](https://static.scarf.sh/a.png?x-pxid=102d4b8c-cd6a-4b9e-9a16-d6d141b9212d)