From 518945dd75ed98d86c1c1706c4383123c5e24974 Mon Sep 17 00:00:00 2001 From: Vikrant-Khedkar Date: Mon, 16 Feb 2026 13:24:45 +0530 Subject: [PATCH 1/3] use custom api for tracing --- scrapegraphai/telemetry/telemetry.py | 153 ++++++++++++++++----------- 1 file changed, 94 insertions(+), 59 deletions(-) diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py index 5c4496e4..af2c0562 100644 --- a/scrapegraphai/telemetry/telemetry.py +++ b/scrapegraphai/telemetry/telemetry.py @@ -4,25 +4,19 @@ import json import logging import os -import platform import threading import uuid from typing import Callable, Dict from urllib import request +from pydantic import BaseModel, Field -# Load version VERSION = importlib.metadata.version("scrapegraphai") -STR_VERSION = ".".join([str(i) for i in VERSION]) - -# 🚀 Your proxy service endpoint (instead of PostHog) -PROXY_URL = "https://scrapegraph-proxy.onrender.com/capture/" - +TRACK_URL = "https://sgai-oss-tracing.onrender.com/v1/telemetry" TIMEOUT = 2 DEFAULT_CONFIG_LOCATION = os.path.expanduser("~/.scrapegraphai.conf") logger = logging.getLogger(__name__) -# Everything below remains mostly same def _load_config(config_location: str) -> configparser.ConfigParser: config = configparser.ConfigParser() try: @@ -70,16 +64,6 @@ def _check_config_and_environ_for_telemetry_flag(default_value: bool, config_obj MAX_COUNT_SESSION = 1000 -BASE_PROPERTIES = { - "os_type": os.name, - "os_version": platform.platform(), - "python_version": f"{platform.python_version()}/{platform.python_implementation()}", - "distinct_id": g_anonymous_id, - "scrapegraphai_version": VERSION, - "telemetry_version": "0.0.4-proxy", -} - - def disable_telemetry(): global g_telemetry_enabled g_telemetry_enabled = False @@ -95,44 +79,93 @@ def is_telemetry_enabled() -> bool: return False -# ⭐ UPDATED FOR PROXY — send without API key -def _send_event_json(event_json: dict): +class TelemetryEvent(BaseModel): + user_prompt: str = Field(min_length=1, max_length=4096) + json_schema: str = Field(min_length=512, max_length=16384) + website_content: str = Field(min_length=1, max_length=65536) + llm_response: str = Field(min_length=1, max_length=32768) + llm_model: str = Field(min_length=1, max_length=256) + url: str = Field(min_length=1, max_length=2048) + + +def _build_valid_telemetry_event( + prompt: str | None, + schema: dict | None, + content: str | None, + response: dict | str | None, + llm_model: str | None, + source: list[str] | None, +) -> TelemetryEvent | None: + """Build and validate a TelemetryEvent. Returns None if validation fails.""" + url: str | None = source[0] if isinstance(source, list) and source else None + + json_schema: str | None = None + if isinstance(schema, dict): + try: + json_schema = json.dumps(schema) + except Exception: + json_schema = None + elif schema is not None: + json_schema = str(schema) + + llm_response: str | None = None + if isinstance(response, dict): + try: + llm_response = json.dumps(response) + except Exception: + llm_response = None + elif response is not None: + llm_response = str(response) + + try: + return TelemetryEvent( + user_prompt=prompt, + json_schema=json_schema, + website_content=content, + llm_response=llm_response, + llm_model=llm_model or "unknown", + url=url, + ) + except Exception: + return None + + +def _send_telemetry(event: TelemetryEvent): + """Send telemetry event to the tracing endpoint.""" headers = { "Content-Type": "application/json", - "User-Agent": f"scrapegraphai/{STR_VERSION}", + "sgai-oss-version": VERSION, } try: - data = json.dumps(event_json).encode() - req = request.Request(PROXY_URL, data=data, headers=headers) + data = json.dumps(event.model_dump()).encode() + except Exception as e: + logger.debug(f"Failed to serialize telemetry event: {e}") + return + try: + req = request.Request(TRACK_URL, data=data, headers=headers) with request.urlopen(req, timeout=TIMEOUT) as f: - response_body = f.read() - if f.code != 200: - raise RuntimeError(response_body) + f.read() + if f.code == 201: + logger.debug("Telemetry data sent successfully") + else: + logger.debug(f"Telemetry endpoint returned unexpected status: {f.code}") except Exception as e: - logger.debug(f"Failed to send telemetry data to proxy: {e}") - else: - logger.debug(f"Telemetry payload forwarded to proxy: {data}") + logger.debug(f"Failed to send telemetry data: {e}") -def send_event_json(event_json: dict): - if not g_telemetry_enabled: - raise RuntimeError("Telemetry tracking is disabled!") +def _send_telemetry_threaded(event: TelemetryEvent): + """Send telemetry in a background daemon thread.""" try: - th = threading.Thread(target=_send_event_json, args=(event_json,)) + th = threading.Thread(target=_send_telemetry, args=(event,)) + th.daemon = True th.start() - except Exception as e: - logger.debug(f"Telemetry dispatch thread failed: {e}") + except RuntimeError as e: + logger.debug(f"Failed to send telemetry data in a thread: {e}") def log_event(event: str, properties: Dict[str, any]): - if is_telemetry_enabled(): - payload = { - "event": event, - "distinct_id": g_anonymous_id, - "properties": {**BASE_PROPERTIES, **properties}, - } - send_event_json(payload) + pass def log_graph_execution( @@ -150,23 +183,25 @@ def log_graph_execution( exception: str = None, total_tokens: int = None, ): - props = { - "graph_name": graph_name, - "source": source, - "prompt": prompt, - "schema": schema, - "llm_model": llm_model, - "embedder_model": embedder_model, - "source_type": source_type, - "content": content, - "response": response, - "execution_time": execution_time, - "error_node": error_node, - "exception": exception, - "total_tokens": total_tokens, - "type": "community-library", - } - log_event("graph_execution", props) + if not is_telemetry_enabled(): + return + + if error_node is not None: + return + + event = _build_valid_telemetry_event( + prompt=prompt, + schema=schema, + content=content, + response=response, + llm_model=llm_model, + source=source, + ) + if event is None: + logger.debug("Telemetry skipped: event validation failed") + return + + _send_telemetry_threaded(event) def capture_function_usage(call_fn: Callable) -> Callable: From b17b154bff044f0042d9982eb3408a98fe9aed98 Mon Sep 17 00:00:00 2001 From: Vikrant-Khedkar Date: Mon, 16 Feb 2026 15:20:04 +0530 Subject: [PATCH 2/3] fix: handle list content in telemetry event validation --- scrapegraphai/telemetry/telemetry.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py index af2c0562..5308fa48 100644 --- a/scrapegraphai/telemetry/telemetry.py +++ b/scrapegraphai/telemetry/telemetry.py @@ -99,6 +99,9 @@ def _build_valid_telemetry_event( """Build and validate a TelemetryEvent. Returns None if validation fails.""" url: str | None = source[0] if isinstance(source, list) and source else None + if isinstance(content, list): + content = "\n".join(str(c) for c in content) + json_schema: str | None = None if isinstance(schema, dict): try: From 96dc59c7971a9cdad6de4d677d016f1882cd7f60 Mon Sep 17 00:00:00 2001 From: Vikrant-Khedkar Date: Mon, 16 Feb 2026 15:41:32 +0530 Subject: [PATCH 3/3] remove client side validation to save cpu usage for user --- scrapegraphai/telemetry/telemetry.py | 73 +++++++++++----------------- 1 file changed, 29 insertions(+), 44 deletions(-) diff --git a/scrapegraphai/telemetry/telemetry.py b/scrapegraphai/telemetry/telemetry.py index 5308fa48..07fa6e08 100644 --- a/scrapegraphai/telemetry/telemetry.py +++ b/scrapegraphai/telemetry/telemetry.py @@ -8,8 +8,6 @@ import uuid from typing import Callable, Dict from urllib import request -from pydantic import BaseModel, Field - VERSION = importlib.metadata.version("scrapegraphai") TRACK_URL = "https://sgai-oss-tracing.onrender.com/v1/telemetry" TIMEOUT = 2 @@ -79,88 +77,75 @@ def is_telemetry_enabled() -> bool: return False -class TelemetryEvent(BaseModel): - user_prompt: str = Field(min_length=1, max_length=4096) - json_schema: str = Field(min_length=512, max_length=16384) - website_content: str = Field(min_length=1, max_length=65536) - llm_response: str = Field(min_length=1, max_length=32768) - llm_model: str = Field(min_length=1, max_length=256) - url: str = Field(min_length=1, max_length=2048) - - -def _build_valid_telemetry_event( +def _build_telemetry_payload( prompt: str | None, schema: dict | None, content: str | None, response: dict | str | None, llm_model: str | None, source: list[str] | None, -) -> TelemetryEvent | None: - """Build and validate a TelemetryEvent. Returns None if validation fails.""" - url: str | None = source[0] if isinstance(source, list) and source else None +) -> dict | None: + """Build telemetry payload dict. Returns None if required fields are missing.""" + url = source[0] if isinstance(source, list) and source else None if isinstance(content, list): content = "\n".join(str(c) for c in content) - json_schema: str | None = None + json_schema = None if isinstance(schema, dict): try: json_schema = json.dumps(schema) - except Exception: + except (TypeError, ValueError): json_schema = None elif schema is not None: json_schema = str(schema) - llm_response: str | None = None + llm_response = None if isinstance(response, dict): try: llm_response = json.dumps(response) - except Exception: + except (TypeError, ValueError): llm_response = None elif response is not None: llm_response = str(response) - try: - return TelemetryEvent( - user_prompt=prompt, - json_schema=json_schema, - website_content=content, - llm_response=llm_response, - llm_model=llm_model or "unknown", - url=url, - ) - except Exception: + if not all([prompt, json_schema, content, llm_response, url]): return None + return { + "user_prompt": prompt, + "json_schema": json_schema, + "website_content": content, + "llm_response": llm_response, + "llm_model": llm_model or "unknown", + "url": url, + } -def _send_telemetry(event: TelemetryEvent): - """Send telemetry event to the tracing endpoint.""" + +def _send_telemetry(payload: dict): + """Send telemetry payload to the tracing endpoint.""" headers = { "Content-Type": "application/json", "sgai-oss-version": VERSION, } try: - data = json.dumps(event.model_dump()).encode() - except Exception as e: - logger.debug(f"Failed to serialize telemetry event: {e}") + data = json.dumps(payload).encode() + except (TypeError, ValueError) as e: + logger.debug(f"Failed to serialize telemetry payload: {e}") return try: req = request.Request(TRACK_URL, data=data, headers=headers) with request.urlopen(req, timeout=TIMEOUT) as f: f.read() - if f.code == 201: - logger.debug("Telemetry data sent successfully") - else: - logger.debug(f"Telemetry endpoint returned unexpected status: {f.code}") except Exception as e: logger.debug(f"Failed to send telemetry data: {e}") -def _send_telemetry_threaded(event: TelemetryEvent): +def _send_telemetry_threaded(payload: dict): """Send telemetry in a background daemon thread.""" try: - th = threading.Thread(target=_send_telemetry, args=(event,)) + th = threading.Thread(target=_send_telemetry, args=(payload,)) th.daemon = True th.start() except RuntimeError as e: @@ -192,7 +177,7 @@ def log_graph_execution( if error_node is not None: return - event = _build_valid_telemetry_event( + payload = _build_telemetry_payload( prompt=prompt, schema=schema, content=content, @@ -200,11 +185,11 @@ def log_graph_execution( llm_model=llm_model, source=source, ) - if event is None: - logger.debug("Telemetry skipped: event validation failed") + if payload is None: + logger.debug("Telemetry skipped: missing required fields") return - _send_telemetry_threaded(event) + _send_telemetry_threaded(payload) def capture_function_usage(call_fn: Callable) -> Callable: