From fba37d91c5f109dafeccd7a91ba05513b6d54863 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 19 Dec 2025 14:11:11 +0100 Subject: [PATCH] Fix ruff formatting issues in databricks_ingestion_monitoring files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes formatting issues in the databricks_ingestion_monitoring files that were introduced in PR #126. The files were merged without being properly formatted according to ruff standards, causing CI checks to fail. Changes: - Reformatted 14 files (Python files and Jupyter notebooks) using ruff format - No functional changes, only formatting improvements 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../cdc_monitoring_pipeline_main.py | 267 ++-- .../databricks_ingestion_monitoring/common.py | 1164 ++++++++------- .../common_ldp.py | 1283 +++++++++-------- .../standard_tables.py | 655 +++++---- .../src/build_pipeline_tags_index.ipynb | 6 +- ...ate_imported_event_logs_target_table.ipynb | 12 +- .../common/src/import_event_logs.ipynb | 12 +- .../common/src/publish_dashboard.ipynb | 10 +- .../src/update_monitoring_tables_meta.ipynb | 16 +- .../third_party_sinks/azuremonitor_sink.py | 522 ++++--- .../common/third_party_sinks/datadog_sink.py | 626 +++++--- .../common/third_party_sinks/newrelic_sink.py | 451 +++--- .../splunk_observability_sink.py | 445 +++--- .../sdp_monitoring_pipeline_main.py | 10 +- 14 files changed, 3132 insertions(+), 2347 deletions(-) diff --git a/contrib/databricks_ingestion_monitoring/cdc_connector_monitoring_dab/monitoring_etl/cdc_monitoring_pipeline_main.py b/contrib/databricks_ingestion_monitoring/cdc_connector_monitoring_dab/monitoring_etl/cdc_monitoring_pipeline_main.py index 3cf0280..b7f71a0 100644 --- a/contrib/databricks_ingestion_monitoring/cdc_connector_monitoring_dab/monitoring_etl/cdc_monitoring_pipeline_main.py +++ b/contrib/databricks_ingestion_monitoring/cdc_connector_monitoring_dab/monitoring_etl/cdc_monitoring_pipeline_main.py @@ -6,16 +6,20 @@ sys.path.append("../../common/lib") -from databricks_ingestion_monitoring.common_ldp import Configuration, Constants, MonitoringEtlPipeline +from databricks_ingestion_monitoring.common_ldp import ( + Configuration, + Constants, + MonitoringEtlPipeline, +) from databricks_ingestion_monitoring.standard_tables import ( - EVENTS_TABLE_METRICS, - TABLE_STATUS, - TABLE_STATUS_PER_PIPELINE_RUN - ) + EVENTS_TABLE_METRICS, + TABLE_STATUS, + TABLE_STATUS_PER_PIPELINE_RUN, +) # Configure logging -logging.basicConfig(level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) logger = logging.getLogger(__name__) logger.info("Starting CDC Connector Monitoring ETL Pipeline") @@ -24,86 +28,98 @@ conf = Configuration(spark.conf) + class CdcConstants: - CDC_FLOW_TYPE = 'cdc' - SNAPSHOT_FLOW_TYPE = 'snapshot' - CDC_STAGING_TABLE_FLOW_TYPE = 'cdc_staging' - TABLE_STATUS_PER_PIPELINE_RUN = 'table_status_per_pipeline_run' - CDC_STAGING_TABLE = 'cdc_staging_table' + CDC_FLOW_TYPE = "cdc" + SNAPSHOT_FLOW_TYPE = "snapshot" + CDC_STAGING_TABLE_FLOW_TYPE = "cdc_staging" + TABLE_STATUS_PER_PIPELINE_RUN = "table_status_per_pipeline_run" + CDC_STAGING_TABLE = "cdc_staging_table" class CdcConnectorMonitoringEtlPipeline(MonitoringEtlPipeline): - def __init__(self, conf: Configuration, spark: SparkSession): - super().__init__(conf, spark) - - def 
_get_event_logs_bronze_sql(self, event_log_source: str): - """ - Override base definition for append flows from the event log sources into `event_logs_bronze` table. It adds - CDC Connector-specific fields - """ - sql = super()._get_event_logs_bronze_sql(event_log_source) - sql = sql.replace(Constants.sql_fields_def_extension_point, - f""", (CASE WHEN endswith(flow_name, "_snapshot_flow") THEN 'snapshot' + def __init__(self, conf: Configuration, spark: SparkSession): + super().__init__(conf, spark) + + def _get_event_logs_bronze_sql(self, event_log_source: str): + """ + Override base definition for append flows from the event log sources into `event_logs_bronze` table. It adds + CDC Connector-specific fields + """ + sql = super()._get_event_logs_bronze_sql(event_log_source) + sql = sql.replace( + Constants.sql_fields_def_extension_point, + f""", (CASE WHEN endswith(flow_name, "_snapshot_flow") THEN 'snapshot' WHEN details:operation_progress.cdc_snapshot.table_name::string is not null THEN '{CdcConstants.SNAPSHOT_FLOW_TYPE}' WHEN endswith(flow_name, "_cdc_flow") THEN '{CdcConstants.CDC_FLOW_TYPE}' WHEN endswith(flow_name, ".{CdcConstants.CDC_STAGING_TABLE}") THEN '{CdcConstants.CDC_STAGING_TABLE_FLOW_TYPE}' END) flow_type{Constants.sql_fields_def_extension_point} - """) - return sql - - - def _get_events_errors_sql(self): - sql = super()._get_events_errors_sql() - sql = sql.replace(Constants.sql_fields_def_extension_point, - f", flow_type{Constants.sql_fields_def_extension_point}") - return sql - - - def _get_events_warnings_sql(self): - sql = super()._get_events_warnings_sql() - sql = sql.replace(Constants.sql_fields_def_extension_point, - f", flow_type{Constants.sql_fields_def_extension_point}") - return sql - - - def _get_events_table_metrics_sql(self): - sql = super()._get_events_table_metrics_sql() - return sql.replace(Constants.sql_fields_def_extension_point, f", flow_type{Constants.sql_fields_def_extension_point}") - - - def register_base_tables_and_views(self, spark: SparkSession): - super().register_base_tables_and_views(spark) - - def _get_table_run_processing_state_sql(self): - sql = super()._get_table_run_processing_state_sql() - sql = sql.replace(Constants.where_clause_extension_point, f"AND (table_name not LIKE '%.{CdcConstants.CDC_STAGING_TABLE}') {Constants.where_clause_extension_point}") - sql = sql.replace(Constants.sql_fields_def_extension_point, f", flow_type{Constants.sql_fields_def_extension_point}") - return sql - - - def register_table_status(self, spark: SparkSession): - table_status_per_pipeline_run_cdf = f"{TABLE_STATUS_PER_PIPELINE_RUN.name}_cdf" - @dlt.view(name=table_status_per_pipeline_run_cdf) - def table_run_processing_state_cdf(): - return ( - spark.readStream - .option("readChangeFeed", "true") - .table(TABLE_STATUS_PER_PIPELINE_RUN.name) - .filter("_change_type IN ('insert', 'update_postimage')") - ) - - silver_table_name = f"{TABLE_STATUS.name}_silver" - dlt.create_streaming_table(name=silver_table_name, - comment="Capture information about the latest state, ingested data and errors for target tables", - cluster_by=['pipeline_id', 'table_name'], - table_properties={ - "delta.enableRowTracking": "true" - }) - - silver_latest_source_view_name = f"{silver_table_name}_latest_source" - @dlt.view(name=silver_latest_source_view_name) - def table_latest_run_processing_state_source(): - return spark.sql(f""" + """, + ) + return sql + + def _get_events_errors_sql(self): + sql = super()._get_events_errors_sql() + sql = sql.replace( + 
Constants.sql_fields_def_extension_point, + f", flow_type{Constants.sql_fields_def_extension_point}", + ) + return sql + + def _get_events_warnings_sql(self): + sql = super()._get_events_warnings_sql() + sql = sql.replace( + Constants.sql_fields_def_extension_point, + f", flow_type{Constants.sql_fields_def_extension_point}", + ) + return sql + + def _get_events_table_metrics_sql(self): + sql = super()._get_events_table_metrics_sql() + return sql.replace( + Constants.sql_fields_def_extension_point, + f", flow_type{Constants.sql_fields_def_extension_point}", + ) + + def register_base_tables_and_views(self, spark: SparkSession): + super().register_base_tables_and_views(spark) + + def _get_table_run_processing_state_sql(self): + sql = super()._get_table_run_processing_state_sql() + sql = sql.replace( + Constants.where_clause_extension_point, + f"AND (table_name not LIKE '%.{CdcConstants.CDC_STAGING_TABLE}') {Constants.where_clause_extension_point}", + ) + sql = sql.replace( + Constants.sql_fields_def_extension_point, + f", flow_type{Constants.sql_fields_def_extension_point}", + ) + return sql + + def register_table_status(self, spark: SparkSession): + table_status_per_pipeline_run_cdf = f"{TABLE_STATUS_PER_PIPELINE_RUN.name}_cdf" + + @dlt.view(name=table_status_per_pipeline_run_cdf) + def table_run_processing_state_cdf(): + return ( + spark.readStream.option("readChangeFeed", "true") + .table(TABLE_STATUS_PER_PIPELINE_RUN.name) + .filter("_change_type IN ('insert', 'update_postimage')") + ) + + silver_table_name = f"{TABLE_STATUS.name}_silver" + dlt.create_streaming_table( + name=silver_table_name, + comment="Capture information about the latest state, ingested data and errors for target tables", + cluster_by=["pipeline_id", "table_name"], + table_properties={"delta.enableRowTracking": "true"}, + ) + + silver_latest_source_view_name = f"{silver_table_name}_latest_source" + + @dlt.view(name=silver_latest_source_view_name) + def table_latest_run_processing_state_source(): + return spark.sql(f""" SELECT pipeline_id, table_name, pipeline_run_id AS latest_pipeline_run_id, @@ -129,18 +145,22 @@ def table_latest_run_processing_state_source(): WHERE table_name NOT LIKE '%.{CdcConstants.CDC_STAGING_TABLE}' """) - dlt.create_auto_cdc_flow( - name=f"{silver_table_name}_apply_latest", - source=silver_latest_source_view_name, - target=silver_table_name, - keys=['pipeline_id', 'table_name'], - sequence_by='updated_at', - ignore_null_updates=True) - - silver_latest_cdc_changes_source_view_name = f"{silver_table_name}_latest_cdc_changes_source" - @dlt.view(name=silver_latest_cdc_changes_source_view_name) - def table_latest_run_processing_state_source(): - return spark.sql(f""" + dlt.create_auto_cdc_flow( + name=f"{silver_table_name}_apply_latest", + source=silver_latest_source_view_name, + target=silver_table_name, + keys=["pipeline_id", "table_name"], + sequence_by="updated_at", + ignore_null_updates=True, + ) + + silver_latest_cdc_changes_source_view_name = ( + f"{silver_table_name}_latest_cdc_changes_source" + ) + + @dlt.view(name=silver_latest_cdc_changes_source_view_name) + def table_latest_run_processing_state_source(): + return spark.sql(f""" SELECT pipeline_id, table_name, null AS latest_pipeline_run_id, @@ -168,18 +188,22 @@ def table_latest_run_processing_state_source(): AND flow_type='cdc' """) - dlt.create_auto_cdc_flow( - name=f"{silver_table_name}_apply_latest_cdc_changes", - source=silver_latest_cdc_changes_source_view_name, - target=silver_table_name, - keys=['pipeline_id', 'table_name'], - 
sequence_by='updated_at', - ignore_null_updates=True) - - silver_latest_snapshot_changes_source_view_name = f"{silver_table_name}_latest_snapshot_changes_source" - @dlt.view(name=silver_latest_snapshot_changes_source_view_name) - def table_latest_run_processing_state_source(): - return spark.sql(f""" + dlt.create_auto_cdc_flow( + name=f"{silver_table_name}_apply_latest_cdc_changes", + source=silver_latest_cdc_changes_source_view_name, + target=silver_table_name, + keys=["pipeline_id", "table_name"], + sequence_by="updated_at", + ignore_null_updates=True, + ) + + silver_latest_snapshot_changes_source_view_name = ( + f"{silver_table_name}_latest_snapshot_changes_source" + ) + + @dlt.view(name=silver_latest_snapshot_changes_source_view_name) + def table_latest_run_processing_state_source(): + return spark.sql(f""" SELECT pipeline_id, table_name, null AS latest_pipeline_run_id, @@ -207,22 +231,23 @@ def table_latest_run_processing_state_source(): AND flow_type='snapshot' """) - dlt.create_auto_cdc_flow( - name=f"{silver_table_name}_apply_latest_snapshot_changes", - source=silver_latest_snapshot_changes_source_view_name, - target=silver_table_name, - keys=['pipeline_id', 'table_name'], - sequence_by='updated_at', - ignore_null_updates=True) - - @dlt.table(name=TABLE_STATUS.name, - comment=TABLE_STATUS.table_comment, - cluster_by=['pipeline_id', 'table_name'], - table_properties={ - "delta.enableRowTracking": "true" - }) - def table_status(): - return spark.sql(f""" + dlt.create_auto_cdc_flow( + name=f"{silver_table_name}_apply_latest_snapshot_changes", + source=silver_latest_snapshot_changes_source_view_name, + target=silver_table_name, + keys=["pipeline_id", "table_name"], + sequence_by="updated_at", + ignore_null_updates=True, + ) + + @dlt.table( + name=TABLE_STATUS.name, + comment=TABLE_STATUS.table_comment, + cluster_by=["pipeline_id", "table_name"], + table_properties={"delta.enableRowTracking": "true"}, + ) + def table_status(): + return spark.sql(f""" SELECT s.*, latest_pipeline_run_num_written_cdc_changes, latest_pipeline_run_num_written_snapshot_changes @@ -240,7 +265,7 @@ def table_status(): AND s.latest_pipeline_run_id = etm.pipeline_run_id AND s.table_name = etm.table_name """) - + pipeline = CdcConnectorMonitoringEtlPipeline(conf, spark) -pipeline.register_base_tables_and_views(spark) \ No newline at end of file +pipeline.register_base_tables_and_views(spark) diff --git a/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/common.py b/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/common.py index bd641f6..aaa30a4 100644 --- a/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/common.py +++ b/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/common.py @@ -14,71 +14,75 @@ from databricks.sdk.service.sql import State -def parse_comma_separated_list(s: Optional[str]) ->List[str]: - """ - Parses a notebook parameter that contains a comma-separated list of items. It strips whitespace and - skips empty items. - :return: The parsed list of items - """ - if s is None: - return [] - return [j for j in [i.strip() for i in s.strip().split(',')] if len(j) > 0] +def parse_comma_separated_list(s: Optional[str]) -> List[str]: + """ + Parses a notebook parameter that contains a comma-separated list of items. It strips whitespace and + skips empty items. 
+ :return: The parsed list of items + """ + if s is None: + return [] + return [j for j in [i.strip() for i in s.strip().split(",")] if len(j) > 0] def is_parameter_defined(s: Optional[str]) -> bool: - return s is not None and len(s.strip()) > 0 + return s is not None and len(s.strip()) > 0 + def parse_tag_value_pairs(tags_str: Optional[str]) -> List[List[tuple]]: - """ - Parses a tag filter expression with OR of ANDs semantics. - - Format: Semi-colon-separated groups where each group is comma-separated tag[:value] pairs. - - Semicolons separate groups (OR logic between groups) - - Commas separate tags within a group (AND logic within group) - - 'tag' is shorthand for 'tag:' (tag with empty value) - - :param tags_str: String like "tier:T0;team:data,tier:T1" meaning (tier:T0) OR (team:data AND tier:T1) - :return: List of groups, where each group is a list of tuples like [[("tier", "T0")], [("team", "data"), ("tier", "T1")]] - - Examples: - - "env:prod" -> [[("env", "prod")]] - - "env:prod,tier:T0" -> [[("env", "prod"), ("tier", "T0")]] - - "env:prod;env:staging" -> [[("env", "prod")], [("env", "staging")]] - - "tier:T0;team:data,tier:T1" -> [[("tier", "T0")], [("team", "data"), ("tier", "T1")]] - - "monitoring" -> [[("monitoring", "")]] - """ - if not is_parameter_defined(tags_str): - return [] - - result = [] - # Split by semicolon to get groups (OR logic) - groups = [g.strip() for g in tags_str.strip().split(';') if g.strip()] - - for group in groups: - # Split by comma to get individual tags in this group (AND logic) - tag_pairs = [] - items = [item.strip() for item in group.split(',') if item.strip()] - - for item in items: - if ':' in item: - # tag:value format - parts = item.split(':', 1) - tag_pairs.append((parts[0].strip(), parts[1].strip())) - else: - # tag format (shorthand for tag:) - tag_pairs.append((item.strip(), "")) - - if tag_pairs: - result.append(tag_pairs) - - return result + """ + Parses a tag filter expression with OR of ANDs semantics. + + Format: Semi-colon-separated groups where each group is comma-separated tag[:value] pairs. 
+ - Semicolons separate groups (OR logic between groups) + - Commas separate tags within a group (AND logic within group) + - 'tag' is shorthand for 'tag:' (tag with empty value) + + :param tags_str: String like "tier:T0;team:data,tier:T1" meaning (tier:T0) OR (team:data AND tier:T1) + :return: List of groups, where each group is a list of tuples like [[("tier", "T0")], [("team", "data"), ("tier", "T1")]] + + Examples: + - "env:prod" -> [[("env", "prod")]] + - "env:prod,tier:T0" -> [[("env", "prod"), ("tier", "T0")]] + - "env:prod;env:staging" -> [[("env", "prod")], [("env", "staging")]] + - "tier:T0;team:data,tier:T1" -> [[("tier", "T0")], [("team", "data"), ("tier", "T1")]] + - "monitoring" -> [[("monitoring", "")]] + """ + if not is_parameter_defined(tags_str): + return [] + + result = [] + # Split by semicolon to get groups (OR logic) + groups = [g.strip() for g in tags_str.strip().split(";") if g.strip()] + + for group in groups: + # Split by comma to get individual tags in this group (AND logic) + tag_pairs = [] + items = [item.strip() for item in group.split(",") if item.strip()] + + for item in items: + if ":" in item: + # tag:value format + parts = item.split(":", 1) + tag_pairs.append((parts[0].strip(), parts[1].strip())) + else: + # tag format (shorthand for tag:) + tag_pairs.append((item.strip(), "")) + + if tag_pairs: + result.append(tag_pairs) + + return result + def get_pipeline_tags(wc: WorkspaceClient, pipeline_id: str) -> Optional[dict]: # For now we use the REST API directly as older Python SDK versions may not support tags - pipeline_spec = wc.api_client.do(method='get', path=f'/api/2.0/pipelines/{pipeline_id}')['spec'] + pipeline_spec = wc.api_client.do( + method="get", path=f"/api/2.0/pipelines/{pipeline_id}" + )["spec"] # Check if pipeline has tags - return pipeline_spec.get('tags') + return pipeline_spec.get("tags") def get_pipeline_ids_by_tags( @@ -89,169 +93,189 @@ def get_pipeline_ids_by_tags( index_enabled: bool = True, index_max_age_hours: int = 24, api_fallback_enabled: bool = True, - log: Optional[logging.Logger] = None + log: Optional[logging.Logger] = None, ) -> List[str]: - """ - Fetches pipeline IDs using OR of ANDs logic for tag matching. - This is a common helper function used by both EventLogImporter and MonitoringEtlPipeline. - - Logic: A pipeline matches if it satisfies ALL tags in ANY group. - - Within a group: ALL tags must match (AND logic) - - Between groups: ANY group can match (OR logic) - - Tries to use the pipeline tags index table first (if enabled and fresh), falls back to API-based discovery if needed. 
- - :param wc: WorkspaceClient instance - :param tag_groups: List of tag groups, e.g., [[("env", "prod")], [("team", "data"), ("tier", "T1")]] - means (env:prod) OR (team:data AND tier:T1) - :param spark: Optional SparkSession for index table queries - :param index_table_fqn: Fully qualified name of the pipeline tags index table - :param index_enabled: Whether to use the index table - :param index_max_age_hours: Maximum age of index (in hours) before falling back to API - :param api_fallback_enabled: Whether to fall back to API if index is unavailable/stale - :param log: Optional logger for logging - :return: List of pipeline IDs matching the tag filter expression - """ - if not tag_groups: - return [] - - if log is None: - log = logging.getLogger("get_pipeline_ids_by_tags") - - # Try index-based lookup first - if index_enabled and spark is not None and index_table_fqn: - try: - # Check index freshness - try: - freshness_check = spark.sql(f""" + """ + Fetches pipeline IDs using OR of ANDs logic for tag matching. + This is a common helper function used by both EventLogImporter and MonitoringEtlPipeline. + + Logic: A pipeline matches if it satisfies ALL tags in ANY group. + - Within a group: ALL tags must match (AND logic) + - Between groups: ANY group can match (OR logic) + + Tries to use the pipeline tags index table first (if enabled and fresh), falls back to API-based discovery if needed. + + :param wc: WorkspaceClient instance + :param tag_groups: List of tag groups, e.g., [[("env", "prod")], [("team", "data"), ("tier", "T1")]] + means (env:prod) OR (team:data AND tier:T1) + :param spark: Optional SparkSession for index table queries + :param index_table_fqn: Fully qualified name of the pipeline tags index table + :param index_enabled: Whether to use the index table + :param index_max_age_hours: Maximum age of index (in hours) before falling back to API + :param api_fallback_enabled: Whether to fall back to API if index is unavailable/stale + :param log: Optional logger for logging + :return: List of pipeline IDs matching the tag filter expression + """ + if not tag_groups: + return [] + + if log is None: + log = logging.getLogger("get_pipeline_ids_by_tags") + + # Try index-based lookup first + if index_enabled and spark is not None and index_table_fqn: + try: + # Check index freshness + try: + freshness_check = spark.sql(f""" SELECT MAX(last_updated) as last_updated, timestampdiff(HOUR, MAX(last_updated), CURRENT_TIMESTAMP()) as age_hours FROM {index_table_fqn} """).collect()[0] - age_hours = freshness_check['age_hours'] - except Exception as e: - if log: - log.warning(f"Failed to check pipeline tags index: {e}") - age_hours = None - - if age_hours is None or age_hours <= index_max_age_hours: - # Index is fresh, use it - log.info(f"Using pipeline tags index table (age: {age_hours:.1f} hours)") - - # Step 1: Collect all unique (tag_key, tag_value) pairs across all groups - all_tag_pairs = set() - for group in tag_groups: - all_tag_pairs.update(group) - - # Step 2: Query database once for all tag pairs - where_conditions = " OR ".join([ - f"(tag_key = '{tag_key}' AND tag_value = '{tag_value}')" - for tag_key, tag_value in all_tag_pairs - ]) - - log.info(f"Querying pipeline tags index table {index_table_fqn} with where conditions: {where_conditions}") - - query = f""" + age_hours = freshness_check["age_hours"] + except Exception as e: + if log: + log.warning(f"Failed to check pipeline tags index: {e}") + age_hours = None + + if age_hours is None or age_hours <= index_max_age_hours: + # 
Index is fresh, use it + log.info( + f"Using pipeline tags index table (age: {age_hours:.1f} hours)" + ) + + # Step 1: Collect all unique (tag_key, tag_value) pairs across all groups + all_tag_pairs = set() + for group in tag_groups: + all_tag_pairs.update(group) + + # Step 2: Query database once for all tag pairs + where_conditions = " OR ".join( + [ + f"(tag_key = '{tag_key}' AND tag_value = '{tag_value}')" + for tag_key, tag_value in all_tag_pairs + ] + ) + + log.info( + f"Querying pipeline tags index table {index_table_fqn} with where conditions: {where_conditions}" + ) + + query = f""" SELECT DISTINCT tag_key, tag_value, explode(pipeline_ids) as pipeline_id FROM {index_table_fqn} WHERE {where_conditions} """ - result = spark.sql(query).collect() - - # Step 3: Build map from (tag_key, tag_value) -> set of pipeline_ids - tag_to_pipelines = {} - for row in result: - tag_pair = (row['tag_key'], row['tag_value']) - if tag_pair not in tag_to_pipelines: - tag_to_pipelines[tag_pair] = set() - tag_to_pipelines[tag_pair].add(row['pipeline_id']) - - # Step 4: For each group, intersect pipeline_ids (AND logic) - matching_pipeline_ids = set() - for group in tag_groups: - if not group: + result = spark.sql(query).collect() + + # Step 3: Build map from (tag_key, tag_value) -> set of pipeline_ids + tag_to_pipelines = {} + for row in result: + tag_pair = (row["tag_key"], row["tag_value"]) + if tag_pair not in tag_to_pipelines: + tag_to_pipelines[tag_pair] = set() + tag_to_pipelines[tag_pair].add(row["pipeline_id"]) + + # Step 4: For each group, intersect pipeline_ids (AND logic) + matching_pipeline_ids = set() + for group in tag_groups: + if not group: + continue + + # Get pipeline_ids for each tag in the group + group_pipeline_sets = [] + for tag_pair in group: + if tag_pair in tag_to_pipelines: + group_pipeline_sets.append(tag_to_pipelines[tag_pair]) + else: + # Tag doesn't exist in index, so no pipelines match this group + group_pipeline_sets = [] + break + + # Intersect all sets in this group (AND logic) + if group_pipeline_sets: + group_result = set.intersection(*group_pipeline_sets) + if group_result: + log.info( + f"Found {len(group_result)} pipeline(s) matching group {group}" + ) + # Step 5: Union with results from other groups (OR logic) + matching_pipeline_ids.update(group_result) + + return list(matching_pipeline_ids) + else: + # Index is stale + log.warning( + f"Pipeline tags index is stale (age: {age_hours:.1f} hours > max: {index_max_age_hours} hours)" + ) + if not api_fallback_enabled: + raise ValueError(f"Index is stale and API fallback is disabled") + except Exception as e: + log.warning(f"Failed to use pipeline tags index: {e}") + if not api_fallback_enabled: + raise + + # Fall back to API-based discovery + log.warning("Falling back to API-based pipeline discovery (this may be slow)") + + matching_pipeline_ids = set() + + # List all pipelines in the workspace (this returns basic info only) + all_pipeline_ids = [ + (pi.pipeline_id, pi.name) for pi in wc.pipelines.list_pipelines() + ] + + for pipeline_id, pipeline_name in all_pipeline_ids: + try: + # Fetch the full pipeline spec to get tags + pipeline_tags = get_pipeline_tags(wc, pipeline_id) + + if not pipeline_tags: + continue + + # Check if this pipeline matches any group (OR of ANDs) + for group in tag_groups: + # Check if pipeline has ALL tags in this group + group_matches = True + for tag_key, tag_value in group: + if ( + tag_key not in pipeline_tags + or pipeline_tags[tag_key] != tag_value + ): + group_matches = False + break + 
+ if group_matches: + matching_pipeline_ids.add(pipeline_id) + log.info( + f"Pipeline {pipeline_name} ({pipeline_id}) matches group {group}" + ) + break # Pipeline matches at least one group, no need to check other groups + except Exception as e: + log.warning(f"Failed to fetch pipeline {pipeline_id}: {e}") continue - # Get pipeline_ids for each tag in the group - group_pipeline_sets = [] - for tag_pair in group: - if tag_pair in tag_to_pipelines: - group_pipeline_sets.append(tag_to_pipelines[tag_pair]) - else: - # Tag doesn't exist in index, so no pipelines match this group - group_pipeline_sets = [] - break - - # Intersect all sets in this group (AND logic) - if group_pipeline_sets: - group_result = set.intersection(*group_pipeline_sets) - if group_result: - log.info(f"Found {len(group_result)} pipeline(s) matching group {group}") - # Step 5: Union with results from other groups (OR logic) - matching_pipeline_ids.update(group_result) - - return list(matching_pipeline_ids) - else: - # Index is stale - log.warning(f"Pipeline tags index is stale (age: {age_hours:.1f} hours > max: {index_max_age_hours} hours)") - if not api_fallback_enabled: - raise ValueError(f"Index is stale and API fallback is disabled") - except Exception as e: - log.warning(f"Failed to use pipeline tags index: {e}") - if not api_fallback_enabled: - raise - - # Fall back to API-based discovery - log.warning("Falling back to API-based pipeline discovery (this may be slow)") - - matching_pipeline_ids = set() - - # List all pipelines in the workspace (this returns basic info only) - all_pipeline_ids = [(pi.pipeline_id, pi.name) for pi in wc.pipelines.list_pipelines()] - - for pipeline_id, pipeline_name in all_pipeline_ids: - try: - # Fetch the full pipeline spec to get tags - pipeline_tags = get_pipeline_tags(wc, pipeline_id) - - if not pipeline_tags: - continue - - # Check if this pipeline matches any group (OR of ANDs) - for group in tag_groups: - # Check if pipeline has ALL tags in this group - group_matches = True - for tag_key, tag_value in group: - if tag_key not in pipeline_tags or pipeline_tags[tag_key] != tag_value: - group_matches = False - break - - if group_matches: - matching_pipeline_ids.add(pipeline_id) - log.info(f"Pipeline {pipeline_name} ({pipeline_id}) matches group {group}") - break # Pipeline matches at least one group, no need to check other groups - except Exception as e: - log.warning(f"Failed to fetch pipeline {pipeline_id}: {e}") - continue - - return list(matching_pipeline_ids) + return list(matching_pipeline_ids) def get_optional_parameter(value: Optional[str]) -> str: - return value.strip() if is_parameter_defined(value) else None + return value.strip() if is_parameter_defined(value) else None + def get_required_parameter(name: str, value: Optional[str]) -> str: - if is_parameter_defined(value): - return value.strip() - - raise ValueError(f"Missing required parameter: {name}") + if is_parameter_defined(value): + return value.strip() + + raise ValueError(f"Missing required parameter: {name}") + def get_required_widget_parameter(widgets, param_name: str): - return get_required_parameter(param_name, widgets.get(param_name)) + return get_required_parameter(param_name, widgets.get(param_name)) + -SDP_EVENT_LOG_SCHEMA=""" +SDP_EVENT_LOG_SCHEMA = """ id STRING, sequence STRUCT, control_plane_seq_no: BIGINT>, @@ -297,378 +321,448 @@ def get_required_widget_parameter(widgets, param_name: str): class EventLogImporter: - """ - A helper class to incrementally import SDP event logs from pipelines that are 
not configured to store the event log - directly in a Delta table (see the [`event_log` option](https://docs.databricks.com/api/workspace/pipelines/create#event_log) in the - Pipelines API). This can happen for example, if the pipeline was created prior to the introduction of ability to [Publish to Multiple Catalogs and Schemas from a Single DLT/SDP Pipeline](https://www.databricks.com/blog/publish-multiple-catalogs-and-schemas-single-dlt-pipeline). - - The import is done into a Delta table that can be used to store the logs from multiple pipelines. - - Note that is an expensive operation (it uses `MERGE` statements to achieve incrementalization) and should be used only if - direct write of the event log to a Delta table is not possible. - """ - - def __init__(self, monitoring_catalog: str, monitoring_schema: str, imported_event_logs_table: str, - index_table_name: str = "pipeline_tags_index", - index_enabled: bool = True, - index_max_age_hours: int = 24, - api_fallback_enabled: bool = True, - wc: Optional[WorkspaceClient] = None): - """ - Constructor. - :param monitoring_catalog: The catalog for the table with the imported event logs - :param monitoring_schema: The schema for the table with the imported event logs - :param imported_event_logs_table: The name of the table where the imported event logs are to be stored - :param index_table_name: The name of the pipeline tags index table - :param index_enabled: Whether to use the pipeline tags index - :param index_max_age_hours: Maximum age of the index (in hours) before falling back to API - :param api_fallback_enabled: Whether to fall back to API if index is unavailable/stale - :param wc: The WorkspaceClient to use; if none is specified, a new one will be instantiated - """ - if monitoring_catalog is None or len(monitoring_catalog.strip()) == 0: - raise ValueError("Monitoring catalog cannot be empty") - if monitoring_schema is None or len(monitoring_schema.strip()) == 0: - raise ValueError("Monitoring schema cannot be empty") - if imported_event_logs_table is None or len(imported_event_logs_table) == 0: - raise ValueError("Imported event logs table cannot be empty") - - self.monitoring_catalog = monitoring_catalog.strip() - self.monitoring_schema = monitoring_schema.strip() - self.imported_event_logs_table = imported_event_logs_table.strip() - self.imported_event_logs_table_fqname = f"`{self.monitoring_catalog}`.`{self.monitoring_schema}`.`{self.imported_event_logs_table}`" - self.index_table_fqn = f"`{self.monitoring_catalog}`.`{self.monitoring_schema}`.`{index_table_name}`" - self.index_enabled = index_enabled - self.index_max_age_hours = index_max_age_hours - self.api_fallback_enabled = api_fallback_enabled - self.wc = wc if wc else WorkspaceClient() - self.log = logging.getLogger("EventLogImporter") - - - def create_target_table(self, spark: SparkSession): """ - Creates the target table where the event logs will be imported if it does not exists. - """ - spark.sql(f"CREATE TABLE IF NOT EXISTS {self.imported_event_logs_table_fqname} ({SDP_EVENT_LOG_SCHEMA}) CLUSTER BY AUTO") + A helper class to incrementally import SDP event logs from pipelines that are not configured to store the event log + directly in a Delta table (see the [`event_log` option](https://docs.databricks.com/api/workspace/pipelines/create#event_log) in the + Pipelines API). 
This can happen for example, if the pipeline was created prior to the introduction of ability to [Publish to Multiple Catalogs and Schemas from a Single DLT/SDP Pipeline](https://www.databricks.com/blog/publish-multiple-catalogs-and-schemas-single-dlt-pipeline). + The import is done into a Delta table that can be used to store the logs from multiple pipelines. - def import_event_log_for_one_pipeline(self, pipeline_id: str, spark: SparkSession): - """ - Imports current contents of the event log for the pipeline with the specified `pipeline_id` + Note that is an expensive operation (it uses `MERGE` statements to achieve incrementalization) and should be used only if + direct write of the event log to a Delta table is not possible. """ - self.log.info(f"Merging changes from event log for pipeline {pipeline_id} ...") - merge_res_df = spark.sql(f""" + + def __init__( + self, + monitoring_catalog: str, + monitoring_schema: str, + imported_event_logs_table: str, + index_table_name: str = "pipeline_tags_index", + index_enabled: bool = True, + index_max_age_hours: int = 24, + api_fallback_enabled: bool = True, + wc: Optional[WorkspaceClient] = None, + ): + """ + Constructor. + :param monitoring_catalog: The catalog for the table with the imported event logs + :param monitoring_schema: The schema for the table with the imported event logs + :param imported_event_logs_table: The name of the table where the imported event logs are to be stored + :param index_table_name: The name of the pipeline tags index table + :param index_enabled: Whether to use the pipeline tags index + :param index_max_age_hours: Maximum age of the index (in hours) before falling back to API + :param api_fallback_enabled: Whether to fall back to API if index is unavailable/stale + :param wc: The WorkspaceClient to use; if none is specified, a new one will be instantiated + """ + if monitoring_catalog is None or len(monitoring_catalog.strip()) == 0: + raise ValueError("Monitoring catalog cannot be empty") + if monitoring_schema is None or len(monitoring_schema.strip()) == 0: + raise ValueError("Monitoring schema cannot be empty") + if imported_event_logs_table is None or len(imported_event_logs_table) == 0: + raise ValueError("Imported event logs table cannot be empty") + + self.monitoring_catalog = monitoring_catalog.strip() + self.monitoring_schema = monitoring_schema.strip() + self.imported_event_logs_table = imported_event_logs_table.strip() + self.imported_event_logs_table_fqname = f"`{self.monitoring_catalog}`.`{self.monitoring_schema}`.`{self.imported_event_logs_table}`" + self.index_table_fqn = f"`{self.monitoring_catalog}`.`{self.monitoring_schema}`.`{index_table_name}`" + self.index_enabled = index_enabled + self.index_max_age_hours = index_max_age_hours + self.api_fallback_enabled = api_fallback_enabled + self.wc = wc if wc else WorkspaceClient() + self.log = logging.getLogger("EventLogImporter") + + def create_target_table(self, spark: SparkSession): + """ + Creates the target table where the event logs will be imported if it does not exists. 
+ """ + spark.sql( + f"CREATE TABLE IF NOT EXISTS {self.imported_event_logs_table_fqname} ({SDP_EVENT_LOG_SCHEMA}) CLUSTER BY AUTO" + ) + + def import_event_log_for_one_pipeline(self, pipeline_id: str, spark: SparkSession): + """ + Imports current contents of the event log for the pipeline with the specified `pipeline_id` + """ + self.log.info(f"Merging changes from event log for pipeline {pipeline_id} ...") + merge_res_df = spark.sql(f""" MERGE INTO {self.imported_event_logs_table_fqname} AS t USING (SELECT * FROM event_log('{pipeline_id}')) as s ON t.origin.pipeline_id = s.origin.pipeline_id and t.id = s.id WHEN NOT MATCHED THEN INSERT * """) - merge_res_df.show(truncate=False) - latest_event_timestamp = spark.sql(f""" + merge_res_df.show(truncate=False) + latest_event_timestamp = spark.sql(f""" SELECT max(`timestamp`) FROM {self.imported_event_logs_table_fqname} WHERE origin.pipeline_id='{pipeline_id}' """).collect()[0][0] - self.log.info(f"Latest imported event for pipeline {pipeline_id} as of {latest_event_timestamp}") - - - def import_event_logs_for_pipelines(self, pipeline_ids: List[str], spark: SparkSession): - """ - Imports current contents of the event logs for the pipelines in the `pipeline_ids` list - """ - if len(pipeline_ids) == 0: - print("Nothing to import") - else: - for pipeline_id in pipeline_ids: - self.import_event_log_for_one_pipeline(pipeline_id, spark) + self.log.info( + f"Latest imported event for pipeline {pipeline_id} as of {latest_event_timestamp}" + ) + def import_event_logs_for_pipelines( + self, pipeline_ids: List[str], spark: SparkSession + ): + """ + Imports current contents of the event logs for the pipelines in the `pipeline_ids` list + """ + if len(pipeline_ids) == 0: + print("Nothing to import") + else: + for pipeline_id in pipeline_ids: + self.import_event_log_for_one_pipeline(pipeline_id, spark) + + def import_event_logs_for_pipelines_comma_list( + self, pipeline_ids_list: str, spark: SparkSession + ): + """ + Imports current contents of the event logs for the pipelines in comma-separated list in + `pipeline_ids_list`. This is primarily for use with notebook parameters. + """ + self.import_event_logs_for_pipelines( + parse_comma_separated_list(pipeline_ids_list), spark + ) - def import_event_logs_for_pipelines_comma_list(self, pipeline_ids_list: str, spark: SparkSession): - """ - Imports current contents of the event logs for the pipelines in comma-separated list in - `pipeline_ids_list`. This is primarily for use with notebook parameters. - """ - self.import_event_logs_for_pipelines(parse_comma_separated_list(pipeline_ids_list), spark) + def import_event_logs_for_pipelines_by_tags( + self, tags_str: str, spark: SparkSession + ): + """ + Imports current contents of the event logs for pipelines matching ANY of the specified tag:value pairs. 
+ :param tags_str: Comma-separated list of tag:value pairs (e.g., "env:prod,team:data") + :param spark: SparkSession instance + """ + tag_value_pairs = parse_tag_value_pairs(tags_str) + if not tag_value_pairs: + self.log.info("No tags specified for pipeline filtering") + return + + self.log.info(f"Fetching pipelines matching tags: {tags_str}") + pipeline_ids = get_pipeline_ids_by_tags(self.wc, tag_value_pairs, self.log) + + if not pipeline_ids: + self.log.warning(f"No pipelines found matching any of the tags: {tags_str}") + else: + self.log.info(f"Found {len(pipeline_ids)} pipeline(s) matching tags") + self.import_event_logs_for_pipelines(pipeline_ids, spark) + + def import_event_logs_for_pipelines_by_ids_and_tags( + self, pipeline_ids_list: str, tags_str: str, spark: SparkSession + ): + """ + Imports current contents of the event logs for pipelines specified by IDs or matching tags. + Pipelines matching either criteria will be included. + :param pipeline_ids_list: Comma-separated list of pipeline IDs + :param tags_str: Comma-separated list of tag:value pairs + :param spark: SparkSession instance + """ + # Collect pipeline IDs from explicit list + explicit_ids = ( + set(parse_comma_separated_list(pipeline_ids_list)) + if pipeline_ids_list + else set() + ) + # Collect pipeline IDs from tags + tag_value_pairs = parse_tag_value_pairs(tags_str) if tags_str else [] + tag_ids = ( + set( + get_pipeline_ids_by_tags( + self.wc, + tag_value_pairs, + spark=spark, + index_table_fqn=self.index_table_fqn, + index_enabled=self.index_enabled, + index_max_age_hours=self.index_max_age_hours, + api_fallback_enabled=self.api_fallback_enabled, + log=self.log, + ) + ) + if tag_value_pairs + else set() + ) - def import_event_logs_for_pipelines_by_tags(self, tags_str: str, spark: SparkSession): - """ - Imports current contents of the event logs for pipelines matching ANY of the specified tag:value pairs. - :param tags_str: Comma-separated list of tag:value pairs (e.g., "env:prod,team:data") - :param spark: SparkSession instance - """ - tag_value_pairs = parse_tag_value_pairs(tags_str) - if not tag_value_pairs: - self.log.info("No tags specified for pipeline filtering") - return + # Combine both sets + all_pipeline_ids = explicit_ids.union(tag_ids) - self.log.info(f"Fetching pipelines matching tags: {tags_str}") - pipeline_ids = get_pipeline_ids_by_tags(self.wc, tag_value_pairs, self.log) + if not all_pipeline_ids: + self.log.info("No pipelines specified (neither by ID nor by tags)") + return - if not pipeline_ids: - self.log.warning(f"No pipelines found matching any of the tags: {tags_str}") - else: - self.log.info(f"Found {len(pipeline_ids)} pipeline(s) matching tags") - self.import_event_logs_for_pipelines(pipeline_ids, spark) + self.log.info(f"Importing event logs for {len(all_pipeline_ids)} pipeline(s)") + self.import_event_logs_for_pipelines(list(all_pipeline_ids), spark) - def import_event_logs_for_pipelines_by_ids_and_tags(self, pipeline_ids_list: str, tags_str: str, spark: SparkSession): +class PipelineTagsIndexBuilder: """ - Imports current contents of the event logs for pipelines specified by IDs or matching tags. - Pipelines matching either criteria will be included. - :param pipeline_ids_list: Comma-separated list of pipeline IDs - :param tags_str: Comma-separated list of tag:value pairs - :param spark: SparkSession instance + A helper class to build an inverted index mapping pipeline tags to pipeline IDs. 
+ The index is stored in a Delta table and enables efficient discovery of pipelines by tags + without having to query the Databricks API for every pipeline. """ - # Collect pipeline IDs from explicit list - explicit_ids = set(parse_comma_separated_list(pipeline_ids_list)) if pipeline_ids_list else set() - - # Collect pipeline IDs from tags - tag_value_pairs = parse_tag_value_pairs(tags_str) if tags_str else [] - tag_ids = set(get_pipeline_ids_by_tags( - self.wc, - tag_value_pairs, - spark=spark, - index_table_fqn=self.index_table_fqn, - index_enabled=self.index_enabled, - index_max_age_hours=self.index_max_age_hours, - api_fallback_enabled=self.api_fallback_enabled, - log=self.log - )) if tag_value_pairs else set() - - # Combine both sets - all_pipeline_ids = explicit_ids.union(tag_ids) - - if not all_pipeline_ids: - self.log.info("No pipelines specified (neither by ID nor by tags)") - return - - self.log.info(f"Importing event logs for {len(all_pipeline_ids)} pipeline(s)") - self.import_event_logs_for_pipelines(list(all_pipeline_ids), spark) + def __init__( + self, + monitoring_catalog: str, + monitoring_schema: str, + index_table_name: str, + wc: Optional[WorkspaceClient] = None, + ): + """ + Constructor. + :param monitoring_catalog: The catalog for the index table + :param monitoring_schema: The schema for the index table + :param index_table_name: The name of the index table + :param wc: The WorkspaceClient to use; if none is specified, a new one will be instantiated + """ + self.monitoring_catalog = monitoring_catalog.strip() + self.monitoring_schema = monitoring_schema.strip() + self.index_table_name = index_table_name.strip() + self.index_table_fqn = f"`{self.monitoring_catalog}`.`{self.monitoring_schema}`.`{self.index_table_name}`" + self.wc = wc if wc else WorkspaceClient() + self.log = logging.getLogger("PipelineTagsIndexBuilder") + + def build_index(self, spark: SparkSession): + """ + Builds the pipeline tags index and writes it to a Delta table. + The index maps tag:value pairs to lists of pipeline IDs. + """ + from datetime import datetime + from pyspark.sql import Row + + self.log.info(f"Building pipeline tags index in table: {self.index_table_fqn}") + + # List all pipelines + self.log.info("Listing all pipelines...") + all_pipelines_id = [pi.pipeline_id for pi in self.wc.pipelines.list_pipelines()] + self.log.info(f"Found {len(all_pipelines_id)} pipelines") + + # Build inverted index: tag:value -> [pipeline_ids] + tags_index = {} # {(tag_key, tag_value): [pipeline_ids]} + processed_count = 0 + error_count = 0 + + for pipeline_id in all_pipelines_id: + try: + # Check if pipeline has tags + pipeline_tags = get_pipeline_tags(self.wc, pipeline_id) + if pipeline_tags: + # Add to inverted index + for tag_key, tag_value in pipeline_tags.items(): + key = (tag_key, tag_value) + if key not in tags_index: + tags_index[key] = [] + tags_index[key].append(pipeline_id) + + processed_count += 1 + if processed_count % 100 == 0: + self.log.info( + f"Processed {processed_count}/{len(all_pipelines_id)} pipelines..." 
+ ) + + except Exception as e: + error_count += 1 + self.log.warning(f"Failed to process pipeline {pipeline_id}: {e}") + continue + + self.log.info(f"Processed {processed_count} pipelines ({error_count} errors)") + self.log.info(f"Found {len(tags_index)} unique tag:value pairs") + + # Convert to DataFrame and write to Delta table + if tags_index: + # Create rows for the DataFrame + rows = [ + Row( + tag_key=tag_key, + tag_value=tag_value, + pipeline_ids=pipeline_ids, + last_updated=datetime.utcnow(), + ) + for (tag_key, tag_value), pipeline_ids in tags_index.items() + ] + + # Create DataFrame + df = spark.createDataFrame(rows) + + # Write to Delta table (overwrite to ensure freshness) + self.log.info(f"Writing index to {self.index_table_fqn}...") + df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable( + self.index_table_fqn + ) + + self.log.info( + f"Successfully built pipeline tags index with {len(tags_index)} entries" + ) + + else: + self.log.warning( + "No tags found in any pipelines. Index table will not be created/updated." + ) -class PipelineTagsIndexBuilder: - """ - A helper class to build an inverted index mapping pipeline tags to pipeline IDs. - The index is stored in a Delta table and enables efficient discovery of pipelines by tags - without having to query the Databricks API for every pipeline. - """ - def __init__(self, monitoring_catalog: str, monitoring_schema: str, index_table_name: str, wc: Optional[WorkspaceClient] = None): - """ - Constructor. - :param monitoring_catalog: The catalog for the index table - :param monitoring_schema: The schema for the index table - :param index_table_name: The name of the index table - :param wc: The WorkspaceClient to use; if none is specified, a new one will be instantiated +class DashboardTemplate: """ - self.monitoring_catalog = monitoring_catalog.strip() - self.monitoring_schema = monitoring_schema.strip() - self.index_table_name = index_table_name.strip() - self.index_table_fqn = f"`{self.monitoring_catalog}`.`{self.monitoring_schema}`.`{self.index_table_name}`" - self.wc = wc if wc else WorkspaceClient() - self.log = logging.getLogger("PipelineTagsIndexBuilder") - + A helper class to transform the definition of dashboard based on DAB configuration variables. This is a workaround as + currently AI/BI dashboards have limitted parametrization capabilites. - def build_index(self, spark: SparkSession): + Currently, the only transformation supported is setting the default catalog and schema for all datasets in the dashboard. """ - Builds the pipeline tags index and writes it to a Delta table. - The index maps tag:value pairs to lists of pipeline IDs. 
- """ - from datetime import datetime - from pyspark.sql import Row - - self.log.info(f"Building pipeline tags index in table: {self.index_table_fqn}") - - # List all pipelines - self.log.info("Listing all pipelines...") - all_pipelines_id = [pi.pipeline_id for pi in self.wc.pipelines.list_pipelines()] - self.log.info(f"Found {len(all_pipelines_id)} pipelines") - - # Build inverted index: tag:value -> [pipeline_ids] - tags_index = {} # {(tag_key, tag_value): [pipeline_ids]} - processed_count = 0 - error_count = 0 - - for pipeline_id in all_pipelines_id: - try: - # Check if pipeline has tags - pipeline_tags = get_pipeline_tags(self.wc, pipeline_id) - if pipeline_tags: - # Add to inverted index - for tag_key, tag_value in pipeline_tags.items(): - key = (tag_key, tag_value) - if key not in tags_index: - tags_index[key] = [] - tags_index[key].append(pipeline_id) - - processed_count += 1 - if processed_count % 100 == 0: - self.log.info(f"Processed {processed_count}/{len(all_pipelines_id)} pipelines...") - - except Exception as e: - error_count += 1 - self.log.warning(f"Failed to process pipeline {pipeline_id}: {e}") - continue - - self.log.info(f"Processed {processed_count} pipelines ({error_count} errors)") - self.log.info(f"Found {len(tags_index)} unique tag:value pairs") - - # Convert to DataFrame and write to Delta table - if tags_index: - # Create rows for the DataFrame - rows = [ - Row( - tag_key=tag_key, - tag_value=tag_value, - pipeline_ids=pipeline_ids, - last_updated=datetime.utcnow() + + def __init__( + self, + dashboard_template_path: str, + dashboard_id: Optional[str] = None, + published_dashboard_name: Optional[str] = None, + default_dataset_catalog: Optional[str] = None, + default_dataset_schema: Optional[str] = None, + warehouse_id: Optional[str] = None, + wc: Optional[WorkspaceClient] = None, + ): + """ + Constructor + :param dashboard_template_path: (required) the path to the `.lvdash.json` file of the dashboard to use as a template for publishing + :param dashboard_id: the name of the AI/BI dashboard to update if known. If not specified, the notebook will attempt to find a + dashboard with the specified `published_dashboard_name`. If none is found, a new one will be created. If + multiple such dashboards aere found, the notebook will fail with an error and an explicit dashboard_id must + be specified. + :param published_dashboard_name: (optional) the display name of the dashboard. If not specified, the name of the file (without + the .lvdash.json extension and "Template") will be used. + :param default_dataset_catalog: (optional) the default catalog for datasets to be set + :param default_dataset_schema: (optional) the detault schema for datasets to be set + :param warehouse_id: (optional) the ID of the warehouse to use for the AI/BI dashboard. If not specified, the first suitable one will be used. 
+ :param wc: the WorkspaceClient to use; if none is specified, a new one will be instantiated + """ + self.log = logging.getLogger("DashboardTemplate") + self.wc = wc if wc else WorkspaceClient() + self.dashboard_template_path = get_required_parameter( + name="dashboard_template_path", value=dashboard_template_path + ) + if not os.path.exists(dashboard_template_path): + raise ValueError( + f"Dashboard at path {dashboard_template_path} does not exist" + ) + self.dashboard_id = get_optional_parameter(dashboard_id) + self.published_dashboard_name = ( + published_dashboard_name + if is_parameter_defined(published_dashboard_name) + else self._extract_dashboard_name_from_path(dashboard_template_path) + ) + self.default_dataset_catalog = get_optional_parameter(default_dataset_catalog) + self.default_dataset_schema = get_optional_parameter(default_dataset_schema) + self.warehouse_id = ( + warehouse_id + if is_parameter_defined(warehouse_id) + else self._get_default_warehouse_id() + ) + if self.warehouse_id is None: + raise Exception( + "Unable to find a suitable warehouse for the AI/BI dashboard. Please set `warehouse_id` with the ID of the warehouse to use." + ) + + @staticmethod + def from_notebook_widgets(widgets, wc: Optional[WorkspaceClient] = None): + return DashboardTemplate( + dashboard_template_path=widgets.get("dashboard_template_path"), + dashboard_id=widgets.get("dashboard_id"), + published_dashboard_name=widgets.get("published_dashboard_name"), + default_dataset_catalog=widgets.get("default_dataset_catalog"), + default_dataset_schema=widgets.get("default_dataset_schema"), + warehouse_id=widgets.get("warehouse_id"), + wc=wc, ) - for (tag_key, tag_value), pipeline_ids in tags_index.items() - ] - # Create DataFrame - df = spark.createDataFrame(rows) + @staticmethod + def _extract_dashboard_name_from_path(dasboard_path: str) -> str: + display_name = os.path.basename(dashboard_path).replace(".lvdash.json", "") + return re.sub(r"\s+Template", "", display_name) - # Write to Delta table (overwrite to ensure freshness) - self.log.info(f"Writing index to {self.index_table_fqn}...") - df.write \ - .mode("overwrite") \ - .option("overwriteSchema", "true") \ - .saveAsTable(self.index_table_fqn) + def _get_default_warehouse_id(self): + warehouse_name = None + preferred_warehouse = min( + [w for w in self.wc.warehouses.list() if w.state == State.RUNNING], + key=lambda w: f"{'0' if w.enable_serverless_compute else '9'}{w.name}", + ) + if preferred_warehouse is not None: + self.log.info( + f"Using warehouse: {preferred_warehouse.name} ({preferred_warehouse.id})" + ) + return preferred_warehouse.id + else: + self.log.warn(f"No suitable warehouse found") + + def _find_all_dashboards_with_name(self, display_name: str): + dashboard_ids = [] + for d in self.wc.lakeview.list(): + if d.display_name == display_name: + dashboard_ids.append(d.dashboard_id) + self.log.info( + f"Found existing dashboard with display name '{d.display_name}' ({d.dashboard_id})" + ) + else: + self.log.debug( + f"Ignoring dashboard with display name '{d.display_name}' != '{display_name}'" + ) + return dashboard_ids - self.log.info(f"Successfully built pipeline tags index with {len(tags_index)} entries") + def _get_dashboard_id(self) -> Optional[str]: + if self.dashboard_id is not None: + return self.dashboard_id - else: - self.log.warning("No tags found in any pipelines. 
Index table will not be created/updated.") + candidate_ids = self._find_all_dashboards_with_name( + self.published_dashboard_name + ) + if len(candidate_ids) > 1: + raise ValueError( + f"Multiple dashboard found with display name {self.published_dashboard_name}. Please specify an explicit `dashboard_id`." + ) + return None if len(candidate_ids) == 0 else candidate_ids[0] + def _process_dataset(self, dataset_elem: dict): + dataset_elem["catalog"] = self.default_dataset_catalog + dataset_elem["schema"] = self.default_dataset_schema -class DashboardTemplate: - """ - A helper class to transform the definition of dashboard based on DAB configuration variables. This is a workaround as - currently AI/BI dashboards have limitted parametrization capabilites. - - Currently, the only transformation supported is setting the default catalog and schema for all datasets in the dashboard. - """ - def __init__(self, - dashboard_template_path: str, - dashboard_id: Optional[str] = None, - published_dashboard_name: Optional[str] = None, - default_dataset_catalog: Optional[str] = None, - default_dataset_schema: Optional[str] = None, - warehouse_id: Optional[str] = None, - wc: Optional[WorkspaceClient] = None): - """ - Constructor - :param dashboard_template_path: (required) the path to the `.lvdash.json` file of the dashboard to use as a template for publishing - :param dashboard_id: the name of the AI/BI dashboard to update if known. If not specified, the notebook will attempt to find a - dashboard with the specified `published_dashboard_name`. If none is found, a new one will be created. If - multiple such dashboards aere found, the notebook will fail with an error and an explicit dashboard_id must - be specified. - :param published_dashboard_name: (optional) the display name of the dashboard. If not specified, the name of the file (without - the .lvdash.json extension and "Template") will be used. - :param default_dataset_catalog: (optional) the default catalog for datasets to be set - :param default_dataset_schema: (optional) the detault schema for datasets to be set - :param warehouse_id: (optional) the ID of the warehouse to use for the AI/BI dashboard. If not specified, the first suitable one will be used. - :param wc: the WorkspaceClient to use; if none is specified, a new one will be instantiated - """ - self.log = logging.getLogger("DashboardTemplate") - self.wc = wc if wc else WorkspaceClient() - self.dashboard_template_path = get_required_parameter(name='dashboard_template_path', value=dashboard_template_path) - if not os.path.exists(dashboard_template_path): - raise ValueError(f"Dashboard at path {dashboard_template_path} does not exist") - self.dashboard_id = get_optional_parameter(dashboard_id) - self.published_dashboard_name = published_dashboard_name if is_parameter_defined(published_dashboard_name) else self._extract_dashboard_name_from_path(dashboard_template_path) - self.default_dataset_catalog = get_optional_parameter(default_dataset_catalog) - self.default_dataset_schema = get_optional_parameter(default_dataset_schema) - self.warehouse_id = warehouse_id if is_parameter_defined(warehouse_id) else self._get_default_warehouse_id() - if self.warehouse_id is None: - raise Exception("Unable to find a suitable warehouse for the AI/BI dashboard. 
Please set `warehouse_id` with the ID of the warehouse to use.") - - @staticmethod - def from_notebook_widgets(widgets, wc: Optional[WorkspaceClient] = None): - return DashboardTemplate(dashboard_template_path=widgets.get("dashboard_template_path"), - dashboard_id=widgets.get("dashboard_id"), - published_dashboard_name=widgets.get("published_dashboard_name"), - default_dataset_catalog=widgets.get("default_dataset_catalog"), - default_dataset_schema=widgets.get("default_dataset_schema"), - warehouse_id=widgets.get("warehouse_id"), - wc=wc) - - - @staticmethod - def _extract_dashboard_name_from_path(dasboard_path: str) -> str: - display_name = os.path.basename(dashboard_path).replace(".lvdash.json", "") - return re.sub(r'\s+Template', '', display_name) - - - def _get_default_warehouse_id(self): - warehouse_name = None - preferred_warehouse = min([w for w in self.wc.warehouses.list() if w.state == State.RUNNING], - key=lambda w: f"{'0' if w.enable_serverless_compute else '9'}{w.name}") - if preferred_warehouse is not None: - self.log.info(f"Using warehouse: {preferred_warehouse.name} ({preferred_warehouse.id})") - return preferred_warehouse.id - else: - self.log.warn(f"No suitable warehouse found") - - - def _find_all_dashboards_with_name(self, display_name: str): - dashboard_ids = [] - for d in self.wc.lakeview.list(): - if d.display_name == display_name: - dashboard_ids.append(d.dashboard_id) - self.log.info(f"Found existing dashboard with display name '{d.display_name}' ({d.dashboard_id})") - else: - self.log.debug(f"Ignoring dashboard with display name '{d.display_name}' != '{display_name}'") - return dashboard_ids - - - def _get_dashboard_id(self) -> Optional[str]: - if self.dashboard_id is not None: - return self.dashboard_id - - candidate_ids = self._find_all_dashboards_with_name(self.published_dashboard_name) - if len(candidate_ids) > 1: - raise ValueError(f"Multiple dashboard found with display name {self.published_dashboard_name}. 
Please specify an explicit `dashboard_id`.") - return None if len(candidate_ids) == 0 else candidate_ids[0] - - - def _process_dataset(self, dataset_elem: dict): - dataset_elem['catalog'] = self.default_dataset_catalog - dataset_elem['schema'] = self.default_dataset_schema - - - def publish(self): - """ - Publishes the dashboard - """ - with open(self.dashboard_template_path) as f: - dashboard_json = json.load(f) - - for ds in dashboard_json.get("datasets", []): - self._process_dataset(ds) - - real_dashboard_id = self._get_dashboard_id() - if real_dashboard_id is None: - self.log.info(f"Creating new dashboard with display name '{self.published_dashboard_name}'") - else: - self.log.info(f"Using existing dashboard with ID {real_dashboard_id}") - - d_json = { - "display_name": self.published_dashboard_name, - "serialized_dashboard": json.dumps(dashboard_json), - "warehouse_id": self.warehouse_id + def publish(self): + """ + Publishes the dashboard + """ + with open(self.dashboard_template_path) as f: + dashboard_json = json.load(f) + + for ds in dashboard_json.get("datasets", []): + self._process_dataset(ds) + + real_dashboard_id = self._get_dashboard_id() + if real_dashboard_id is None: + self.log.info( + f"Creating new dashboard with display name '{self.published_dashboard_name}'" + ) + else: + self.log.info(f"Using existing dashboard with ID {real_dashboard_id}") + + d_json = { + "display_name": self.published_dashboard_name, + "serialized_dashboard": json.dumps(dashboard_json), + "warehouse_id": self.warehouse_id, } - if real_dashboard_id is None: - d = self.wc.lakeview.create(dashboard=Dashboard.from_dict(d_json)) - self.log.info(f"Created dashboard '{d.display_name}' (ID={d.dashboard_id} ETAG={d.etag})") - real_dashboard_id = d.dashboard_id - else: - d_json["dashboard_id"] = real_dashboard_id - d = self.wc.lakeview.update(dashboard_id=real_dashboard_id, dashboard=Dashboard.from_dict(d_json)) - self.log.info(f"Updated dashboard '{d.display_name}' (ID={d.dashboard_id} ETAG={d.etag})") - - pd = self.wc.lakeview.publish(dashboard_id=real_dashboard_id, embed_credentials=True, warehouse_id=self.warehouse_id) - self.log.info(f"Published dashboard '{pd.display_name}' revision time {pd.revision_create_time}") - - + if real_dashboard_id is None: + d = self.wc.lakeview.create(dashboard=Dashboard.from_dict(d_json)) + self.log.info( + f"Created dashboard '{d.display_name}' (ID={d.dashboard_id} ETAG={d.etag})" + ) + real_dashboard_id = d.dashboard_id + else: + d_json["dashboard_id"] = real_dashboard_id + d = self.wc.lakeview.update( + dashboard_id=real_dashboard_id, dashboard=Dashboard.from_dict(d_json) + ) + self.log.info( + f"Updated dashboard '{d.display_name}' (ID={d.dashboard_id} ETAG={d.etag})" + ) + + pd = self.wc.lakeview.publish( + dashboard_id=real_dashboard_id, + embed_credentials=True, + warehouse_id=self.warehouse_id, + ) + self.log.info( + f"Published dashboard '{pd.display_name}' revision time {pd.revision_create_time}" + ) diff --git a/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/common_ldp.py b/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/common_ldp.py index 1b856d6..b40874c 100644 --- a/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/common_ldp.py +++ b/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/common_ldp.py @@ -13,343 +13,400 @@ from .common import parse_comma_separated_list from .standard_tables import ( - EVENT_LOGS_BRONZE, 
EVENTS_ERRORS, EVENTS_TABLE_METRICS, EVENTS_WARNINGS, - METRIC_PIPELINE_HOURLY_ERROR_RATE, - MONITORED_PIPELINES, MONITORED_TABLES, - PIPELINE_RUNS_STATUS, - PIPELINES_STATUS, PIPELINES_STATUS_SILVER, PIPELINE_TAGS_INDEX, - TABLE_EVENTS_EXPECTATION_CHECKS, - TABLE_STATUS_PER_PIPELINE_RUN, TABLE_STATUS - ) + EVENT_LOGS_BRONZE, + EVENTS_ERRORS, + EVENTS_TABLE_METRICS, + EVENTS_WARNINGS, + METRIC_PIPELINE_HOURLY_ERROR_RATE, + MONITORED_PIPELINES, + MONITORED_TABLES, + PIPELINE_RUNS_STATUS, + PIPELINES_STATUS, + PIPELINES_STATUS_SILVER, + PIPELINE_TAGS_INDEX, + TABLE_EVENTS_EXPECTATION_CHECKS, + TABLE_STATUS_PER_PIPELINE_RUN, + TABLE_STATUS, +) + def sanitize_string_for_dlt_name(s: str) -> str: - res = "" - for c in s: - if c == '.' or c == '-': - res += '_' - elif c != '`': - res += c - return res + res = "" + for c in s: + if c == "." or c == "-": + res += "_" + elif c != "`": + res += c + return res class Constants: - """ - Shared names and other constants - """ - # Shared table names - created_pipeline_runs="created_pipeline_runs" - standard_pipeline_runs="standard_pipeline_runs" + """ + Shared names and other constants + """ + + # Shared table names + created_pipeline_runs = "created_pipeline_runs" + standard_pipeline_runs = "standard_pipeline_runs" - # Miscellaneous - sql_fields_def_extension_point = "-- fields def extension point" - where_clause_extension_point = "-- where clause extension point" + # Miscellaneous + sql_fields_def_extension_point = "-- fields def extension point" + where_clause_extension_point = "-- where clause extension point" class Configuration: - """ - Base monitoring ETL pipeline configuration - """ - def __init__(self, conf: Dict[str, str]): - self.monitoring_catalog = self._required_string_param(conf, "monitoring_catalog") - self.monitoring_schema = self._required_string_param(conf, "monitoring_schema") - self.directly_monitored_pipeline_ids=conf.get("directly_monitored_pipeline_ids", "") - self.directly_monitored_pipeline_tags=conf.get("directly_monitored_pipeline_tags", "") - self.imported_event_log_tables = conf.get("imported_event_log_tables", "") - - # Pipeline tags index configuration - self.pipeline_tags_index_table_name = conf.get("pipeline_tags_index_table_name", "pipeline_tags_index") - self.pipeline_tags_index_enabled = conf.get("pipeline_tags_index_enabled", "true").lower() == "true" - self.pipeline_tags_index_max_age_hours = int(conf.get("pipeline_tags_index_max_age_hours", "24")) - self.pipeline_tags_index_api_fallback_enabled = conf.get("pipeline_tags_index_api_fallback_enabled", "true").lower() == "true" - - @staticmethod - def _required_string_param(conf: Dict[str, str], param_name: str): - val = conf.get(param_name) - if val is None or len(val.strip()) == 0: - raise ValueError(f"Missing required parameter '{param_name}'") - return val + """ + Base monitoring ETL pipeline configuration + """ + + def __init__(self, conf: Dict[str, str]): + self.monitoring_catalog = self._required_string_param( + conf, "monitoring_catalog" + ) + self.monitoring_schema = self._required_string_param(conf, "monitoring_schema") + self.directly_monitored_pipeline_ids = conf.get( + "directly_monitored_pipeline_ids", "" + ) + self.directly_monitored_pipeline_tags = conf.get( + "directly_monitored_pipeline_tags", "" + ) + self.imported_event_log_tables = conf.get("imported_event_log_tables", "") + + # Pipeline tags index configuration + self.pipeline_tags_index_table_name = conf.get( + "pipeline_tags_index_table_name", "pipeline_tags_index" + ) + 
self.pipeline_tags_index_enabled = ( + conf.get("pipeline_tags_index_enabled", "true").lower() == "true" + ) + self.pipeline_tags_index_max_age_hours = int( + conf.get("pipeline_tags_index_max_age_hours", "24") + ) + self.pipeline_tags_index_api_fallback_enabled = ( + conf.get("pipeline_tags_index_api_fallback_enabled", "true").lower() + == "true" + ) + + @staticmethod + def _required_string_param(conf: Dict[str, str], param_name: str): + val = conf.get(param_name) + if val is None or len(val.strip()) == 0: + raise ValueError(f"Missing required parameter '{param_name}'") + return val # A helper class to capture metadata about monitored pipelines PipelineInfo = namedtuple( - 'PipelineInfo', + "PipelineInfo", field_names=[ - "pipeline_id", - "pipeline_name", - "pipeline_link", - "pipeline_type", - "default_catalog", - "default_schema", - "event_log_source", - "tags_map", - "tags_array" - ] + "pipeline_id", + "pipeline_name", + "pipeline_link", + "pipeline_type", + "default_catalog", + "default_schema", + "event_log_source", + "tags_map", + "tags_array", + ], ) class MonitoringEtlPipeline: - """ - A helper class to keep track of monitored pipelines. - """ - - def __init__(self, conf: Configuration, spark: SparkSession): - self.conf = conf - self.spark = spark - self.monitored_pipeline_ids = [] - self.imported_event_log_tables = [] - # a dict from a pipeline id to a imported event log table for pipelines detected in these tables - self.other_pipeline_event_logs: Dict[str, str] = {} - # a dict from a monitored pipeline id to all metadata about this pipeline - self.pipeline_infos: Dict[str, PipelineInfo] = {} - # The set of all unique sources of event logs; this includes both tables with imported logs and also Delta event logs - self.event_log_sources: Set[str] = set() - self.wc = WorkspaceClient() - self.log = logging.getLogger("MonitoredPipelines") - self.event_log_source_views_mapping = {} - - # Automatically register pipelines in configuration - self.register_delta_event_logs_from_pipelines_comma_list(self.conf.directly_monitored_pipeline_ids) - self.register_delta_event_logs_from_pipelines_by_tags(self.conf.directly_monitored_pipeline_tags) - self.register_imported_logs_tables_from_comma_list(self.conf.imported_event_log_tables, spark) - - - def register_delta_event_logs_for_one_pipeline(self, pipeline_id: str): """ - Registers a pipeline that is being monitored. This method will extract all necessary metadata. + A helper class to keep track of monitored pipelines. 
""" - self.log.info(f"Detecting configuration for pipeline {pipeline_id} ...") - try: - spec = self.wc.api_client.do("GET", f"/api/2.0/pipelines/{pipeline_id}").get('spec', {}) - except ResourceDoesNotExist: - self.log.warn(f"Skipping pipeline {pipeline_id} that no longer exists...") - return + def __init__(self, conf: Configuration, spark: SparkSession): + self.conf = conf + self.spark = spark + self.monitored_pipeline_ids = [] + self.imported_event_log_tables = [] + # a dict from a pipeline id to a imported event log table for pipelines detected in these tables + self.other_pipeline_event_logs: Dict[str, str] = {} + # a dict from a monitored pipeline id to all metadata about this pipeline + self.pipeline_infos: Dict[str, PipelineInfo] = {} + # The set of all unique sources of event logs; this includes both tables with imported logs and also Delta event logs + self.event_log_sources: Set[str] = set() + self.wc = WorkspaceClient() + self.log = logging.getLogger("MonitoredPipelines") + self.event_log_source_views_mapping = {} + + # Automatically register pipelines in configuration + self.register_delta_event_logs_from_pipelines_comma_list( + self.conf.directly_monitored_pipeline_ids + ) + self.register_delta_event_logs_from_pipelines_by_tags( + self.conf.directly_monitored_pipeline_tags + ) + self.register_imported_logs_tables_from_comma_list( + self.conf.imported_event_log_tables, spark + ) + + def register_delta_event_logs_for_one_pipeline(self, pipeline_id: str): + """ + Registers a pipeline that is being monitored. This method will extract all necessary metadata. + """ - event_log_info = spec.get("event_log", {}) - if ('name' not in event_log_info) and (pipeline_id not in self.other_pipeline_event_logs): - raise Exception(f"""Pipeline {spec.get('name')} ({pipeline_id}) is not configured for Delta table event log and is not imported. + self.log.info(f"Detecting configuration for pipeline {pipeline_id} ...") + try: + spec = self.wc.api_client.do( + "GET", f"/api/2.0/pipelines/{pipeline_id}" + ).get("spec", {}) + except ResourceDoesNotExist: + self.log.warn(f"Skipping pipeline {pipeline_id} that no longer exists...") + return + + event_log_info = spec.get("event_log", {}) + if ("name" not in event_log_info) and ( + pipeline_id not in self.other_pipeline_event_logs + ): + raise Exception(f"""Pipeline {spec.get("name")} ({pipeline_id}) is not configured for Delta table event log and is not imported. 
Either configure the event log to be written to a Delta table or imported it using the import_event_logs job: {spec}""") - - if spec.get('gateway_definition') is not None: - pipeline_type = 'gateway' - elif spec.get('ingestion_definition') is not None: - pipeline_type = 'ingestion' - else: - pipeline_type = 'etl' - - event_log_source = ( - f"`{event_log_info['catalog']}`.`{event_log_info['schema']}`.`{event_log_info['name']}`" if 'name' in event_log_info - else self.other_pipeline_event_logs[pipeline_id] - ) - - # Extract tags from pipeline spec - tags = spec.get('tags', {}) - # Create a map representation of tags - tags_map = tags if tags else None - # Create an array of "tag:value" strings for AI/BI dashboard filtering - tags_array = [f"{k}:{v}" for k, v in tags.items()] if tags else None - - self.pipeline_infos[pipeline_id] = PipelineInfo(pipeline_id=pipeline_id, - pipeline_name=spec['name'], - pipeline_link=f"{spec['name']}", - pipeline_type=pipeline_type, - default_catalog = spec['catalog'], - default_schema = spec.get('schema', spec.get('target')), - event_log_source=event_log_source, - tags_map=tags_map, - tags_array=tags_array) - self.event_log_sources.add(event_log_source) - self.log.info(f"Registered pipeline {spec.get('name')} ({pipeline_id}) ...") - - - def register_delta_event_logs_for_pipelines(self, pipeline_ids: Iterable[str]): - """ - Registers a collection of pipelines that are being monitored. This method will extract all necessary metadata. - """ - for pipeline_id in pipeline_ids: - self.register_delta_event_logs_for_one_pipeline(pipeline_id=pipeline_id) + if spec.get("gateway_definition") is not None: + pipeline_type = "gateway" + elif spec.get("ingestion_definition") is not None: + pipeline_type = "ingestion" + else: + pipeline_type = "etl" + + event_log_source = ( + f"`{event_log_info['catalog']}`.`{event_log_info['schema']}`.`{event_log_info['name']}`" + if "name" in event_log_info + else self.other_pipeline_event_logs[pipeline_id] + ) - def register_delta_event_logs_from_pipelines_comma_list(self, pipelines_comma_list: str): - """ - Registers a list of pipelines that are being monitored as a comma-separted list. This is primarily to be - used with spark configuration and notebook parameters - """ - self.register_delta_event_logs_for_pipelines(parse_comma_separated_list(pipelines_comma_list)) + # Extract tags from pipeline spec + tags = spec.get("tags", {}) + # Create a map representation of tags + tags_map = tags if tags else None + # Create an array of "tag:value" strings for AI/BI dashboard filtering + tags_array = [f"{k}:{v}" for k, v in tags.items()] if tags else None + + self.pipeline_infos[pipeline_id] = PipelineInfo( + pipeline_id=pipeline_id, + pipeline_name=spec["name"], + pipeline_link=f"{spec['name']}", + pipeline_type=pipeline_type, + default_catalog=spec["catalog"], + default_schema=spec.get("schema", spec.get("target")), + event_log_source=event_log_source, + tags_map=tags_map, + tags_array=tags_array, + ) + self.event_log_sources.add(event_log_source) + self.log.info(f"Registered pipeline {spec.get('name')} ({pipeline_id}) ...") + def register_delta_event_logs_for_pipelines(self, pipeline_ids: Iterable[str]): + """ + Registers a collection of pipelines that are being monitored. This method will extract all necessary metadata. 
+ """ + for pipeline_id in pipeline_ids: + self.register_delta_event_logs_for_one_pipeline(pipeline_id=pipeline_id) - def register_delta_event_logs_from_pipelines_by_tags(self, tags_str: str): - """ - Registers pipelines that match ANY of the specified tag:value pairs for monitoring. - :param tags_str: Comma-separated list of tag:value pairs (e.g., "env:prod,team:data") - """ - from .common import parse_tag_value_pairs, get_pipeline_ids_by_tags - - tag_groups = parse_tag_value_pairs(tags_str) - if not tag_groups: - self.log.info("No tags specified for pipeline filtering") - return - - self.log.info(f"Fetching pipelines matching tags: {tags_str}") - - # Construct fully qualified table name for the index - index_table_fqn = f"`{self.conf.monitoring_catalog}`.`{self.conf.monitoring_schema}`.`{self.conf.pipeline_tags_index_table_name}`" - - pipeline_ids = get_pipeline_ids_by_tags( - wc=self.wc, - tag_groups=tag_groups, - spark=self.spark, - index_table_fqn=index_table_fqn, - index_enabled=self.conf.pipeline_tags_index_enabled, - index_max_age_hours=self.conf.pipeline_tags_index_max_age_hours, - api_fallback_enabled=self.conf.pipeline_tags_index_api_fallback_enabled, - log=self.log - ) - - if not pipeline_ids: - self.log.warning(f"No pipelines found matching any of the tags: {tags_str}") - else: - self.log.info(f"Found {len(pipeline_ids)} pipeline(s) matching tags, registering for monitoring") - self.register_delta_event_logs_for_pipelines(pipeline_ids) - - def register_one_imported_logs_table(self, imported_logs_table: str, spark: SparkSession): - """ - Detects all pipelines in an imported logs table - """ - if len(imported_logs_table.split('.')) < 3: - # Create a fully qualified name if it is not already - imported_logs_table = ( - f"`{self.conf.monitoring_catalog}`.`{self.conf.monitoring_schema}`.`{imported_logs_table}`" if imported_logs_table[0] != '`' - else f"`{self.conf.monitoring_catalog}`.`{self.conf.monitoring_schema}`.{imported_logs_table}" - ) - - self.log.info(f"Detecting pipelines in imported logs table log {imported_logs_table} ...") - self.imported_event_log_tables.append(imported_logs_table) - other_pipeline_ids = [ r.pipeline_id for r in spark.sql(f"SELECT DISTINCT origin.pipeline_id FROM {imported_logs_table}").collect()] - for pid in other_pipeline_ids: - self.other_pipeline_event_logs[pid] = imported_logs_table - self.register_delta_event_logs_for_one_pipeline(pipeline_id=pid) - - - def register_base_tables_and_views(self, spark: SparkSession): - """ - Registers a set of standard views and tables - """ - self.register_monitored_pipelines(spark) - if len(self.pipeline_infos) > 0: - self.register_event_log_source_views(spark) - self.register_created_pipeline_runs(spark) - self.register_event_logs_bronze(spark) - self.register_monitored_tables(spark) - self.register_pipeline_run_status(spark) - self.register_events_errors(spark) - self.register_events_warnings(spark) - self.register_metric_pipeline_hourly_error_rate(spark) - self.register_pipeline_status(spark) - self.register_events_table_metrics(spark) - self.register_table_status_per_pipeline_run(spark) - self.register_table_status(spark) - self.register_table_expectation_checks(spark) - - def register_imported_logs_tables(self, imported_logs_tables: Iterable[str], spark: SparkSession): - """ - Detects all pipelines in a collection of imported logs tables - """ - for imported_logs_table in imported_logs_tables: - self.register_one_imported_logs_table(imported_logs_table, spark) + def 
register_delta_event_logs_from_pipelines_comma_list( + self, pipelines_comma_list: str + ): + """ + Registers a list of pipelines that are being monitored as a comma-separted list. This is primarily to be + used with spark configuration and notebook parameters + """ + self.register_delta_event_logs_for_pipelines( + parse_comma_separated_list(pipelines_comma_list) + ) + def register_delta_event_logs_from_pipelines_by_tags(self, tags_str: str): + """ + Registers pipelines that match ANY of the specified tag:value pairs for monitoring. + :param tags_str: Comma-separated list of tag:value pairs (e.g., "env:prod,team:data") + """ + from .common import parse_tag_value_pairs, get_pipeline_ids_by_tags + + tag_groups = parse_tag_value_pairs(tags_str) + if not tag_groups: + self.log.info("No tags specified for pipeline filtering") + return + + self.log.info(f"Fetching pipelines matching tags: {tags_str}") + + # Construct fully qualified table name for the index + index_table_fqn = f"`{self.conf.monitoring_catalog}`.`{self.conf.monitoring_schema}`.`{self.conf.pipeline_tags_index_table_name}`" + + pipeline_ids = get_pipeline_ids_by_tags( + wc=self.wc, + tag_groups=tag_groups, + spark=self.spark, + index_table_fqn=index_table_fqn, + index_enabled=self.conf.pipeline_tags_index_enabled, + index_max_age_hours=self.conf.pipeline_tags_index_max_age_hours, + api_fallback_enabled=self.conf.pipeline_tags_index_api_fallback_enabled, + log=self.log, + ) - def register_imported_logs_tables_from_comma_list(self, imported_logs_tables_comma_list: str, spark: SparkSession): - """ - Detects all pipelines in a comma-separated of imported logs table - """ - self.register_imported_logs_tables(parse_comma_separated_list(imported_logs_tables_comma_list), spark) + if not pipeline_ids: + self.log.warning(f"No pipelines found matching any of the tags: {tags_str}") + else: + self.log.info( + f"Found {len(pipeline_ids)} pipeline(s) matching tags, registering for monitoring" + ) + self.register_delta_event_logs_for_pipelines(pipeline_ids) + + def register_one_imported_logs_table( + self, imported_logs_table: str, spark: SparkSession + ): + """ + Detects all pipelines in an imported logs table + """ + if len(imported_logs_table.split(".")) < 3: + # Create a fully qualified name if it is not already + imported_logs_table = ( + f"`{self.conf.monitoring_catalog}`.`{self.conf.monitoring_schema}`.`{imported_logs_table}`" + if imported_logs_table[0] != "`" + else f"`{self.conf.monitoring_catalog}`.`{self.conf.monitoring_schema}`.{imported_logs_table}" + ) + + self.log.info( + f"Detecting pipelines in imported logs table log {imported_logs_table} ..." 
+ ) + self.imported_event_log_tables.append(imported_logs_table) + other_pipeline_ids = [ + r.pipeline_id + for r in spark.sql( + f"SELECT DISTINCT origin.pipeline_id FROM {imported_logs_table}" + ).collect() + ] + for pid in other_pipeline_ids: + self.other_pipeline_event_logs[pid] = imported_logs_table + self.register_delta_event_logs_for_one_pipeline(pipeline_id=pid) + + def register_base_tables_and_views(self, spark: SparkSession): + """ + Registers a set of standard views and tables + """ + self.register_monitored_pipelines(spark) + if len(self.pipeline_infos) > 0: + self.register_event_log_source_views(spark) + self.register_created_pipeline_runs(spark) + self.register_event_logs_bronze(spark) + self.register_monitored_tables(spark) + self.register_pipeline_run_status(spark) + self.register_events_errors(spark) + self.register_events_warnings(spark) + self.register_metric_pipeline_hourly_error_rate(spark) + self.register_pipeline_status(spark) + self.register_events_table_metrics(spark) + self.register_table_status_per_pipeline_run(spark) + self.register_table_status(spark) + self.register_table_expectation_checks(spark) + + def register_imported_logs_tables( + self, imported_logs_tables: Iterable[str], spark: SparkSession + ): + """ + Detects all pipelines in a collection of imported logs tables + """ + for imported_logs_table in imported_logs_tables: + self.register_one_imported_logs_table(imported_logs_table, spark) + + def register_imported_logs_tables_from_comma_list( + self, imported_logs_tables_comma_list: str, spark: SparkSession + ): + """ + Detects all pipelines in a comma-separated of imported logs table + """ + self.register_imported_logs_tables( + parse_comma_separated_list(imported_logs_tables_comma_list), spark + ) + def register_monitored_pipelines(self, spark: SparkSession): + @dlt.table( + name=MONITORED_PIPELINES.name, + cluster_by=["pipeline_id"], + comment=MONITORED_PIPELINES.table_comment, + table_properties={"delta.enableRowTracking": "true"}, + ) + def monitored_pipelines(): + return spark.createDataFrame( + self.pipeline_infos.values(), + schema="pipeline_id STRING, pipeline_name STRING, pipeline_link STRING, pipeline_type STRING, default_catalog STRING, default_schema STRING, event_log_source STRING, tags_map MAP, tags_array ARRAY", + ) - def register_monitored_pipelines(self, spark: SparkSession): - @dlt.table(name=MONITORED_PIPELINES.name, - cluster_by=['pipeline_id'], - comment=MONITORED_PIPELINES.table_comment, - table_properties={ - "delta.enableRowTracking": "true" - }) - def monitored_pipelines(): - return spark.createDataFrame( - self.pipeline_infos.values(), - schema="pipeline_id STRING, pipeline_name STRING, pipeline_link STRING, pipeline_type STRING, default_catalog STRING, default_schema STRING, event_log_source STRING, tags_map MAP, tags_array ARRAY" - ) + def register_event_log_source_views(self, spark: SparkSession) -> Dict[str, str]: + """ + Generates a view for each event log table. We need to ensure that "skipChangeCommits" is set to true + so we don't break if modification or deletions are done in those tables. + :return: A mapping from event logs source table to its corresponding view + """ - def register_event_log_source_views(self, spark: SparkSession) -> Dict[str, str]: - """ - Generates a view for each event log table. We need to ensure that "skipChangeCommits" is set to true - so we don't break if modification or deletions are done in those tables. 
+ def create_event_log_source_view(event_log_source: str) -> str: + view_name = f"source_{sanitize_string_for_dlt_name(event_log_source)}" + print( + f"Defining source view {view_name} for event log source {event_log_source}" + ) - :return: A mapping from event logs source table to its corresponding view - """ - def create_event_log_source_view(event_log_source: str) -> str: - view_name = f"source_{sanitize_string_for_dlt_name(event_log_source)}" - print(f"Defining source view {view_name} for event log source {event_log_source}") - - @dlt.view(name=view_name) - def event_logs_source_view(): - return spark.readStream.option("skipChangeCommits", "true").table(event_log_source) - - return view_name - - self.event_log_source_views_mapping = { - event_log_source: create_event_log_source_view(event_log_source) - for event_log_source in self.event_log_sources} - - return self.event_log_source_views_mapping - - - def transfom_and_append_event_log_sources(self, - target: str, - flow_prefix: str, - append_def: Callable[[str], DataFrame]): - """ - Creates append flows per event log source into a target table or sink + @dlt.view(name=view_name) + def event_logs_source_view(): + return spark.readStream.option("skipChangeCommits", "true").table( + event_log_source + ) - :param target: the name of the target table or sink - :param flow_prefix: the string to prepend to the name of each flow into the target - :param append_def: a function that defines the append flow; it takes the name of the event log - stream source as a parameter - """ + return view_name - def process_el_source(el_source: str): - flow_name = f"{flow_prefix}_{sanitize_string_for_dlt_name(el_source)}" - log_source = f"STREAM(`{self.event_log_source_views_mapping[el_source]}`)" - print(f"Defining event log flow {flow_name} from {log_source} into {target}") + self.event_log_source_views_mapping = { + event_log_source: create_event_log_source_view(event_log_source) + for event_log_source in self.event_log_sources + } - @dlt.append_flow(name=flow_name, target=target) - def el_append_flow(): - return append_def(log_source) + return self.event_log_source_views_mapping - for el_source in self.event_log_sources: - process_el_source(el_source) + def transfom_and_append_event_log_sources( + self, target: str, flow_prefix: str, append_def: Callable[[str], DataFrame] + ): + """ + Creates append flows per event log source into a target table or sink + :param target: the name of the target table or sink + :param flow_prefix: the string to prepend to the name of each flow into the target + :param append_def: a function that defines the append flow; it takes the name of the event log + stream source as a parameter + """ - def register_created_pipeline_runs(self, spark: SparkSession): - """ - Creates a table and a view of all basic metadata about all pipeline runs detected in event logs of monitored - pipelines. This allows to easily filter out runs that are not part of the normal data processing. 
- """ + def process_el_source(el_source: str): + flow_name = f"{flow_prefix}_{sanitize_string_for_dlt_name(el_source)}" + log_source = f"STREAM(`{self.event_log_source_views_mapping[el_source]}`)" + print( + f"Defining event log flow {flow_name} from {log_source} into {target}" + ) + + @dlt.append_flow(name=flow_name, target=target) + def el_append_flow(): + return append_def(log_source) - dlt.create_streaming_table(name=Constants.created_pipeline_runs, - cluster_by=['pipeline_id', 'pipeline_run_id'], - comment=""" + for el_source in self.event_log_sources: + process_el_source(el_source) + + def register_created_pipeline_runs(self, spark: SparkSession): + """ + Creates a table and a view of all basic metadata about all pipeline runs detected in event logs of monitored + pipelines. This allows to easily filter out runs that are not part of the normal data processing. + """ + + dlt.create_streaming_table( + name=Constants.created_pipeline_runs, + cluster_by=["pipeline_id", "pipeline_run_id"], + comment=""" A table to keep track of created pipeline runs with some metadata about each one. It is used filter out runs that are not part of the normal data processing. """, - table_properties={ - "delta.enableRowTracking": "true" - }) - - # Definition for flows from event log sources into `created_pipeline_runs` - def append_to_created_pipeline_runs(event_log_source: str): - details_partial_schema="STRUCT>" - return spark.sql(f""" + table_properties={"delta.enableRowTracking": "true"}, + ) + + # Definition for flows from event log sources into `created_pipeline_runs` + def append_to_created_pipeline_runs(event_log_source: str): + details_partial_schema = "STRUCT>" + return spark.sql(f""" SELECT pipeline_id, pipeline_run_id, create_time, @@ -366,26 +423,26 @@ def append_to_created_pipeline_runs(event_log_source: str): WHERE event_type == 'create_update') """) - @dlt.view(name=Constants.standard_pipeline_runs) - def generate_standard_pipeline_runs(): - return spark.sql(f""" + @dlt.view(name=Constants.standard_pipeline_runs) + def generate_standard_pipeline_runs(): + return spark.sql(f""" SELECT pipeline_id, pipeline_run_id FROM `{Constants.created_pipeline_runs}` WHERE NOT is_internal_run """) - self.transfom_and_append_event_log_sources( - target=Constants.created_pipeline_runs, - flow_prefix='cpr', - append_def=append_to_created_pipeline_runs) - + self.transfom_and_append_event_log_sources( + target=Constants.created_pipeline_runs, + flow_prefix="cpr", + append_def=append_to_created_pipeline_runs, + ) - def _get_event_logs_bronze_sql(self, event_log_source: str): - """ - Base definition for append flows from the event log sources into `event_logs_bronze` table. Subclasses can override - this and replace {Constants.sql_fields_def_extension_point} with additional fiels they want to include - """ - return f""" + def _get_event_logs_bronze_sql(self, event_log_source: str): + """ + Base definition for append flows from the event log sources into `event_logs_bronze` table. Subclasses can override + this and replace {Constants.sql_fields_def_extension_point} with additional fiels they want to include + """ + return f""" SELECT id, seq_num, pipeline_id, @@ -445,45 +502,50 @@ def _get_event_logs_bronze_sql(self, event_log_source: str): ) """ + def register_event_logs_bronze(self, spark: SparkSession): + """ + Registers tables and views for the bronze layer of the event logs that contains basic common event log + filters and transformations. This is the root source for most of observability tables. 
+ """ - def register_event_logs_bronze(self, spark: SparkSession): - """ - Registers tables and views for the bronze layer of the event logs that contains basic common event log - filters and transformations. This is the root source for most of observability tables. - """ - - def qualify_table_name_if_needed(table_name: str, default_catalog: str, default_schema: str) -> str: - """ - Event logs sometimes contain a fully qualified table name and sometimes just the base name. This - helper UDF uses the pipeline's default catalog and schema and would include those to unqualified - table names. - """ - if table_name is None or default_catalog is None or table_name.find('.') >= 0: - return table_name - return f"{default_catalog}.{default_schema}.{table_name}" - # Comment out due to ES-1633439 - # spark.udf.register("qualify_table_name_if_needed", qualify_table_name_if_needed) - - # Create a helper table to map flows to target table as the target table names are currently not included - # in the event log consistently - dlt.create_streaming_table( - name="flow_targets", - cluster_by=["pipeline_id", "pipeline_run_id", "flow_name"], - comment="""Keeps track of the target tables for each flow so we can attribute flow_progress events to + def qualify_table_name_if_needed( + table_name: str, default_catalog: str, default_schema: str + ) -> str: + """ + Event logs sometimes contain a fully qualified table name and sometimes just the base name. This + helper UDF uses the pipeline's default catalog and schema and would include those to unqualified + table names. + """ + if ( + table_name is None + or default_catalog is None + or table_name.find(".") >= 0 + ): + return table_name + return f"{default_catalog}.{default_schema}.{table_name}" + + # Comment out due to ES-1633439 + # spark.udf.register("qualify_table_name_if_needed", qualify_table_name_if_needed) + + # Create a helper table to map flows to target table as the target table names are currently not included + # in the event log consistently + dlt.create_streaming_table( + name="flow_targets", + cluster_by=["pipeline_id", "pipeline_run_id", "flow_name"], + comment="""Keeps track of the target tables for each flow so we can attribute flow_progress events to specific tables. 
""", - table_properties={ - "delta.enableRowTracking": "true" - }) - - # The common transformation of event log sources going into the `flow_targets` table - def append_to_flow_targets(event_log_source: str): - partial_flow_definition_details_schema = """STRUCT>>, schema_json: STRING, spark_conf: ARRAY> > >""" - return spark.sql(f""" + return spark.sql(f""" SELECT pipeline_id, pipeline_run_id, flow_name, @@ -505,56 +567,57 @@ def append_to_flow_targets(event_log_source: str): WHERE event_type='flow_definition') AS fd LEFT JOIN {MONITORED_PIPELINES.name} as mp USING (pipeline_id) """) - - self.transfom_and_append_event_log_sources( - target="flow_targets", - flow_prefix='ft', - append_def=append_to_flow_targets) - - dlt.create_streaming_table(name=EVENT_LOGS_BRONZE.name, - cluster_by=['pipeline_id', 'pipeline_run_id', 'table_name'], - comment=EVENT_LOGS_BRONZE.table_comment, - table_properties={ - "delta.enableRowTracking": "true", - 'delta.feature.variantType-preview': 'supported' - }) - - # Definition of the transformations from the event logs sources into `event_logs_bronze` - def append_to_event_logs_bronze(event_log_source: str): - return spark.sql(self._get_event_logs_bronze_sql(event_log_source)) - - self.transfom_and_append_event_log_sources( - target=EVENT_LOGS_BRONZE.name, - flow_prefix="elb", - append_def=append_to_event_logs_bronze) - - def register_monitored_tables(self, spark: SparkSession): - @dlt.table( - name=MONITORED_TABLES.name, - comment=MONITORED_TABLES.table_comment, - table_properties={ - "delta.enableRowTracking": "true" - }) - def monitored_tables(): - return spark.sql(f""" + + self.transfom_and_append_event_log_sources( + target="flow_targets", flow_prefix="ft", append_def=append_to_flow_targets + ) + + dlt.create_streaming_table( + name=EVENT_LOGS_BRONZE.name, + cluster_by=["pipeline_id", "pipeline_run_id", "table_name"], + comment=EVENT_LOGS_BRONZE.table_comment, + table_properties={ + "delta.enableRowTracking": "true", + "delta.feature.variantType-preview": "supported", + }, + ) + + # Definition of the transformations from the event logs sources into `event_logs_bronze` + def append_to_event_logs_bronze(event_log_source: str): + return spark.sql(self._get_event_logs_bronze_sql(event_log_source)) + + self.transfom_and_append_event_log_sources( + target=EVENT_LOGS_BRONZE.name, + flow_prefix="elb", + append_def=append_to_event_logs_bronze, + ) + + def register_monitored_tables(self, spark: SparkSession): + @dlt.table( + name=MONITORED_TABLES.name, + comment=MONITORED_TABLES.table_comment, + table_properties={"delta.enableRowTracking": "true"}, + ) + def monitored_tables(): + return spark.sql(f""" SELECT DISTINCT pipeline_id, table_name FROM `{EVENT_LOGS_BRONZE.name}` WHERE table_name is not null """) - - def register_pipeline_run_status(self, spark: SparkSession): - """ - Register the flows and tables needed to maintain the latest status of runs of monitored pipelines. - """ - # We filter update_progress event from pipeline runs and use apply_changes() to maintain the latest status of each pipeline run - source_view_name = f"{PIPELINE_RUNS_STATUS.name}_source" - - @dlt.view(name=source_view_name) - def pipeline_runs_status_source(): - """ - Generates an apply_changes() stream for pipeline_updates_agg - """ - return spark.sql(f""" + + def register_pipeline_run_status(self, spark: SparkSession): + """ + Register the flows and tables needed to maintain the latest status of runs of monitored pipelines. 
+ """ + # We filter update_progress event from pipeline runs and use apply_changes() to maintain the latest status of each pipeline run + source_view_name = f"{PIPELINE_RUNS_STATUS.name}_source" + + @dlt.view(name=source_view_name) + def pipeline_runs_status_source(): + """ + Generates an apply_changes() stream for pipeline_updates_agg + """ + return spark.sql(f""" SELECT *, ('' || latest_state || '') AS latest_state_with_color FROM (SELECT pipeline_id, @@ -605,24 +668,27 @@ def pipeline_runs_status_source(): FROM STREAM(`{EVENT_LOGS_BRONZE.name}`)) WHERE event_type == 'create_update' OR event_type == 'update_progress') """) - - dlt.create_streaming_table(name=PIPELINE_RUNS_STATUS.name, - cluster_by=['pipeline_id', 'pipeline_run_id'], - comment=PIPELINE_RUNS_STATUS.table_comment, - table_properties={ - "delta.enableRowTracking": "true", - "delta.enableChangeDataFeed": "true" - }) - dlt.apply_changes( + + dlt.create_streaming_table( + name=PIPELINE_RUNS_STATUS.name, + cluster_by=["pipeline_id", "pipeline_run_id"], + comment=PIPELINE_RUNS_STATUS.table_comment, + table_properties={ + "delta.enableRowTracking": "true", + "delta.enableChangeDataFeed": "true", + }, + ) + dlt.apply_changes( source=source_view_name, target=PIPELINE_RUNS_STATUS.name, - keys = ["pipeline_id", "pipeline_run_id"], - sequence_by = "seq_num", - except_column_list = ['seq_num'], - ignore_null_updates = True) - - def _get_events_errors_sql(self): - return f""" + keys=["pipeline_id", "pipeline_run_id"], + sequence_by="seq_num", + except_column_list=["seq_num"], + ignore_null_updates=True, + ) + + def _get_events_errors_sql(self): + return f""" SELECT pipeline_id, pipeline_run_id, pipeline_run_link, @@ -637,18 +703,18 @@ def _get_events_errors_sql(self): WHERE error_full is not null or level="ERROR" """ - def register_events_errors(self, spark: SparkSession): - @dlt.table(name=EVENTS_ERRORS.name, - cluster_by=["pipeline_id", "pipeline_run_id"], - comment=EVENTS_ERRORS.table_comment, - table_properties={ - "delta.enableRowTracking": "true" - }) - def generate_events_errors(): - return spark.sql(self._get_events_errors_sql()) - - def _get_events_warnings_sql(self): - return f""" + def register_events_errors(self, spark: SparkSession): + @dlt.table( + name=EVENTS_ERRORS.name, + cluster_by=["pipeline_id", "pipeline_run_id"], + comment=EVENTS_ERRORS.table_comment, + table_properties={"delta.enableRowTracking": "true"}, + ) + def generate_events_errors(): + return spark.sql(self._get_events_errors_sql()) + + def _get_events_warnings_sql(self): + return f""" SELECT pipeline_id, pipeline_run_id, pipeline_run_link, @@ -660,25 +726,25 @@ def _get_events_warnings_sql(self): WHERE level="WARN" """ - def register_events_warnings(self, spark: SparkSession): - @dlt.table(name=EVENTS_WARNINGS.name, - cluster_by=["pipeline_id", "pipeline_run_id"], - comment=EVENTS_WARNINGS.table_comment, - table_properties={ - "delta.enableRowTracking": "true" - }) - def generate_events_warnings(): - return spark.sql(self._get_events_warnings_sql()) - - def register_metric_pipeline_hourly_error_rate(self, spark: SparkSession): - @dlt.table(name=METRIC_PIPELINE_HOURLY_ERROR_RATE.name, - comment=METRIC_PIPELINE_HOURLY_ERROR_RATE.table_comment, - cluster_by=['pipeline_id'], - table_properties={ - "delta.enableRowTracking": "true" - }) - def generate_metric_pipeline_hourly_error_rate(): - return spark.sql(f""" + def register_events_warnings(self, spark: SparkSession): + @dlt.table( + name=EVENTS_WARNINGS.name, + cluster_by=["pipeline_id", "pipeline_run_id"], + 
comment=EVENTS_WARNINGS.table_comment, + table_properties={"delta.enableRowTracking": "true"}, + ) + def generate_events_warnings(): + return spark.sql(self._get_events_warnings_sql()) + + def register_metric_pipeline_hourly_error_rate(self, spark: SparkSession): + @dlt.table( + name=METRIC_PIPELINE_HOURLY_ERROR_RATE.name, + comment=METRIC_PIPELINE_HOURLY_ERROR_RATE.table_comment, + cluster_by=["pipeline_id"], + table_properties={"delta.enableRowTracking": "true"}, + ) + def generate_metric_pipeline_hourly_error_rate(): + return spark.sql(f""" SELECT pipeline_id, date_trunc('hour', event_timestamp) AS hour, count(*) FILTER (WHERE level='ERROR' OR error_full IS NOT NULL) AS num_errors @@ -686,28 +752,28 @@ def generate_metric_pipeline_hourly_error_rate(): GROUP BY 1, 2 """) - def register_pipeline_status(self, spark: SparkSession): - pipeline_runs_status_fqname=f"{self.conf.monitoring_catalog}.{self.conf.monitoring_schema}.{PIPELINE_RUNS_STATUS.name}" - - @dlt.view(name=f"{PIPELINE_RUNS_STATUS.name}_cdf") - def pipeline_runs_status_cdf(): - return ( - spark.readStream - .option("readChangeFeed", "true") - .table(PIPELINE_RUNS_STATUS.name) - .filter("_change_type IN ('insert', 'update_postimage')") + def register_pipeline_status(self, spark: SparkSession): + pipeline_runs_status_fqname = f"{self.conf.monitoring_catalog}.{self.conf.monitoring_schema}.{PIPELINE_RUNS_STATUS.name}" + + @dlt.view(name=f"{PIPELINE_RUNS_STATUS.name}_cdf") + def pipeline_runs_status_cdf(): + return ( + spark.readStream.option("readChangeFeed", "true") + .table(PIPELINE_RUNS_STATUS.name) + .filter("_change_type IN ('insert', 'update_postimage')") + ) + + dlt.create_streaming_table( + name=PIPELINES_STATUS_SILVER.name, + cluster_by=["pipeline_id"], + comment=PIPELINES_STATUS_SILVER.table_comment, + table_properties={"delta.enableRowTracking": "true"}, ) + latest_runs_view_name = f"{PIPELINE_RUNS_STATUS.name}_latest" - dlt.create_streaming_table(name=PIPELINES_STATUS_SILVER.name, - cluster_by = ["pipeline_id"], - comment=PIPELINES_STATUS_SILVER.table_comment, - table_properties={ - "delta.enableRowTracking": "true" - }) - latest_runs_view_name = f"{PIPELINE_RUNS_STATUS.name}_latest" - @dlt.view(name=latest_runs_view_name) - def latest_pipeline_run_progress(): - return spark.sql(f""" + @dlt.view(name=latest_runs_view_name) + def latest_pipeline_run_progress(): + return spark.sql(f""" SELECT pipeline_id, pipeline_run_id as latest_pipeline_run_id, pipeline_run_link as latest_pipeline_run_link, @@ -737,19 +803,21 @@ def latest_pipeline_run_progress(): updated_at FROM STREAM(`{PIPELINE_RUNS_STATUS.name}_cdf`) """) - dlt.create_auto_cdc_flow( - name=f"apply_{latest_runs_view_name}", - source=latest_runs_view_name, - target=PIPELINES_STATUS_SILVER.name, - keys=['pipeline_id'], - sequence_by='updated_at', - ignore_null_updates=True - ) - - successful_runs_view_name = f"{PIPELINE_RUNS_STATUS.name}_successful" - @dlt.view(name=successful_runs_view_name) - def latest_pipeline_successful_run(): - return spark.sql(f""" + + dlt.create_auto_cdc_flow( + name=f"apply_{latest_runs_view_name}", + source=latest_runs_view_name, + target=PIPELINES_STATUS_SILVER.name, + keys=["pipeline_id"], + sequence_by="updated_at", + ignore_null_updates=True, + ) + + successful_runs_view_name = f"{PIPELINE_RUNS_STATUS.name}_successful" + + @dlt.view(name=successful_runs_view_name) + def latest_pipeline_successful_run(): + return spark.sql(f""" SELECT pipeline_id, null as latest_pipeline_run_id, null as latest_pipeline_run_link, @@ -779,20 +847,21 @@ def 
latest_pipeline_successful_run(): FROM STREAM(`{PIPELINE_RUNS_STATUS.name}_cdf`) WHERE latest_state == 'COMPLETED' """) - - dlt.create_auto_cdc_flow( - name=f"apply_{successful_runs_view_name}", - source=successful_runs_view_name, - target=PIPELINES_STATUS_SILVER.name, - keys=['pipeline_id'], - sequence_by='updated_at', - ignore_null_updates=True - ) - - failed_runs_view_name = f"{PIPELINE_RUNS_STATUS.name}_failed" - @dlt.view(name=failed_runs_view_name) - def latest_pipeline_failed_run(): - return spark.sql(f""" + + dlt.create_auto_cdc_flow( + name=f"apply_{successful_runs_view_name}", + source=successful_runs_view_name, + target=PIPELINES_STATUS_SILVER.name, + keys=["pipeline_id"], + sequence_by="updated_at", + ignore_null_updates=True, + ) + + failed_runs_view_name = f"{PIPELINE_RUNS_STATUS.name}_failed" + + @dlt.view(name=failed_runs_view_name) + def latest_pipeline_failed_run(): + return spark.sql(f""" SELECT pipeline_id, null as latest_pipeline_run_id, null as latest_pipeline_run_link, @@ -823,24 +892,24 @@ def latest_pipeline_failed_run(): FROM STREAM(`{PIPELINE_RUNS_STATUS.name}_cdf`) WHERE latest_state == 'FAILED' """) - - dlt.create_auto_cdc_flow( - name=f"apply_{failed_runs_view_name}", - source=failed_runs_view_name, - target=PIPELINES_STATUS_SILVER.name, - keys=['pipeline_id'], - sequence_by='updated_at', - ignore_null_updates=True - ) - - @dlt.table(name=PIPELINES_STATUS.name, - comment=PIPELINES_STATUS.table_comment, - cluster_by=['pipeline_id'], - table_properties={ - "delta.enableRowTracking": "true" - }) - def pipeline_status(): - return spark.sql(f""" + + dlt.create_auto_cdc_flow( + name=f"apply_{failed_runs_view_name}", + source=failed_runs_view_name, + target=PIPELINES_STATUS_SILVER.name, + keys=["pipeline_id"], + sequence_by="updated_at", + ignore_null_updates=True, + ) + + @dlt.table( + name=PIPELINES_STATUS.name, + comment=PIPELINES_STATUS.table_comment, + cluster_by=["pipeline_id"], + table_properties={"delta.enableRowTracking": "true"}, + ) + def pipeline_status(): + return spark.sql(f""" SELECT latest.*, ifnull(pe.num_errors, 0) latest_pipeline_run_num_errors, ifnull(pw.num_warnings, 0) latest_pipeline_run_num_warnings @@ -856,9 +925,9 @@ def pipeline_status(): GROUP BY 1, 2 ) as pw ON latest.pipeline_id = pw.pipeline_id and latest.latest_pipeline_run_id = pw.pipeline_run_id """) - - def _get_events_table_metrics_sql(self): - return f""" + + def _get_events_table_metrics_sql(self): + return f""" SELECT pipeline_id, pipeline_run_id, pipeline_run_link, @@ -891,18 +960,18 @@ def _get_events_table_metrics_sql(self): OR details:flow_progress.data_quality IS NOT NULL) """ - def register_events_table_metrics(self, spark: SparkSession): - @dlt.table(name=EVENTS_TABLE_METRICS.name, - comment=EVENTS_TABLE_METRICS.table_comment, - cluster_by=['pipeline_id', 'pipeline_run_id', 'table_name'], - table_properties={ - "delta.enableRowTracking": "true" - }) - def generate_events_table_metrics(): - return spark.sql(self._get_events_table_metrics_sql()) - - def _get_table_run_processing_state_sql(self): - return f""" + def register_events_table_metrics(self, spark: SparkSession): + @dlt.table( + name=EVENTS_TABLE_METRICS.name, + comment=EVENTS_TABLE_METRICS.table_comment, + cluster_by=["pipeline_id", "pipeline_run_id", "table_name"], + table_properties={"delta.enableRowTracking": "true"}, + ) + def generate_events_table_metrics(): + return spark.sql(self._get_events_table_metrics_sql()) + + def _get_table_run_processing_state_sql(self): + return f""" SELECT *, ('' || latest_state 
|| '') as latest_state_with_color FROM (SELECT *, @@ -947,55 +1016,58 @@ def _get_table_run_processing_state_sql(self): )) """ - def register_table_status_per_pipeline_run(self, spark: SparkSession): - dlt.create_streaming_table(name=TABLE_STATUS_PER_PIPELINE_RUN.name, - comment=TABLE_STATUS_PER_PIPELINE_RUN.table_comment, - cluster_by=['pipeline_id', 'pipeline_run_id', 'table_name'], - table_properties={ - "delta.enableRowTracking": "true", - "delta.enableChangeDataFeed": "true", - }) - - source_view_name=f"{TABLE_STATUS_PER_PIPELINE_RUN.name}_source" - @dlt.view(name=source_view_name) - def table_run_processing_state_source(): - return spark.sql(self._get_table_run_processing_state_sql()) - - dlt.create_auto_cdc_flow( - name=f"apply_{TABLE_STATUS_PER_PIPELINE_RUN.name}", - source=source_view_name, - target=TABLE_STATUS_PER_PIPELINE_RUN.name, - keys=['pipeline_id', 'pipeline_run_id', 'table_name'], - sequence_by='seq_num', - ignore_null_updates=True, - except_column_list=['seq_num']) - - - def register_table_status(self, spark: SparkSession): - # Use CDF because apply_changes can generate MERGE commits - table_status_per_pipeline_run_cdf = f"{TABLE_STATUS_PER_PIPELINE_RUN.name}_cdf" - - @dlt.view(name=table_status_per_pipeline_run_cdf) - def table_run_processing_state_cdf(): - return ( - spark.readStream - .option("readChangeFeed", "true") - .table(TABLE_STATUS_PER_PIPELINE_RUN.name) - .filter("_change_type IN ('insert', 'update_postimage')") - ) - - silver_table_name = f"{TABLE_STATUS.name}_silver" - dlt.create_streaming_table(name=silver_table_name, - comment="Capture information about the latest state, ingested data and errors for target tables", - cluster_by=['pipeline_id', 'table_name'], - table_properties={ - "delta.enableRowTracking": "true" - }) - - silver_latest_source_view_name = f"{silver_table_name}_latest_source" - @dlt.view(name=silver_latest_source_view_name) - def table_latest_run_processing_state_source(): - return spark.sql(f""" + def register_table_status_per_pipeline_run(self, spark: SparkSession): + dlt.create_streaming_table( + name=TABLE_STATUS_PER_PIPELINE_RUN.name, + comment=TABLE_STATUS_PER_PIPELINE_RUN.table_comment, + cluster_by=["pipeline_id", "pipeline_run_id", "table_name"], + table_properties={ + "delta.enableRowTracking": "true", + "delta.enableChangeDataFeed": "true", + }, + ) + + source_view_name = f"{TABLE_STATUS_PER_PIPELINE_RUN.name}_source" + + @dlt.view(name=source_view_name) + def table_run_processing_state_source(): + return spark.sql(self._get_table_run_processing_state_sql()) + + dlt.create_auto_cdc_flow( + name=f"apply_{TABLE_STATUS_PER_PIPELINE_RUN.name}", + source=source_view_name, + target=TABLE_STATUS_PER_PIPELINE_RUN.name, + keys=["pipeline_id", "pipeline_run_id", "table_name"], + sequence_by="seq_num", + ignore_null_updates=True, + except_column_list=["seq_num"], + ) + + def register_table_status(self, spark: SparkSession): + # Use CDF because apply_changes can generate MERGE commits + table_status_per_pipeline_run_cdf = f"{TABLE_STATUS_PER_PIPELINE_RUN.name}_cdf" + + @dlt.view(name=table_status_per_pipeline_run_cdf) + def table_run_processing_state_cdf(): + return ( + spark.readStream.option("readChangeFeed", "true") + .table(TABLE_STATUS_PER_PIPELINE_RUN.name) + .filter("_change_type IN ('insert', 'update_postimage')") + ) + + silver_table_name = f"{TABLE_STATUS.name}_silver" + dlt.create_streaming_table( + name=silver_table_name, + comment="Capture information about the latest state, ingested data and errors for target tables", + 
cluster_by=["pipeline_id", "table_name"], + table_properties={"delta.enableRowTracking": "true"}, + ) + + silver_latest_source_view_name = f"{silver_table_name}_latest_source" + + @dlt.view(name=silver_latest_source_view_name) + def table_latest_run_processing_state_source(): + return spark.sql(f""" SELECT pipeline_id, table_name, pipeline_run_id AS latest_pipeline_run_id, @@ -1018,18 +1090,22 @@ def table_latest_run_processing_state_source(): FROM STREAM(`{table_status_per_pipeline_run_cdf}`) """) - dlt.create_auto_cdc_flow( - name=f"{silver_table_name}_apply_latest", - source=silver_latest_source_view_name, - target=silver_table_name, - keys=['pipeline_id', 'table_name'], - sequence_by='updated_at', - ignore_null_updates=True) - - silver_latest_changes_source_view_name = f"{silver_table_name}_latest_changes_source" - @dlt.view(name=silver_latest_changes_source_view_name) - def table_latest_run_processing_state_source(): - return spark.sql(f""" + dlt.create_auto_cdc_flow( + name=f"{silver_table_name}_apply_latest", + source=silver_latest_source_view_name, + target=silver_table_name, + keys=["pipeline_id", "table_name"], + sequence_by="updated_at", + ignore_null_updates=True, + ) + + silver_latest_changes_source_view_name = ( + f"{silver_table_name}_latest_changes_source" + ) + + @dlt.view(name=silver_latest_changes_source_view_name) + def table_latest_run_processing_state_source(): + return spark.sql(f""" SELECT pipeline_id, table_name, null AS latest_pipeline_run_id, @@ -1053,22 +1129,23 @@ def table_latest_run_processing_state_source(): WHERE table_name IS NOT null AND num_written_rows > 0 """) - dlt.create_auto_cdc_flow( - name=f"{silver_table_name}_apply_latest_changes", - source=silver_latest_changes_source_view_name, - target=silver_table_name, - keys=['pipeline_id', 'table_name'], - sequence_by='updated_at', - ignore_null_updates=True) - - @dlt.table(name=TABLE_STATUS.name, - comment=TABLE_STATUS.table_comment, - cluster_by=['pipeline_id', 'table_name'], - table_properties={ - "delta.enableRowTracking": "true" - }) - def table_status(): - return spark.sql(f""" + dlt.create_auto_cdc_flow( + name=f"{silver_table_name}_apply_latest_changes", + source=silver_latest_changes_source_view_name, + target=silver_table_name, + keys=["pipeline_id", "table_name"], + sequence_by="updated_at", + ignore_null_updates=True, + ) + + @dlt.table( + name=TABLE_STATUS.name, + comment=TABLE_STATUS.table_comment, + cluster_by=["pipeline_id", "table_name"], + table_properties={"delta.enableRowTracking": "true"}, + ) + def table_status(): + return spark.sql(f""" SELECT s.*, latest_pipeline_run_num_written_rows FROM {silver_table_name} s @@ -1084,16 +1161,21 @@ def table_status(): AND s.latest_pipeline_run_id = etm.pipeline_run_id AND s.table_name = etm.table_name """) - - def register_table_expectation_checks(self, spark: SparkSession): - @dlt.table(name=TABLE_EVENTS_EXPECTATION_CHECKS.name, - comment=TABLE_EVENTS_EXPECTATION_CHECKS.table_comment, - cluster_by=['pipeline_id', 'pipeline_run_id', 'table_name', 'expectation_name'], - table_properties={ - "delta.enableRowTracking": "true" - }) - def table_expectation_checks(): - return spark.sql(f""" + + def register_table_expectation_checks(self, spark: SparkSession): + @dlt.table( + name=TABLE_EVENTS_EXPECTATION_CHECKS.name, + comment=TABLE_EVENTS_EXPECTATION_CHECKS.table_comment, + cluster_by=[ + "pipeline_id", + "pipeline_run_id", + "table_name", + "expectation_name", + ], + table_properties={"delta.enableRowTracking": "true"}, + ) + def 
table_expectation_checks(): + return spark.sql(f""" SELECT pipeline_id, pipeline_run_id, pipeline_run_link, @@ -1124,4 +1206,5 @@ def table_expectation_checks(): WHERE details:flow_progress.data_quality IS NOT NULL )) """) - pass + + pass diff --git a/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/standard_tables.py b/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/standard_tables.py index 0d3b473..4cb987b 100644 --- a/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/standard_tables.py +++ b/contrib/databricks_ingestion_monitoring/common/lib/databricks_ingestion_monitoring/standard_tables.py @@ -4,68 +4,82 @@ from pyspark.sql import SparkSession, DataFrame + class TableType(Enum): - STREAMING_TABLE = 1 - MATERIALIZED_VIEW = 2 - DELTA_TABLE = 3 + STREAMING_TABLE = 1 + MATERIALIZED_VIEW = 2 + DELTA_TABLE = 3 + class MonitoringTable: - """ - A helper class that encapsulates logic to generate a monitoring table. All tables are generated within the - default catalog and schema for the pipeline. - """ - def __init__(self, name: str, table_type: TableType, table_comment:str, column_comments: Optional[Dict[str, str]] = None): """ - Constructor - :param name: the name of the table - :param type: 'st' or 'mv' + A helper class that encapsulates logic to generate a monitoring table. All tables are generated within the + default catalog and schema for the pipeline. """ - self.name = name - self.table_type = table_type - self.table_comment = table_comment - self.column_comments = column_comments - self.log = logging.getLogger(f"databricks_ingestion_monitoring.MonitoringTable.{self.name}") - def add_column_comments(self, monitoring_catalog: str, monitoring_schema: str, spark: SparkSession): - """ - Add comments to the columns of the table. This is a workaround because SDP currently does not support - adding those as part of the table definition. This method will use ALTER TABLE ALTER COLUMN ... COMMENT ... - to set the comments on the intersection of the keys of `self.column_comments` and the columns from the - table schema. - """ - if self.column_comments is None or len(self.column_comments) == 0: - return # Nothing to do - + def __init__( + self, + name: str, + table_type: TableType, + table_comment: str, + column_comments: Optional[Dict[str, str]] = None, + ): + """ + Constructor + :param name: the name of the table + :param type: 'st' or 'mv' + """ + self.name = name + self.table_type = table_type + self.table_comment = table_comment + self.column_comments = column_comments + self.log = logging.getLogger( + f"databricks_ingestion_monitoring.MonitoringTable.{self.name}" + ) - self.log.info(f"Adding column comments to table {self.name}") - fq_name = f"`{monitoring_catalog}`.`{monitoring_schema}`.`{self.name}`" + def add_column_comments( + self, monitoring_catalog: str, monitoring_schema: str, spark: SparkSession + ): + """ + Add comments to the columns of the table. This is a workaround because SDP currently does not support + adding those as part of the table definition. This method will use ALTER TABLE ALTER COLUMN ... COMMENT ... + to set the comments on the intersection of the keys of `self.column_comments` and the columns from the + table schema. + """ + if self.column_comments is None or len(self.column_comments) == 0: + return # Nothing to do - if not spark.catalog.tableExists(fq_name): - self.log.warn(f"Table {fq_name} does not exist. 
Skipping column comments.") - return + self.log.info(f"Adding column comments to table {self.name}") + fq_name = f"`{monitoring_catalog}`.`{monitoring_schema}`.`{self.name}`" - table_schema = spark.table(fq_name).schema - table_column_names = set(table_schema.fieldNames()) + if not spark.catalog.tableExists(fq_name): + self.log.warn(f"Table {fq_name} does not exist. Skipping column comments.") + return + table_schema = spark.table(fq_name).schema + table_column_names = set(table_schema.fieldNames()) - if TableType.STREAMING_TABLE == self.table_type: - alter_type = 'STREAMING TABLE' - elif TableType.MATERIALIZED_VIEW == self.table_type: - alter_type = "MATERIALIZED VIEW" - elif TableType.DELTA_TABLE == self.table_type: - alter_type = "TABLE" - else: - raise AssertionError(f"Unexpected table_type: {self.table_type}") + if TableType.STREAMING_TABLE == self.table_type: + alter_type = "STREAMING TABLE" + elif TableType.MATERIALIZED_VIEW == self.table_type: + alter_type = "MATERIALIZED VIEW" + elif TableType.DELTA_TABLE == self.table_type: + alter_type = "TABLE" + else: + raise AssertionError(f"Unexpected table_type: {self.table_type}") + + for column_name, column_comment in self.column_comments.items(): + if column_name in table_column_names: + column_comment_parts = column_comment.splitlines() + comment_sql = " ".join( + [p.replace("'", "''") for p in column_comment_parts] + ) + sql = f"ALTER {alter_type} {fq_name} ALTER COLUMN `{column_name}` COMMENT '{comment_sql}'" + self.log.debug(f"Running {sql} ...") + spark.sql(sql) + else: + self.log.warn(f"Column {column_name} not found in table {self.name}") - for column_name, column_comment in self.column_comments.items(): - if column_name in table_column_names: - column_comment_parts = column_comment.splitlines() - comment_sql=" ".join([p.replace("'","''") for p in column_comment_parts]) - sql = f"ALTER {alter_type} {fq_name} ALTER COLUMN `{column_name}` COMMENT '{comment_sql}'" - self.log.debug(f"Running {sql} ...") - spark.sql(sql) - else: - self.log.warn(f"Column {column_name} not found in table {self.name}") STANDARD_COLUMN_COMMENTS = { "error_full": "Contains full details about the error that happened (if any)", @@ -110,296 +124,353 @@ def add_column_comments(self, monitoring_catalog: str, monitoring_schema: str, s MONITORED_PIPELINES = MonitoringTable( - name='monitored_pipelines', - table_type=TableType.MATERIALIZED_VIEW, - table_comment="Contains metadata about all monitored pipelines.", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "pipeline_name": STANDARD_COLUMN_COMMENTS['pipeline_name'], - "pipeline_link": STANDARD_COLUMN_COMMENTS['pipeline_link'], - "pipeline_type": """One of: 'gateway' for CDC Connector gateways, 'ingestion' for other ingestion pipelines, + name="monitored_pipelines", + table_type=TableType.MATERIALIZED_VIEW, + table_comment="Contains metadata about all monitored pipelines.", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "pipeline_name": STANDARD_COLUMN_COMMENTS["pipeline_name"], + "pipeline_link": STANDARD_COLUMN_COMMENTS["pipeline_link"], + "pipeline_type": """One of: 'gateway' for CDC Connector gateways, 'ingestion' for other ingestion pipelines, 'etl' for all other pipelines""", - "default_catalog": "The default catalog for the pipeline", - "default_schema": "The default schema for the pipeline", - "event_log_source": """The fully qualified name to a Delta table containing the event log for this pipeline. 
+ "default_catalog": "The default catalog for the pipeline", + "default_schema": "The default schema for the pipeline", + "event_log_source": """The fully qualified name to a Delta table containing the event log for this pipeline. This could be the Delta table explicitly configured in the 'event_log' property of the pipeline spec or a table where that log has been imported using the import_event_logs job.""", - "tags_map": "A map of tag keys to tag values for this pipeline. Useful for filtering and grouping pipelines by tags.", - "tags_array": """An array of 'tag:value' strings for this pipeline. Designed for easy filtering in AI/BI dashboards - where you can select a single value as a filtering expression.""" - }) + "tags_map": "A map of tag keys to tag values for this pipeline. Useful for filtering and grouping pipelines by tags.", + "tags_array": """An array of 'tag:value' strings for this pipeline. Designed for easy filtering in AI/BI dashboards + where you can select a single value as a filtering expression.""", + }, +) MONITORED_TABLES = MonitoringTable( - name='monitored_tables', - table_type=TableType.MATERIALIZED_VIEW, - table_comment="Contains a list of all tables detected in monitored pipelines. Used in the observability dashboard for filtering by table.", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "table_name": STANDARD_COLUMN_COMMENTS['table_name'] - } + name="monitored_tables", + table_type=TableType.MATERIALIZED_VIEW, + table_comment="Contains a list of all tables detected in monitored pipelines. Used in the observability dashboard for filtering by table.", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "table_name": STANDARD_COLUMN_COMMENTS["table_name"], + }, ) EVENT_LOGS_BRONZE = MonitoringTable( - name='event_logs_bronze', - table_type=TableType.STREAMING_TABLE, - table_comment="Initial filtering and transformations of the input event logs that are shared by most observability tables", - column_comments={ - "id": "This event's unique identifier", - "seq_num": "Contains information about the position of this event in the event log", - "pipeline_id": "The unique identifier of the pipeline for which this event is", - "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], - "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], - "table_name": STANDARD_COLUMN_COMMENTS["table_name"], - "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], - "batch_id": "The micro-batch id that triggered this event (typically used in metric events)", - "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], - "message": STANDARD_COLUMN_COMMENTS["message"], - "level": STANDARD_COLUMN_COMMENTS["level"], - "error_message": STANDARD_COLUMN_COMMENTS["error_message"], - "error_full": STANDARD_COLUMN_COMMENTS["error_full"], - "event_type": """The type of the event. 
For example 'update_progress' captures state transitions for the current + name="event_logs_bronze", + table_type=TableType.STREAMING_TABLE, + table_comment="Initial filtering and transformations of the input event logs that are shared by most observability tables", + column_comments={ + "id": "This event's unique identifier", + "seq_num": "Contains information about the position of this event in the event log", + "pipeline_id": "The unique identifier of the pipeline for which this event is", + "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], + "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], + "table_name": STANDARD_COLUMN_COMMENTS["table_name"], + "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], + "batch_id": "The micro-batch id that triggered this event (typically used in metric events)", + "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], + "message": STANDARD_COLUMN_COMMENTS["message"], + "level": STANDARD_COLUMN_COMMENTS["level"], + "error_message": STANDARD_COLUMN_COMMENTS["error_message"], + "error_full": STANDARD_COLUMN_COMMENTS["error_full"], + "event_type": """The type of the event. For example 'update_progress' captures state transitions for the current pipeline run, 'flow_progress' captures state transition in the evaluation of a specific flow, etc. Look for more information in `details`: """, - "details": "Contains `event_type`-specific information in the field." - } + "details": "Contains `event_type`-specific information in the field.", + }, ) PIPELINE_RUNS_STATUS = MonitoringTable( - name='pipeline_runs_status', - table_type=TableType.STREAMING_TABLE, - table_comment="Contains the latest status of monitored pipelines runs", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], - "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], - "latest_state": STANDARD_COLUMN_COMMENTS["latest_state"] % (''), - "state_color": STANDARD_COLUMN_COMMENTS["state_color"] % (''), - "latest_state_with_color": STANDARD_COLUMN_COMMENTS["latest_state_with_color"] % (''), - "latest_state_level": STANDARD_COLUMN_COMMENTS["latest_state_level"] % (''), - "create_time": STANDARD_COLUMN_COMMENTS["create_time"] % (''), - "queued_time": "Time when the pipeline run was queued for compute resources (entered WAITING_FOR_RESOURCES state)", - "initialization_start_time": "Time when the pipeline run started initialization (entered INITIALIZING state)", - "running_start_time": "Time when the pipeline starting its execution (entered RUNNING state)", - "end_time": STANDARD_COLUMN_COMMENTS["end_time"] % (''), - "is_complete": STANDARD_COLUMN_COMMENTS["is_complete"] % (''), - "latest_error_log_message": STANDARD_COLUMN_COMMENTS["latest_error_log_message"] % (''), - "latest_error_message": STANDARD_COLUMN_COMMENTS["latest_error_message"] % (''), - "latest_error_code": STANDARD_COLUMN_COMMENTS["latest_error_code"] % (''), - "latest_error_full": "Full stack trace of the latest error in the log", - "updated_at": "Timestamp of latest update (based on the event log timestamp) applied to this row" - } + name="pipeline_runs_status", + table_type=TableType.STREAMING_TABLE, + table_comment="Contains the latest status of monitored pipelines runs", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], + "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], + 
"latest_state": STANDARD_COLUMN_COMMENTS["latest_state"] % (""), + "state_color": STANDARD_COLUMN_COMMENTS["state_color"] % (""), + "latest_state_with_color": STANDARD_COLUMN_COMMENTS["latest_state_with_color"] + % (""), + "latest_state_level": STANDARD_COLUMN_COMMENTS["latest_state_level"] % (""), + "create_time": STANDARD_COLUMN_COMMENTS["create_time"] % (""), + "queued_time": "Time when the pipeline run was queued for compute resources (entered WAITING_FOR_RESOURCES state)", + "initialization_start_time": "Time when the pipeline run started initialization (entered INITIALIZING state)", + "running_start_time": "Time when the pipeline starting its execution (entered RUNNING state)", + "end_time": STANDARD_COLUMN_COMMENTS["end_time"] % (""), + "is_complete": STANDARD_COLUMN_COMMENTS["is_complete"] % (""), + "latest_error_log_message": STANDARD_COLUMN_COMMENTS["latest_error_log_message"] + % (""), + "latest_error_message": STANDARD_COLUMN_COMMENTS["latest_error_message"] % (""), + "latest_error_code": STANDARD_COLUMN_COMMENTS["latest_error_code"] % (""), + "latest_error_full": "Full stack trace of the latest error in the log", + "updated_at": "Timestamp of latest update (based on the event log timestamp) applied to this row", + }, ) EVENTS_ERRORS = MonitoringTable( - name='events_errors', - table_type=TableType.STREAMING_TABLE, - table_comment="The stream of all errors in pipeline runs", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], - "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], - "table_name": STANDARD_COLUMN_COMMENTS["table_name"] + " affected by the error (if any)", - "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], - "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], - "error_log_message": STANDARD_COLUMN_COMMENTS["message"], - "error_message": STANDARD_COLUMN_COMMENTS["error_message"], - "error_full": STANDARD_COLUMN_COMMENTS["error_full"], - "flow_type": STANDARD_COLUMN_COMMENTS['flow_type'], - } + name="events_errors", + table_type=TableType.STREAMING_TABLE, + table_comment="The stream of all errors in pipeline runs", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], + "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], + "table_name": STANDARD_COLUMN_COMMENTS["table_name"] + + " affected by the error (if any)", + "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], + "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], + "error_log_message": STANDARD_COLUMN_COMMENTS["message"], + "error_message": STANDARD_COLUMN_COMMENTS["error_message"], + "error_full": STANDARD_COLUMN_COMMENTS["error_full"], + "flow_type": STANDARD_COLUMN_COMMENTS["flow_type"], + }, ) EVENTS_WARNINGS = MonitoringTable( - name='events_warnings', - table_type=TableType.STREAMING_TABLE, - table_comment="The stream of all warnings in pipeline runs", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], - "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], - "table_name": STANDARD_COLUMN_COMMENTS["table_name"] + " affected by the warning (if any)", - "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], - "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], - "warning_log_message": STANDARD_COLUMN_COMMENTS["message"], - "flow_type": 
STANDARD_COLUMN_COMMENTS['flow_type'], - } + name="events_warnings", + table_type=TableType.STREAMING_TABLE, + table_comment="The stream of all warnings in pipeline runs", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], + "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], + "table_name": STANDARD_COLUMN_COMMENTS["table_name"] + + " affected by the warning (if any)", + "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], + "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], + "warning_log_message": STANDARD_COLUMN_COMMENTS["message"], + "flow_type": STANDARD_COLUMN_COMMENTS["flow_type"], + }, ) METRIC_PIPELINE_HOURLY_ERROR_RATE = MonitoringTable( - name='metric_pipeline_error_rate', - table_type=TableType.MATERIALIZED_VIEW, - table_comment="Error rate per hour for all monitored pipelines", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "hour": "The hour for which the error rate is calculated", - "error_rate": "The number of errors per hour for the pipeline" - } + name="metric_pipeline_error_rate", + table_type=TableType.MATERIALIZED_VIEW, + table_comment="Error rate per hour for all monitored pipelines", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "hour": "The hour for which the error rate is calculated", + "error_rate": "The number of errors per hour for the pipeline", + }, ) PIPELINES_STATUS_SILVER = MonitoringTable( - name='pipelines_status_silver', - table_type=TableType.STREAMING_TABLE, - table_comment="Keeps track of the latest pipeline run, latest successful run and latest failed run for each pipeline", - column_comments={ - - } + name="pipelines_status_silver", + table_type=TableType.STREAMING_TABLE, + table_comment="Keeps track of the latest pipeline run, latest successful run and latest failed run for each pipeline", + column_comments={}, ) PIPELINES_STATUS = MonitoringTable( - name='pipelines_status', - table_type=TableType.MATERIALIZED_VIEW, - table_comment="Keeps track of the latests status for each monitored pipeline", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "latest_pipeline_run_id": f"Latest {STANDARD_COLUMN_COMMENTS['pipeline_run_id']}", - "latest_pipeline_run_link": f"Latest {STANDARD_COLUMN_COMMENTS['pipeline_run_link']}", - "latest_pipeline_run_state": STANDARD_COLUMN_COMMENTS["latest_state"] % ('latest'), - "latest_pipeline_run_state_color": STANDARD_COLUMN_COMMENTS["state_color"] % ('latest'), - "latest_pipeline_run_state_with_color": STANDARD_COLUMN_COMMENTS["latest_state_with_color"] % ('latest'), - "latest_pipeline_run_state_level": STANDARD_COLUMN_COMMENTS["latest_state_level"] % ('latest'), - "latest_pipeline_run_create_time": STANDARD_COLUMN_COMMENTS["create_time"] % ('latest'), - "latest_pipeline_run_end_time": STANDARD_COLUMN_COMMENTS["end_time"] % ('latest'), - "latest_pipeline_run_is_complete": STANDARD_COLUMN_COMMENTS["is_complete"] % ('latest'), - "latest_error_log_message": STANDARD_COLUMN_COMMENTS["latest_error_log_message"] % ('latest'), - "latest_error_message": STANDARD_COLUMN_COMMENTS["latest_error_message"] % ('latest'), - "latest_error_code": STANDARD_COLUMN_COMMENTS["latest_error_code"] % ('latest'), - "latest_error_time": "The time of the latest error (event)", - "latest_successful_run_id": f"Latest successful {STANDARD_COLUMN_COMMENTS['pipeline_run_id']}", - "latest_successful_run_link": f"Latest successful 
{STANDARD_COLUMN_COMMENTS['pipeline_run_link']}", - "latest_successful_run_create_time": STANDARD_COLUMN_COMMENTS["create_time"] % ('successful'), - "latest_successful_run_end_time": STANDARD_COLUMN_COMMENTS["end_time"] % ('successful'), - "latest_failed_run_id": f"Latest failed {STANDARD_COLUMN_COMMENTS['pipeline_run_id']}", - "latest_failed_run_link": f"Latest failed {STANDARD_COLUMN_COMMENTS['pipeline_run_link']}", - "latest_failed_run_create_time": STANDARD_COLUMN_COMMENTS["create_time"] % ('failed'), - "latest_failed_run_end_time": STANDARD_COLUMN_COMMENTS["end_time"] % ('failed'), - "latest_failed_run_error_log_message": STANDARD_COLUMN_COMMENTS["latest_error_log_message"] % ('failed'), - "latest_failed_run_error_message": STANDARD_COLUMN_COMMENTS["latest_error_message"] % ('failed'), - "latest_failed_run_error_code": STANDARD_COLUMN_COMMENTS["latest_error_code"] % ('failed'), - "updated_at": "Timestamp of latest update (based on the event log timestamp) applied to this row", - "latest_pipeline_run_num_errors": "The number of errors in the latest pipeline run", - "latest_pipeline_run_num_warnings": "The number of warnings in the latest pipeline run", - } + name="pipelines_status", + table_type=TableType.MATERIALIZED_VIEW, + table_comment="Keeps track of the latests status for each monitored pipeline", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "latest_pipeline_run_id": f"Latest {STANDARD_COLUMN_COMMENTS['pipeline_run_id']}", + "latest_pipeline_run_link": f"Latest {STANDARD_COLUMN_COMMENTS['pipeline_run_link']}", + "latest_pipeline_run_state": STANDARD_COLUMN_COMMENTS["latest_state"] + % ("latest"), + "latest_pipeline_run_state_color": STANDARD_COLUMN_COMMENTS["state_color"] + % ("latest"), + "latest_pipeline_run_state_with_color": STANDARD_COLUMN_COMMENTS[ + "latest_state_with_color" + ] + % ("latest"), + "latest_pipeline_run_state_level": STANDARD_COLUMN_COMMENTS[ + "latest_state_level" + ] + % ("latest"), + "latest_pipeline_run_create_time": STANDARD_COLUMN_COMMENTS["create_time"] + % ("latest"), + "latest_pipeline_run_end_time": STANDARD_COLUMN_COMMENTS["end_time"] + % ("latest"), + "latest_pipeline_run_is_complete": STANDARD_COLUMN_COMMENTS["is_complete"] + % ("latest"), + "latest_error_log_message": STANDARD_COLUMN_COMMENTS["latest_error_log_message"] + % ("latest"), + "latest_error_message": STANDARD_COLUMN_COMMENTS["latest_error_message"] + % ("latest"), + "latest_error_code": STANDARD_COLUMN_COMMENTS["latest_error_code"] % ("latest"), + "latest_error_time": "The time of the latest error (event)", + "latest_successful_run_id": f"Latest successful {STANDARD_COLUMN_COMMENTS['pipeline_run_id']}", + "latest_successful_run_link": f"Latest successful {STANDARD_COLUMN_COMMENTS['pipeline_run_link']}", + "latest_successful_run_create_time": STANDARD_COLUMN_COMMENTS["create_time"] + % ("successful"), + "latest_successful_run_end_time": STANDARD_COLUMN_COMMENTS["end_time"] + % ("successful"), + "latest_failed_run_id": f"Latest failed {STANDARD_COLUMN_COMMENTS['pipeline_run_id']}", + "latest_failed_run_link": f"Latest failed {STANDARD_COLUMN_COMMENTS['pipeline_run_link']}", + "latest_failed_run_create_time": STANDARD_COLUMN_COMMENTS["create_time"] + % ("failed"), + "latest_failed_run_end_time": STANDARD_COLUMN_COMMENTS["end_time"] % ("failed"), + "latest_failed_run_error_log_message": STANDARD_COLUMN_COMMENTS[ + "latest_error_log_message" + ] + % ("failed"), + "latest_failed_run_error_message": STANDARD_COLUMN_COMMENTS[ + "latest_error_message" + ] + % 
("failed"), + "latest_failed_run_error_code": STANDARD_COLUMN_COMMENTS["latest_error_code"] + % ("failed"), + "updated_at": "Timestamp of latest update (based on the event log timestamp) applied to this row", + "latest_pipeline_run_num_errors": "The number of errors in the latest pipeline run", + "latest_pipeline_run_num_warnings": "The number of warnings in the latest pipeline run", + }, ) EVENTS_TABLE_METRICS = MonitoringTable( - name='events_table_metrics', - table_type=TableType.STREAMING_TABLE, - table_comment="The stream of metric events to target tables", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], - "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], - "table_name": STANDARD_COLUMN_COMMENTS["table_name"], - "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], - "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], - "num_output_rows": "Number of output rows appended to the target table.", - "backlog_bytes": "Total backlog as bytes across all input sources in the flow.", - "backlog_records": "Total backlog records across all input sources in the flow.", - "backlog_files": "Total backlog files across all input sources in the flow.", - "backlog_seconds": "Maximum backlog seconds across all input sources in the flow.", - "executor_time_ms": "Sum of all task execution times in milliseconds of this flow over the reporting period.", - "executor_cpu_time_ms": "Sum of all task execution CPU times in milliseconds of this flow over the reporting period.", - "num_upserted_rows": "Number of output rows upserted into the dataset by an update of this flow.", - "num_deleted_rows": "Number of existing output rows deleted from the dataset by an update of this flow.", - "num_output_bytes": "Number of output bytes written by an update of this flow.", - "num_written_rows": "Total number of rows written to the target table -- combines num_output_rows, num_upserted_rows, num_deleted_rows", - "min_event_time": "The minimum event/commit time of a row processed in the specific micro-batch", - "max_event_time": "The maximum event/commit time of a row processed in the specific micro-batch", - "flow_type": STANDARD_COLUMN_COMMENTS['flow_type'], - "num_expectation_dropped_records": "The number of rows/records that were dropped due to failed DROP expectations.", - } + name="events_table_metrics", + table_type=TableType.STREAMING_TABLE, + table_comment="The stream of metric events to target tables", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], + "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], + "table_name": STANDARD_COLUMN_COMMENTS["table_name"], + "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], + "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], + "num_output_rows": "Number of output rows appended to the target table.", + "backlog_bytes": "Total backlog as bytes across all input sources in the flow.", + "backlog_records": "Total backlog records across all input sources in the flow.", + "backlog_files": "Total backlog files across all input sources in the flow.", + "backlog_seconds": "Maximum backlog seconds across all input sources in the flow.", + "executor_time_ms": "Sum of all task execution times in milliseconds of this flow over the reporting period.", + "executor_cpu_time_ms": "Sum of all task execution CPU times in milliseconds of this flow over 
the reporting period.", + "num_upserted_rows": "Number of output rows upserted into the dataset by an update of this flow.", + "num_deleted_rows": "Number of existing output rows deleted from the dataset by an update of this flow.", + "num_output_bytes": "Number of output bytes written by an update of this flow.", + "num_written_rows": "Total number of rows written to the target table -- combines num_output_rows, num_upserted_rows, num_deleted_rows", + "min_event_time": "The minimum event/commit time of a row processed in the specific micro-batch", + "max_event_time": "The maximum event/commit time of a row processed in the specific micro-batch", + "flow_type": STANDARD_COLUMN_COMMENTS["flow_type"], + "num_expectation_dropped_records": "The number of rows/records that were dropped due to failed DROP expectations.", + }, ) TABLE_STATUS_PER_PIPELINE_RUN = MonitoringTable( - name='table_status_per_pipeline_run', - table_type=TableType.STREAMING_TABLE, - table_comment="Keeps track of the progress of processing a specific target table in pipeline runs", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], - "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], - "table_name": STANDARD_COLUMN_COMMENTS["table_name"], - "updated_at": "Timestamp of latest update (based on the event log timestamp) applied to this row", - "latest_state": STANDARD_COLUMN_COMMENTS["latest_table_state"], - "table_schema_json": STANDARD_COLUMN_COMMENTS["table_schema_json"] % ('this'), - "table_schema": STANDARD_COLUMN_COMMENTS["table_schema"] % ('this'), - "latest_error_time": STANDARD_COLUMN_COMMENTS["latest_table_error_time"] % ('this'), - "latest_error_log_message": STANDARD_COLUMN_COMMENTS["latest_table_error_log_message"] % ('this'), - "latest_error_message": STANDARD_COLUMN_COMMENTS["latest_table_error_message"] % ('this'), - "latest_error_full": STANDARD_COLUMN_COMMENTS["latest_table_error_full"] % ('this'), - "flow_type": STANDARD_COLUMN_COMMENTS['flow_type'], - "latest_state_level": STANDARD_COLUMN_COMMENTS['latest_table_state_level'], - "latest_state_color": STANDARD_COLUMN_COMMENTS['latest_table_state_color'], - "latest_state_with_color": STANDARD_COLUMN_COMMENTS['latest_table_state_with_color'], - } + name="table_status_per_pipeline_run", + table_type=TableType.STREAMING_TABLE, + table_comment="Keeps track of the progress of processing a specific target table in pipeline runs", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], + "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], + "table_name": STANDARD_COLUMN_COMMENTS["table_name"], + "updated_at": "Timestamp of latest update (based on the event log timestamp) applied to this row", + "latest_state": STANDARD_COLUMN_COMMENTS["latest_table_state"], + "table_schema_json": STANDARD_COLUMN_COMMENTS["table_schema_json"] % ("this"), + "table_schema": STANDARD_COLUMN_COMMENTS["table_schema"] % ("this"), + "latest_error_time": STANDARD_COLUMN_COMMENTS["latest_table_error_time"] + % ("this"), + "latest_error_log_message": STANDARD_COLUMN_COMMENTS[ + "latest_table_error_log_message" + ] + % ("this"), + "latest_error_message": STANDARD_COLUMN_COMMENTS["latest_table_error_message"] + % ("this"), + "latest_error_full": STANDARD_COLUMN_COMMENTS["latest_table_error_full"] + % ("this"), + "flow_type": STANDARD_COLUMN_COMMENTS["flow_type"], + "latest_state_level": 
STANDARD_COLUMN_COMMENTS["latest_table_state_level"], + "latest_state_color": STANDARD_COLUMN_COMMENTS["latest_table_state_color"], + "latest_state_with_color": STANDARD_COLUMN_COMMENTS[ + "latest_table_state_with_color" + ], + }, ) TABLE_STATUS = MonitoringTable( - name='table_status', - table_type=TableType.MATERIALIZED_VIEW, - table_comment="Keeps track of the latest progress of processing a specific target table", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "table_name": STANDARD_COLUMN_COMMENTS["table_name"], - "latest_pipeline_run_id": f"Latest {STANDARD_COLUMN_COMMENTS['pipeline_run_id']}", - "latest_pipeline_run_link": f"Latest {STANDARD_COLUMN_COMMENTS['pipeline_run_link']}", - "latest_state": STANDARD_COLUMN_COMMENTS["latest_table_state"], - "latest_state_level": STANDARD_COLUMN_COMMENTS['latest_table_state_level'], - "latest_state_color": STANDARD_COLUMN_COMMENTS['latest_table_state_color'], - "latest_state_with_color": STANDARD_COLUMN_COMMENTS['latest_table_state_with_color'], - "latest_table_schema_json": STANDARD_COLUMN_COMMENTS["table_schema_json"] % ('the latest'), - "latest_table_schema": STANDARD_COLUMN_COMMENTS["table_schema"] % ('the latest'), - "latest_cdc_changes_time": "The latest time when the CDC changes were applied to the target table", - "latest_snapshot_changes_time": "The latest time when the snapshot changes were applied to the target table", - "latest_error_pipeline_run_id": "The pipeline run id with the latest error for the target table", - "latest_error_pipeline_run_link": """An HTML-formatted link for the pipeline run with the latest error + name="table_status", + table_type=TableType.MATERIALIZED_VIEW, + table_comment="Keeps track of the latest progress of processing a specific target table", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "table_name": STANDARD_COLUMN_COMMENTS["table_name"], + "latest_pipeline_run_id": f"Latest {STANDARD_COLUMN_COMMENTS['pipeline_run_id']}", + "latest_pipeline_run_link": f"Latest {STANDARD_COLUMN_COMMENTS['pipeline_run_link']}", + "latest_state": STANDARD_COLUMN_COMMENTS["latest_table_state"], + "latest_state_level": STANDARD_COLUMN_COMMENTS["latest_table_state_level"], + "latest_state_color": STANDARD_COLUMN_COMMENTS["latest_table_state_color"], + "latest_state_with_color": STANDARD_COLUMN_COMMENTS[ + "latest_table_state_with_color" + ], + "latest_table_schema_json": STANDARD_COLUMN_COMMENTS["table_schema_json"] + % ("the latest"), + "latest_table_schema": STANDARD_COLUMN_COMMENTS["table_schema"] + % ("the latest"), + "latest_cdc_changes_time": "The latest time when the CDC changes were applied to the target table", + "latest_snapshot_changes_time": "The latest time when the snapshot changes were applied to the target table", + "latest_error_pipeline_run_id": "The pipeline run id with the latest error for the target table", + "latest_error_pipeline_run_link": """An HTML-formatted link for the pipeline run with the latest error for the target table; useful in dashboards""", - "latest_error_time": STANDARD_COLUMN_COMMENTS["latest_table_error_time"] % ('a'), - "latest_error_log_message": STANDARD_COLUMN_COMMENTS["latest_table_error_log_message"] % ('a'), - "latest_error_message": STANDARD_COLUMN_COMMENTS["latest_table_error_message"] % ('a'), - "latest_error_full": STANDARD_COLUMN_COMMENTS["latest_table_error_full"] % ('a'), - "latest_error_flow_type": "The flow type ('cdc', 'snapshot') where the latest error occurred for this target table", - } + 
"latest_error_time": STANDARD_COLUMN_COMMENTS["latest_table_error_time"] + % ("a"), + "latest_error_log_message": STANDARD_COLUMN_COMMENTS[ + "latest_table_error_log_message" + ] + % ("a"), + "latest_error_message": STANDARD_COLUMN_COMMENTS["latest_table_error_message"] + % ("a"), + "latest_error_full": STANDARD_COLUMN_COMMENTS["latest_table_error_full"] + % ("a"), + "latest_error_flow_type": "The flow type ('cdc', 'snapshot') where the latest error occurred for this target table", + }, ) TABLE_EVENTS_EXPECTATION_CHECKS = MonitoringTable( - name='table_events_expectation_checks', - table_type=TableType.STREAMING_TABLE, - table_comment="Keeps track of the results of expectation checks for each pipeline run", - column_comments={ - "pipeline_id": STANDARD_COLUMN_COMMENTS['pipeline_id'], - "pipeline_run_id": STANDARD_COLUMN_COMMENTS['pipeline_run_id'], - "pipeline_run_link": STANDARD_COLUMN_COMMENTS['pipeline_run_link'], - "table_name": STANDARD_COLUMN_COMMENTS["table_name"], - "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], - "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], - "expectation_name": "The name of the expectation", - "num_passed": "The number of rows/records that passed the expectation check", - "num_failed": "The number of rows/records that failed the expectation check", - "failure_pct": "The percentage of rows/records that failed the expectation check", - } + name="table_events_expectation_checks", + table_type=TableType.STREAMING_TABLE, + table_comment="Keeps track of the results of expectation checks for each pipeline run", + column_comments={ + "pipeline_id": STANDARD_COLUMN_COMMENTS["pipeline_id"], + "pipeline_run_id": STANDARD_COLUMN_COMMENTS["pipeline_run_id"], + "pipeline_run_link": STANDARD_COLUMN_COMMENTS["pipeline_run_link"], + "table_name": STANDARD_COLUMN_COMMENTS["table_name"], + "flow_name": STANDARD_COLUMN_COMMENTS["flow_name"], + "event_timestamp": STANDARD_COLUMN_COMMENTS["event_timestamp"], + "expectation_name": "The name of the expectation", + "num_passed": "The number of rows/records that passed the expectation check", + "num_failed": "The number of rows/records that failed the expectation check", + "failure_pct": "The percentage of rows/records that failed the expectation check", + }, ) PIPELINE_TAGS_INDEX = MonitoringTable( - name='pipeline_tags_index', - table_type=TableType.DELTA_TABLE, - table_comment="""Inverted index mapping pipeline tags to pipeline IDs for efficient tag-based pipeline discovery. + name="pipeline_tags_index", + table_type=TableType.DELTA_TABLE, + table_comment="""Inverted index mapping pipeline tags to pipeline IDs for efficient tag-based pipeline discovery. Built and maintained by the 'Build pipeline tags index' job. Used to optimize performance when discovering pipelines by tags instead of querying the Databricks API for every pipeline.""", - column_comments={ - "tag_key": "The tag key (e.g., 'env', 'team', 'critical')", - "tag_value": "The tag value (e.g., 'prod', 'data', 'true')", - "pipeline_ids": """Array of pipeline IDs that have this tag:value pair. Used for efficient lookup when + column_comments={ + "tag_key": "The tag key (e.g., 'env', 'team', 'critical')", + "tag_value": "The tag value (e.g., 'prod', 'data', 'true')", + "pipeline_ids": """Array of pipeline IDs that have this tag:value pair. Used for efficient lookup when discovering pipelines by tags without expensive API calls.""", - "index_build_time": "Timestamp when this index was last built. Used to determine if the index is stale." 
- } + "index_build_time": "Timestamp when this index was last built. Used to determine if the index is stale.", + }, ) -def set_all_table_column_comments(monitoring_catalog: str, monitoring_schema: str, spark: SparkSession): - for st in [MONITORED_PIPELINES, MONITORED_TABLES, EVENT_LOGS_BRONZE, PIPELINE_RUNS_STATUS, EVENTS_ERRORS, - EVENTS_WARNINGS, METRIC_PIPELINE_HOURLY_ERROR_RATE, PIPELINES_STATUS_SILVER, PIPELINES_STATUS, - EVENTS_TABLE_METRICS, TABLE_STATUS_PER_PIPELINE_RUN, TABLE_STATUS, - TABLE_EVENTS_EXPECTATION_CHECKS, PIPELINE_TAGS_INDEX]: - st.add_column_comments(monitoring_catalog, monitoring_schema, spark) - +def set_all_table_column_comments( + monitoring_catalog: str, monitoring_schema: str, spark: SparkSession +): + for st in [ + MONITORED_PIPELINES, + MONITORED_TABLES, + EVENT_LOGS_BRONZE, + PIPELINE_RUNS_STATUS, + EVENTS_ERRORS, + EVENTS_WARNINGS, + METRIC_PIPELINE_HOURLY_ERROR_RATE, + PIPELINES_STATUS_SILVER, + PIPELINES_STATUS, + EVENTS_TABLE_METRICS, + TABLE_STATUS_PER_PIPELINE_RUN, + TABLE_STATUS, + TABLE_EVENTS_EXPECTATION_CHECKS, + PIPELINE_TAGS_INDEX, + ]: + st.add_column_comments(monitoring_catalog, monitoring_schema, spark) diff --git a/contrib/databricks_ingestion_monitoring/common/src/build_pipeline_tags_index.ipynb b/contrib/databricks_ingestion_monitoring/common/src/build_pipeline_tags_index.ipynb index 9d4e823..e379b1c 100644 --- a/contrib/databricks_ingestion_monitoring/common/src/build_pipeline_tags_index.ipynb +++ b/contrib/databricks_ingestion_monitoring/common/src/build_pipeline_tags_index.ipynb @@ -46,13 +46,15 @@ "dbutils.widgets.text(\"monitoring_schema\", \"\")\n", "dbutils.widgets.text(\"pipeline_tags_index_table_name\", \"pipeline_tags_index\")\n", "\n", - "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\")\n", + "logging.basicConfig(\n", + " level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\"\n", + ")\n", "\n", "# Build the index\n", "builder = PipelineTagsIndexBuilder(\n", " monitoring_catalog=dbutils.widgets.get(\"monitoring_catalog\"),\n", " monitoring_schema=dbutils.widgets.get(\"monitoring_schema\"),\n", - " index_table_name=dbutils.widgets.get(\"pipeline_tags_index_table_name\")\n", + " index_table_name=dbutils.widgets.get(\"pipeline_tags_index_table_name\"),\n", ")\n", "\n", "builder.build_index(spark)" diff --git a/contrib/databricks_ingestion_monitoring/common/src/create_imported_event_logs_target_table.ipynb b/contrib/databricks_ingestion_monitoring/common/src/create_imported_event_logs_target_table.ipynb index 0f3b4ee..8ac6d5b 100644 --- a/contrib/databricks_ingestion_monitoring/common/src/create_imported_event_logs_target_table.ipynb +++ b/contrib/databricks_ingestion_monitoring/common/src/create_imported_event_logs_target_table.ipynb @@ -50,15 +50,19 @@ "\n", "from databricks_ingestion_monitoring.common import EventLogImporter\n", "\n", - "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\")\n", + "logging.basicConfig(\n", + " level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\"\n", + ")\n", "\n", "dbutils.widgets.text(\"monitoring_catalog\", \"\")\n", "dbutils.widgets.text(\"monitoring_schema\", \"\")\n", "dbutils.widgets.text(\"imported_event_logs_table_name\", \"imported_event_logs\")\n", "\n", - "importer = EventLogImporter(monitoring_catalog = dbutils.widgets.get(\"monitoring_catalog\"),\n", - " monitoring_schema = dbutils.widgets.get(\"monitoring_schema\"),\n", - " 
imported_event_logs_table = dbutils.widgets.get(\"imported_event_logs_table_name\"))\n", + "importer = EventLogImporter(\n", + " monitoring_catalog=dbutils.widgets.get(\"monitoring_catalog\"),\n", + " monitoring_schema=dbutils.widgets.get(\"monitoring_schema\"),\n", + " imported_event_logs_table=dbutils.widgets.get(\"imported_event_logs_table_name\"),\n", + ")\n", "importer.create_target_table(spark)" ] } diff --git a/contrib/databricks_ingestion_monitoring/common/src/import_event_logs.ipynb b/contrib/databricks_ingestion_monitoring/common/src/import_event_logs.ipynb index 6e67318..43197d5 100644 --- a/contrib/databricks_ingestion_monitoring/common/src/import_event_logs.ipynb +++ b/contrib/databricks_ingestion_monitoring/common/src/import_event_logs.ipynb @@ -61,7 +61,9 @@ "dbutils.widgets.text(\"pipeline_tags_index_max_age_hours\", \"24\")\n", "dbutils.widgets.text(\"pipeline_tags_index_api_fallback_enabled\", \"true\")\n", "\n", - "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\")\n", + "logging.basicConfig(\n", + " level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\"\n", + ")\n", "\n", "importer = EventLogImporter(\n", " monitoring_catalog=dbutils.widgets.get(\"monitoring_catalog\"),\n", @@ -70,12 +72,16 @@ " index_table_name=dbutils.widgets.get(\"pipeline_tags_index_table_name\"),\n", " index_enabled=dbutils.widgets.get(\"pipeline_tags_index_enabled\").lower() == \"true\",\n", " index_max_age_hours=int(dbutils.widgets.get(\"pipeline_tags_index_max_age_hours\")),\n", - " api_fallback_enabled=dbutils.widgets.get(\"pipeline_tags_index_api_fallback_enabled\").lower() == \"true\"\n", + " api_fallback_enabled=dbutils.widgets.get(\n", + " \"pipeline_tags_index_api_fallback_enabled\"\n", + " ).lower()\n", + " == \"true\",\n", ")\n", "importer.import_event_logs_for_pipelines_by_ids_and_tags(\n", " dbutils.widgets.get(\"imported_pipeline_ids\"),\n", " dbutils.widgets.get(\"imported_pipeline_tags\"),\n", - " spark)" + " spark,\n", + ")" ] } ], diff --git a/contrib/databricks_ingestion_monitoring/common/src/publish_dashboard.ipynb b/contrib/databricks_ingestion_monitoring/common/src/publish_dashboard.ipynb index f071981..699745a 100644 --- a/contrib/databricks_ingestion_monitoring/common/src/publish_dashboard.ipynb +++ b/contrib/databricks_ingestion_monitoring/common/src/publish_dashboard.ipynb @@ -62,13 +62,17 @@ "import sys\n", "from databricks.sdk import WorkspaceClient\n", "\n", - "sys.path.append('../lib')\n", + "sys.path.append(\"../lib\")\n", "\n", "from databricks_ingestion_monitoring.common import DashboardTemplate\n", "\n", - "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\")\n", + "logging.basicConfig(\n", + " level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\"\n", + ")\n", "\n", - "d = DashboardTemplate.from_notebook_widgets(widgets=dbutils.widgets, wc=WorkspaceClient())\n", + "d = DashboardTemplate.from_notebook_widgets(\n", + " widgets=dbutils.widgets, wc=WorkspaceClient()\n", + ")\n", "d.publish()" ] } diff --git a/contrib/databricks_ingestion_monitoring/common/src/update_monitoring_tables_meta.ipynb b/contrib/databricks_ingestion_monitoring/common/src/update_monitoring_tables_meta.ipynb index 5fdede9..4fbeaa9 100644 --- a/contrib/databricks_ingestion_monitoring/common/src/update_monitoring_tables_meta.ipynb +++ b/contrib/databricks_ingestion_monitoring/common/src/update_monitoring_tables_meta.ipynb @@ -33,18 +33,24 @@ 
"\n", "sys.path.append(\"../lib\")\n", "\n", - "from databricks_ingestion_monitoring.standard_tables import set_all_table_column_comments\n", + "from databricks_ingestion_monitoring.standard_tables import (\n", + " set_all_table_column_comments,\n", + ")\n", "from databricks_ingestion_monitoring.common import get_required_widget_parameter\n", "\n", "dbutils.widgets.text(\"monitoring_catalog\", \"\")\n", "dbutils.widgets.text(\"monitoring_schema\", \"\")\n", "\n", - "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\")\n", + "logging.basicConfig(\n", + " level=logging.INFO, format=\"%(asctime)s [%(levelname)s] (%(name)s) %(message)s\"\n", + ")\n", "logging.getLogger(\"dbx_ingestion_monitoring.MonitoringTable\").setLevel(logging.DEBUG)\n", "\n", - "monitoring_catalog = get_required_widget_parameter(dbutils.widgets, 'monitoring_catalog')\n", - "monitoring_schema = get_required_widget_parameter(dbutils.widgets, 'monitoring_schema')\n", - "set_all_table_column_comments(monitoring_catalog, monitoring_schema, spark)\n" + "monitoring_catalog = get_required_widget_parameter(\n", + " dbutils.widgets, \"monitoring_catalog\"\n", + ")\n", + "monitoring_schema = get_required_widget_parameter(dbutils.widgets, \"monitoring_schema\")\n", + "set_all_table_column_comments(monitoring_catalog, monitoring_schema, spark)" ] } ], diff --git a/contrib/databricks_ingestion_monitoring/common/third_party_sinks/azuremonitor_sink.py b/contrib/databricks_ingestion_monitoring/common/third_party_sinks/azuremonitor_sink.py index 1a735e5..e9f34b4 100644 --- a/contrib/databricks_ingestion_monitoring/common/third_party_sinks/azuremonitor_sink.py +++ b/contrib/databricks_ingestion_monitoring/common/third_party_sinks/azuremonitor_sink.py @@ -12,7 +12,17 @@ from datetime import datetime, timezone from pyspark.sql import SparkSession from pyspark.sql.types import StringType, ArrayType -from pyspark.sql.functions import lit, col, collect_list, concat, expr, udf, struct, explode, regexp_replace +from pyspark.sql.functions import ( + lit, + col, + collect_list, + concat, + expr, + udf, + struct, + explode, + regexp_replace, +) import dlt import threading @@ -36,8 +46,8 @@ "metric_value": {"type": "number"}, "timestamp": {"type": "integer"}, "tags": {"type": "object", "additionalProperties": True}, - "additional_attributes": {"type": "object", "additionalProperties": True} - } + "additional_attributes": {"type": "object", "additionalProperties": True}, + }, } LOGS_SCHEMA = { @@ -49,8 +59,8 @@ "status": {"type": "string"}, "timestamp": {"type": "integer"}, "tags": {"type": "object", "additionalProperties": True}, - "additional_attributes": {"type": "object", "additionalProperties": True} - } + "additional_attributes": {"type": "object", "additionalProperties": True}, + }, } EVENTS_SCHEMA = { @@ -62,17 +72,22 @@ "status": {"type": "string"}, "timestamp": {"type": "integer"}, "tags": {"type": "object", "additionalProperties": True}, - "additional_attributes": {"type": "object", "additionalProperties": True} - } + "additional_attributes": {"type": "object", "additionalProperties": True}, + }, } # ================================================================================ # UTILITIES # ================================================================================ + def get_azure_header(access_token: str = None): """Get headers for the Azure Monitor API.""" - return {"Content-Type": "application/json", "Authorization": f"Bearer {access_token}"} + return { + "Content-Type": 
"application/json", + "Authorization": f"Bearer {access_token}", + } + def fetch_access_token(config): """ @@ -95,24 +110,23 @@ def fetch_access_token(config): "grant_type": "client_credentials", "client_id": config["client_id"], "client_secret": config["client_secret"], - "scope": "https://monitor.azure.com//.default" + "scope": "https://monitor.azure.com//.default", } now = int(datetime.now(timezone.utc).timestamp()) - token_url = config['access_token_url'] + token_url = config["access_token_url"] try: - response = requests.post(token_url, data=payload, timeout=config['request_timeout_sec']) + response = requests.post( + token_url, data=payload, timeout=config["request_timeout_sec"] + ) response.raise_for_status() token_json = response.json() if "access_token" not in token_json: raise RuntimeError(f"Token response missing 'access_token': {token_json}") - return { - "access_token": token_json["access_token"], - "last_load_timestamp": now - } + return {"access_token": token_json["access_token"], "last_load_timestamp": now} except requests.RequestException as e: error_message = ( @@ -126,7 +140,6 @@ def fetch_access_token(config): raise RuntimeError(error_message) from e - def initialize_global_config(spark_conf): """Initialize global configuration from Spark configuration.""" global _global_config, _log_converter, _events_converter, _metrics_converter @@ -136,12 +149,14 @@ def initialize_global_config(spark_conf): _events_converter = AzureMonitorEventsConverter() _metrics_converter = AzureMonitorMetricsConverter() + def getParam(spark_conf, key: str, default=None): value = spark_conf.get(key, default) if value == "" or value is None: return None return value + def getThirdPartySinkConfigFromSparkConfig(spark_conf): """ Extract and merge configuration from Spark configuration and secret scope. @@ -169,12 +184,14 @@ def getThirdPartySinkConfigFromSparkConfig(spark_conf): "num_rows_per_batch": int(spark_conf.get("num_rows_per_batch", "100")), "max_retry_duration_sec": int(spark_conf.get("max_retry_duration_sec", "300")), "request_timeout_sec": int(spark_conf.get("request_timeout_sec", "30")), - "max_access_token_staleness": int(spark_conf.get("azure_max_access_token_staleness", "3300")), + "max_access_token_staleness": int( + spark_conf.get("azure_max_access_token_staleness", "3300") + ), "client_id": getParam(spark_conf, "azure_client_id"), "client_secret": getParam(spark_conf, "azure_client_secret"), "tenant_id": getParam(spark_conf, "azure_tenant_id"), "host_name": getParam(spark_conf, "host_name"), - "dcr_immutable_id": getParam(spark_conf, "azure_dcr_immutable_id") + "dcr_immutable_id": getParam(spark_conf, "azure_dcr_immutable_id"), } scope = getParam(spark_conf, "secrets_scope") @@ -193,7 +210,7 @@ def getThirdPartySinkConfigFromSparkConfig(spark_conf): # Auto-generate authorization endpoint if not provided if authorization_endpoint is None: - if common_params['tenant_id'] is None: + if common_params["tenant_id"] is None: raise ValueError( "Either 'azure_tenant_id' must be provided to auto-generate authorization endpoint, " "or 'azure_authorization_endpoint' must be explicitly configured." 
@@ -204,7 +221,7 @@ def getThirdPartySinkConfigFromSparkConfig(spark_conf): # Auto-generate data ingestion endpoints if not provided if not all([metrics_endpoint, logs_endpoint, events_endpoint]): - if common_params['host_name'] is None: + if common_params["host_name"] is None: raise ValueError( "Either 'host_name' must be provided to auto-generate DCE endpoint, " "or all three endpoints (endpoints.metrics, endpoints.logs, endpoints.events) " @@ -212,7 +229,7 @@ def getThirdPartySinkConfigFromSparkConfig(spark_conf): ) dce_endpoint = f"https://{common_params['host_name']}" - if common_params['dcr_immutable_id'] is None: + if common_params["dcr_immutable_id"] is None: raise ValueError( "Either 'dcr_immutable_id' must be provided to auto-generate DCE endpoint, " "or all three endpoints (endpoints.metrics, endpoints.logs, endpoints.events) " @@ -226,7 +243,7 @@ def getThirdPartySinkConfigFromSparkConfig(spark_conf): common_params["endpoints"] = { "metrics": metrics_endpoint, "logs": logs_endpoint, - "events": events_endpoint + "events": events_endpoint, } return common_params @@ -241,30 +258,30 @@ def unix_to_iso(timestamp: int) -> str: dt = datetime.fromtimestamp(ts, tz=timezone.utc) return dt.isoformat().replace("+00:00", "Z") + def timestamp_in_unix_milliseconds(timestamp) -> int: """Convert datetime to Unix timestamp in milliseconds.""" if isinstance(timestamp, datetime): return int(timestamp.timestamp() * 1000) return int(timestamp) + def get_status(status_display: str) -> str: """Map pipeline status to appropriate status level.""" status_lower = status_display.lower() - if status_lower in ['failed', 'error']: - return 'error' - elif status_lower in ['running', 'starting']: - return 'info' - elif status_lower in ['completed', 'success']: - return 'ok' + if status_lower in ["failed", "error"]: + return "error" + elif status_lower in ["running", "starting"]: + return "info" + elif status_lower in ["completed", "success"]: + return "ok" else: - return 'warn' + return "warn" + def serialize_datetime(data): if isinstance(data, dict): - return { - key: serialize_datetime(value) - for key, value in data.items() - } + return {key: serialize_datetime(value) for key, value in data.items()} elif isinstance(data, list): return [serialize_datetime(item) for item in data] elif isinstance(data, datetime): @@ -272,6 +289,7 @@ def serialize_datetime(data): else: return data + def filter_null_fields(data): if isinstance(data, dict): return { @@ -284,12 +302,12 @@ def filter_null_fields(data): else: return data -def enforce_schema(data, schema, path = "root"): + +def enforce_schema(data, schema, path="root"): # Nothing to enforce. 
if schema is None or data is None: return data - schema_type = schema.get("type") if not schema_type: raise ValueError(f"Failed to get type of the object at {path}.") @@ -337,7 +355,9 @@ def enforce_schema(data, schema, path = "root"): f"Additional property '{k}' at {path} does not match any oneOf schema" ) else: - data[k] = enforce_schema(v, additional_properties, f"{path}.{k}") + data[k] = enforce_schema( + v, additional_properties, f"{path}.{k}" + ) return data @@ -346,7 +366,10 @@ def enforce_schema(data, schema, path = "root"): if schema_type != "array": raise ValueError(f"Expected array at {path}, got {type(data).__name__}") items_schema = schema.get("items", {}) - return [enforce_schema(item, items_schema, f"{path}[{i}]") for i, item in enumerate(data)] + return [ + enforce_schema(item, items_schema, f"{path}[{i}]") + for i, item in enumerate(data) + ] # Handle string elif isinstance(data, str): @@ -354,7 +377,9 @@ def enforce_schema(data, schema, path = "root"): raise ValueError(f"Expected string at {path}, got {type(data).__name__}") acceptable_values = schema.get("enum", []) if acceptable_values and data not in acceptable_values: - raise ValueError(f"Invalid value at {path}: {data}. Allowed: {acceptable_values}") + raise ValueError( + f"Invalid value at {path}: {data}. Allowed: {acceptable_values}" + ) max_length = schema.get("maxLength") if max_length and len(data) > max_length: return data[:max_length] @@ -390,12 +415,14 @@ def enforce_schema(data, schema, path = "root"): return data return data + def create_valid_json_or_fail_with_error(data, schema): data = serialize_datetime(data) data = filter_null_fields(data) data = enforce_schema(data, schema) return json.dumps(data) + # ================================================================================ # HTTP Layer # ================================================================================ @@ -403,6 +430,7 @@ def create_valid_json_or_fail_with_error(data, schema): # Global session for connection pooling session: Optional[requests.Session] = None + class HTTPClient: """ HTTP client for batched POST requests using a persistent session. @@ -413,7 +441,9 @@ class HTTPClient: - payload (binary data): Serialized request body. """ - def __init__(self, max_retry_duration_sec: int = 300, request_timeout_sec: int = 30): + def __init__( + self, max_retry_duration_sec: int = 300, request_timeout_sec: int = 30 + ): """ Initialize the HTTP client. @@ -424,7 +454,6 @@ def __init__(self, max_retry_duration_sec: int = 300, request_timeout_sec: int = self.max_retry_duration_sec = max_retry_duration_sec self.request_timeout_sec = request_timeout_sec - def get_session(self) -> requests.Session: """ Get the global session instance. If not present, create a new one. @@ -437,7 +466,9 @@ def get_session(self) -> requests.Session: session = requests.Session() return session - def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: bytes): + def _make_request_with_retry( + self, url: str, headers: Dict[str, str], payload: bytes + ): """ Make a POST request to the provided url. 
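# ----------------------------------------------------------------------------
# Illustrative sketch: a standalone version of the retry-and-compress pattern
# that HTTPClient uses in _make_request_with_retry()/post() below. The function
# name and endpoint URL are placeholders; only the tenacity settings and the
# gzip Content-Encoding mirror the module's behaviour.
# ----------------------------------------------------------------------------
import gzip

import requests
from tenacity import retry, stop_after_delay, wait_exponential


def post_with_retry(
    url: str,
    headers: dict,
    payload: bytes,
    max_retry_duration_sec: int = 300,
    request_timeout_sec: int = 30,
) -> None:
    """POST a gzip-compressed payload, retrying with exponential backoff until a deadline."""

    def _send() -> None:
        send_headers = {**headers, "Content-Encoding": "gzip"}
        response = requests.post(
            url,
            headers=send_headers,
            data=gzip.compress(payload),
            timeout=request_timeout_sec,
        )
        response.raise_for_status()  # raise on HTTP errors so tenacity retries them

    retry_wrapper = retry(
        stop=stop_after_delay(max_retry_duration_sec),  # overall retry deadline
        wait=wait_exponential(multiplier=1, min=1, max=10),  # backoff between attempts
        reraise=True,  # surface the last underlying error instead of a RetryError
    )
    retry_wrapper(_send)()


# Hypothetical usage with a placeholder ingestion endpoint:
#   post_with_retry(
#       "https://<dce-host>/...",
#       {"Content-Type": "application/json", "Authorization": "Bearer <token>"},
#       b'[{"metric_name": "num_written_rows", "metric_value": 42.0}]',
#   )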
@@ -451,7 +482,7 @@ def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: b """ # Compress payload compressed_payload = gzip.compress(payload) - headers['Content-Encoding'] = 'gzip' + headers["Content-Encoding"] = "gzip" response = None try: @@ -459,19 +490,29 @@ def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: b url, headers=headers, data=compressed_payload, - timeout=self.request_timeout_sec + timeout=self.request_timeout_sec, ) response.raise_for_status() - print(f"Successfully sent request to URL: {url}, Payload: {payload.decode('utf-8')}, Response: {response.text}") + print( + f"Successfully sent request to URL: {url}, Payload: {payload.decode('utf-8')}, Response: {response.text}" + ) except Exception as e: response_text = "No response" if response is not None: try: response_text = str(response.json()) except: - response_text = response.text if hasattr(response, 'text') else "Unable to read response" - print(f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}") - raise type(e)(f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}") from e + response_text = ( + response.text + if hasattr(response, "text") + else "Unable to read response" + ) + print( + f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}" + ) + raise type(e)( + f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}" + ) from e def post(self, http_request_specs_df) -> None: """ @@ -484,22 +525,25 @@ def post(self, http_request_specs_df) -> None: for row in http_request_specs_df.collect(): try: - headers = json.loads(getattr(row, 'header', '{}')) + headers = json.loads(getattr(row, "header", "{}")) retry_wrapper = retry( stop=stop_after_delay(self.max_retry_duration_sec), wait=wait_exponential(multiplier=1, min=1, max=10), - reraise=True + reraise=True, + ) + retry_wrapper(self._make_request_with_retry)( + row.endpoint, headers, row.payloadBytes ) - retry_wrapper(self._make_request_with_retry)(row.endpoint, headers, row.payloadBytes) except Exception as e: print(f"ERROR: {str(e)}") - continue # Continue with other requests regardless of success/failure + continue # Continue with other requests regardless of success/failure # ================================================================================ # CONVERSION LAYER # ================================================================================ + class AzureMonitorMetricsConverter: """Converter class to convert metrics to Azure Monitor format.""" @@ -509,7 +553,8 @@ def create_metric( metric_value: float, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """Create an Azure Monitor metric in the proper format. Args: @@ -526,29 +571,38 @@ def create_metric( ValueError if the fields are of unsupported types. 
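        Example (illustrative values, reusing a metric name from this module):
            _metrics_converter.create_metric(
                metric_name="pipeline.run.total_seconds",
                metric_value=42.0,
                tags={"pipeline_id": "example-pipeline", "metric_type": "duration"},
                timestamp=1700000000000,
            )
            The result is a JSON string carrying TimeGenerated, metric_name,
            metric_value, tags and timestamp, validated against METRICS_SCHEMA.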
""" # Enforce the schema - return create_valid_json_or_fail_with_error({ - "TimeGenerated": unix_to_iso(timestamp), - "metric_name": metric_name, - "metric_value": metric_value, - "tags": tags, - "timestamp": timestamp, - "additional_attributes": additional_attributes - }, METRICS_SCHEMA) - - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + return create_valid_json_or_fail_with_error( + { + "TimeGenerated": unix_to_iso(timestamp), + "metric_name": metric_name, + "metric_value": metric_value, + "tags": tags, + "timestamp": timestamp, + "additional_attributes": additional_attributes, + }, + METRICS_SCHEMA, + ) + + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for metrics.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("metrics", regexp_replace(col("metrics"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("metrics").alias("batch_metrics")) \ - .withColumn("payload", concat(lit('['), - expr("concat_ws(',', batch_metrics)"), - lit(']'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("metrics", regexp_replace(col("metrics"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("metrics").alias("batch_metrics")) + .withColumn( + "payload", + concat(lit("["), expr("concat_ws(',', batch_metrics)"), lit("]")), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) class AzureMonitorEventsConverter: @@ -560,7 +614,8 @@ def create_event( status: str, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """ Create an Azure Monitor event in the proper format. @@ -577,29 +632,39 @@ def create_event( Raises: ValueError if the fields are of unsupported types. 
""" - return create_valid_json_or_fail_with_error({ - "TimeGenerated": unix_to_iso(timestamp), - "message": message, - "status": status, - "tags": tags, - "timestamp": timestamp, - "additional_attributes": additional_attributes, - }, EVENTS_SCHEMA) - - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + return create_valid_json_or_fail_with_error( + { + "TimeGenerated": unix_to_iso(timestamp), + "message": message, + "status": status, + "tags": tags, + "timestamp": timestamp, + "additional_attributes": additional_attributes, + }, + EVENTS_SCHEMA, + ) + + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for events.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("events", regexp_replace(col("events"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("events").alias("batch_events")) \ - .withColumn("payload", concat(lit('['), - expr("concat_ws(',', batch_events)"), - lit(']'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("events", regexp_replace(col("events"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("events").alias("batch_events")) + .withColumn( + "payload", + concat(lit("["), expr("concat_ws(',', batch_events)"), lit("]")), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) + class AzureMonitorLogsConverter: """Converter class to convert logs to Azure Monitor format.""" @@ -610,7 +675,8 @@ def create_log( status: str, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """ Create an Azure Monitor log in the proper format. @@ -627,54 +693,66 @@ def create_log( Raises: ValueError if the fields are of unsupported types. 
""" - return create_valid_json_or_fail_with_error({ - "TimeGenerated": unix_to_iso(timestamp), - "message": message, - "status": status, - "tags": tags, - "timestamp": timestamp, - "additional_attributes": additional_attributes, - }, LOGS_SCHEMA) - - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + return create_valid_json_or_fail_with_error( + { + "TimeGenerated": unix_to_iso(timestamp), + "message": message, + "status": status, + "tags": tags, + "timestamp": timestamp, + "additional_attributes": additional_attributes, + }, + LOGS_SCHEMA, + ) + + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for logs.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("logs", regexp_replace(col("logs"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("logs").alias("batch_logs")) \ - .withColumn("payload", concat(lit('['), - expr("concat_ws(',', batch_logs)"), - lit(']'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("logs", regexp_replace(col("logs"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("logs").alias("batch_logs")) + .withColumn( + "payload", + concat(lit("["), expr("concat_ws(',', batch_logs)"), lit("]")), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) + # ================================================================================ # INFERENCE LAYER # ================================================================================ + def convert_row_to_error_log(row): """Convert a row to error log format.""" params = { "message": str(getattr(row, "message", "")), "status": "error", "tags": { - "pipeline_id": getattr(row, 'pipeline_id', ''), - "pipeline_run_id": getattr(row, 'pipeline_run_id', ''), - "table_name": getattr(row, 'table_name', ''), - "flow_name": getattr(row, 'flow_name', ''), - "level": "error" + "pipeline_id": getattr(row, "pipeline_id", ""), + "pipeline_run_id": getattr(row, "pipeline_run_id", ""), + "table_name": getattr(row, "table_name", ""), + "flow_name": getattr(row, "flow_name", ""), + "level": "error", }, "timestamp": timestamp_in_unix_milliseconds(row.event_timestamp), "additional_attributes": { "pipeline_run_link": getattr(row, "pipeline_run_link", None), "error": getattr(row, "error", None), - } + }, } return _log_converter.create_log(**params) + def convert_row_to_table_metrics(row): """Convert a row to table metrics format.""" # Base tags for all metrics @@ -683,7 +761,7 @@ def convert_row_to_table_metrics(row): "pipeline_run_id": getattr(row, "pipeline_run_id", ""), "table_name": getattr(row, "table_name", ""), "flow_name": getattr(row, "flow_name", ""), - "source": SOURCE_NAME + "source": SOURCE_NAME, } # Timestamp for all metrics @@ -695,28 +773,29 @@ def convert_row_to_table_metrics(row): metric_value=getattr(row, "num_upserted_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + 
additional_attributes={}, ), _metrics_converter.create_metric( metric_name="dlt.table.throughput.deleted_rows", metric_value=getattr(row, "num_deleted_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), _metrics_converter.create_metric( metric_name="dlt.table.throughput.output_rows", metric_value=getattr(row, "num_output_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), ] + def convert_row_to_pipeline_status_event(row): """Convert a row to pipeline status event format.""" # Determine pipeline status for message - status_display = row.latest_state.upper() if row.latest_state else 'UNKNOWN' + status_display = row.latest_state.upper() if row.latest_state else "UNKNOWN" pipeline_id = getattr(row, "pipeline_id", "") params = { @@ -727,7 +806,7 @@ def convert_row_to_pipeline_status_event(row): "latest_run_id": getattr(row, "pipeline_run_id", ""), "status": status_display.lower(), "source": SOURCE_NAME, - "service": SERVICE_NAME + "service": SERVICE_NAME, }, "timestamp": timestamp_in_unix_milliseconds(row.updated_at), "additional_attributes": { @@ -736,15 +815,17 @@ def convert_row_to_pipeline_status_event(row): "is_complete": getattr(row, "is_complete", None), "running_start_time": getattr(row, "running_start_time", None), "end_time": getattr(row, "end_time", None), - "updated_at": getattr(row, "updated_at", None) , + "updated_at": getattr(row, "updated_at", None), "latest_error_log_message": getattr(row, "latest_error_log_message", None), "latest_error_message": getattr(row, "latest_error_message", None), - } + }, } return _events_converter.create_event(**params) + def convert_row_to_pipeline_metrics(row): """Convert a row to pipeline metrics format.""" + def has_attr(obj, attr): return hasattr(obj, attr) and getattr(obj, attr) is not None @@ -754,7 +835,7 @@ def has_attr(obj, attr): base_tags = { "pipeline_id": getattr(row, "pipeline_id", ""), "pipeline_run_id": getattr(row, "pipeline_run_id", ""), - "source": SOURCE_NAME + "source": SOURCE_NAME, } metrics = [] timestamp = timestamp_in_unix_milliseconds(getattr(row, "create_time", None)) @@ -763,55 +844,68 @@ def has_attr(obj, attr): # Starting seconds: queued_time - create_time starting_seconds = (row.queued_time - row.create_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.starting_seconds", - metric_value=starting_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "starting"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.starting_seconds", + metric_value=starting_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "starting"}, + timestamp=timestamp, + ) + ) # Seconds waiting for resources: initialization_start_time - queued_time if not has_attr(row, "initialization_start_time"): return metrics waiting_seconds = (row.initialization_start_time - row.queued_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.waiting_for_resources_seconds", - metric_value=waiting_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "waiting"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.waiting_for_resources_seconds", + metric_value=waiting_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "waiting"}, + 
timestamp=timestamp, + ) + ) # Initialization seconds: running_start_time - initialization_start_time if not has_attr(row, "running_start_time"): return metrics - initialization_seconds = (row.running_start_time - row.initialization_start_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.initialization_seconds", - metric_value=initialization_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "initialization"}, - timestamp=timestamp - )) + initialization_seconds = ( + row.running_start_time - row.initialization_start_time + ).total_seconds() + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.initialization_seconds", + metric_value=initialization_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "initialization"}, + timestamp=timestamp, + ) + ) # Running seconds: end_time - running_start_time running_seconds = (end_time - row.running_start_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.running_seconds", - metric_value=running_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "running"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.running_seconds", + metric_value=running_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "running"}, + timestamp=timestamp, + ) + ) # Total seconds: end_time - create_time total_seconds = (end_time - row.create_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.total_seconds", - metric_value=total_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "total"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.total_seconds", + metric_value=total_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "total"}, + timestamp=timestamp, + ) + ) return metrics + # ================================================================================ # MAIN # ================================================================================ @@ -822,33 +916,38 @@ def has_attr(obj, attr): http_client = None + + def getClient(config): """Global HTTP client getter.""" global http_client if http_client is None: http_client = HTTPClient( max_retry_duration_sec=config["max_retry_duration_sec"], - request_timeout_sec=config["request_timeout_sec"] + request_timeout_sec=config["request_timeout_sec"], ) return http_client - -token_details = { - "access_token": None, - "last_load_timestamp": 0 -} +token_details = {"access_token": None, "last_load_timestamp": 0} token_lock = threading.Lock() + def get_access_token(config): global token_details now = int(datetime.now(timezone.utc).timestamp()) - if ((now - token_details["last_load_timestamp"]) < config["max_access_token_staleness"]): + if (now - token_details["last_load_timestamp"]) < config[ + "max_access_token_staleness" + ]: return token_details["access_token"] # Token does not exist or is stale, fetch a new one. 
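    # Double-checked locking: the staleness test is repeated under token_lock so
    # that concurrent callers do not all refresh at once; only the first thread
    # through the lock calls fetch_access_token, and the others reuse the value
    # it stored in token_details.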
with token_lock: - if token_details["access_token"] is None or (now - token_details["last_load_timestamp"]) >= config["max_access_token_staleness"]: + if ( + token_details["access_token"] is None + or (now - token_details["last_load_timestamp"]) + >= config["max_access_token_staleness"] + ): token_details = fetch_access_token(config) return token_details["access_token"] @@ -857,12 +956,16 @@ def register_sink_for_pipeline_events(): @dlt.foreach_batch_sink(name="send_pipeline_status_to_3p_monitoring") def send_pipeline_status_to_3p_monitoring(batch_df, batch_id): destination_format_udf = udf(convert_row_to_pipeline_status_event, StringType()) - events_df = batch_df.withColumn("events", destination_format_udf(struct("*"))).select("events").filter(col("events").isNotNull()) + events_df = ( + batch_df.withColumn("events", destination_format_udf(struct("*"))) + .select("events") + .filter(col("events").isNotNull()) + ) http_request_spec = _events_converter.create_http_requests_spec( events_df, _global_config["num_rows_per_batch"], get_azure_header(get_access_token(_global_config)), - _global_config["endpoints"]["events"] + _global_config["endpoints"]["events"], ) getClient(_global_config).post(http_request_spec) @@ -875,29 +978,44 @@ def register_sink_for_errors(): @dlt.foreach_batch_sink(name="send_errors_to_3p_monitoring") def send_errors_to_3p_monitoring(batch_df, batch_id): destination_format_udf = udf(convert_row_to_error_log, StringType()) - logs_df = batch_df.withColumn("logs", destination_format_udf(struct("*"))).select("logs").filter(col("logs").isNotNull()) + logs_df = ( + batch_df.withColumn("logs", destination_format_udf(struct("*"))) + .select("logs") + .filter(col("logs").isNotNull()) + ) http_request_spec = _log_converter.create_http_requests_spec( logs_df, _global_config["num_rows_per_batch"], get_azure_header(get_access_token(_global_config)), - _global_config["endpoints"]["logs"] + _global_config["endpoints"]["logs"], ) getClient(_global_config).post(http_request_spec) @dlt.append_flow(target="send_errors_to_3p_monitoring") def send_errors_to_sink(): - return spark.readStream.option("skipChangeCommits", "true").table(event_logs_bronze).filter("error IS NOT NULL OR level = 'ERROR'") + return ( + spark.readStream.option("skipChangeCommits", "true") + .table(event_logs_bronze) + .filter("error IS NOT NULL OR level = 'ERROR'") + ) + def register_sink_for_pipeline_metrics(): @dlt.foreach_batch_sink(name="send_pipeline_metrics_to_3p_monitoring") def send_pipeline_metrics_to_3p_monitoring(batch_df, batch_id): - destination_format_udf = udf(convert_row_to_pipeline_metrics, ArrayType(StringType())) - metrics_df = batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))).select(explode("metrics_array").alias("metrics")).filter(col("metrics").isNotNull()) + destination_format_udf = udf( + convert_row_to_pipeline_metrics, ArrayType(StringType()) + ) + metrics_df = ( + batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))) + .select(explode("metrics_array").alias("metrics")) + .filter(col("metrics").isNotNull()) + ) http_request_spec = _metrics_converter.create_http_requests_spec( metrics_df, _global_config["num_rows_per_batch"], get_azure_header(get_access_token(_global_config)), - _global_config["endpoints"]["metrics"] + _global_config["endpoints"]["metrics"], ) getClient(_global_config).post(http_request_spec) @@ -905,34 +1023,48 @@ def send_pipeline_metrics_to_3p_monitoring(batch_df, batch_id): def send_pipeline_metrics_to_sink(): return 
spark.readStream.table(f"{pipeline_runs_status}_cdf") + def register_sink_for_table_metrics(): @dlt.foreach_batch_sink(name="send_table_metrics_to_3p_monitoring") def send_table_metrics_to_3p_monitoring(batch_df, batch_id): - destination_format_udf = udf(convert_row_to_table_metrics, ArrayType(StringType())) - metrics_df = batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))).select(explode("metrics_array").alias("metrics")).filter(col("metrics").isNotNull()) + destination_format_udf = udf( + convert_row_to_table_metrics, ArrayType(StringType()) + ) + metrics_df = ( + batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))) + .select(explode("metrics_array").alias("metrics")) + .filter(col("metrics").isNotNull()) + ) http_request_spec = _metrics_converter.create_http_requests_spec( metrics_df, _global_config["num_rows_per_batch"], get_azure_header(get_access_token(_global_config)), - _global_config["endpoints"]["metrics"] + _global_config["endpoints"]["metrics"], ) getClient(_global_config).post(http_request_spec) @dlt.append_flow(target="send_table_metrics_to_3p_monitoring") def send_table_metrics_to_sink(): - return spark.readStream.option("skipChangeCommits", "true").table(event_logs_bronze) \ - .filter("table_name is not null AND details:flow_progress.metrics is not null AND event_type = 'flow_progress'") \ + return ( + spark.readStream.option("skipChangeCommits", "true") + .table(event_logs_bronze) + .filter( + "table_name is not null AND details:flow_progress.metrics is not null AND event_type = 'flow_progress'" + ) .selectExpr( - "pipeline_id", - "pipeline_run_id", - "table_name", - "flow_name", - "event_timestamp", - "details:flow_progress.metrics.num_upserted_rows::bigint as num_upserted_rows", - "details:flow_progress.metrics.num_deleted_rows::bigint as num_deleted_rows", - "(details:flow_progress.metrics.num_upserted_rows::bigint + details:flow_progress.metrics.num_deleted_rows::bigint) as num_output_rows" - ) \ - .filter("num_upserted_rows is not null OR num_deleted_rows is not null OR num_output_rows is not null") + "pipeline_id", + "pipeline_run_id", + "table_name", + "flow_name", + "event_timestamp", + "details:flow_progress.metrics.num_upserted_rows::bigint as num_upserted_rows", + "details:flow_progress.metrics.num_deleted_rows::bigint as num_deleted_rows", + "(details:flow_progress.metrics.num_upserted_rows::bigint + details:flow_progress.metrics.num_deleted_rows::bigint) as num_output_rows", + ) + .filter( + "num_upserted_rows is not null OR num_deleted_rows is not null OR num_output_rows is not null" + ) + ) # ================================================================================ @@ -945,4 +1077,4 @@ def send_table_metrics_to_sink(): register_sink_for_errors() register_sink_for_pipeline_events() register_sink_for_table_metrics() - register_sink_for_pipeline_metrics() \ No newline at end of file + register_sink_for_pipeline_metrics() diff --git a/contrib/databricks_ingestion_monitoring/common/third_party_sinks/datadog_sink.py b/contrib/databricks_ingestion_monitoring/common/third_party_sinks/datadog_sink.py index d3a47c3..2adf279 100644 --- a/contrib/databricks_ingestion_monitoring/common/third_party_sinks/datadog_sink.py +++ b/contrib/databricks_ingestion_monitoring/common/third_party_sinks/datadog_sink.py @@ -13,7 +13,17 @@ from datetime import datetime, timezone from pyspark.sql import SparkSession from pyspark.sql.types import StringType, ArrayType -from pyspark.sql.functions import lit, col, collect_list, concat, expr, 
udf, struct, explode, regexp_replace +from pyspark.sql.functions import ( + lit, + col, + collect_list, + concat, + expr, + udf, + struct, + explode, + regexp_replace, +) import dlt # Global Configuration. @@ -32,14 +42,11 @@ "type": "object", "required": ["metric", "points", "type"], "properties": { - "metric": { - "type": "string", - "description": "The name of the timeseries." - }, + "metric": {"type": "string", "description": "The name of the timeseries."}, "type": { "type": "integer", "enum": [3], - "description": "The type of metric. 0=unspecified, 1=count, 2=rate, 3=gauge." + "description": "The type of metric. 0=unspecified, 1=count, 2=rate, 3=gauge.", }, "points": { "type": "array", @@ -50,23 +57,21 @@ "properties": { "timestamp": { "type": "integer", - "description": "The timestamp should be in seconds, not more than 10 minutes in the future or more than 1 hour in the past." + "description": "The timestamp should be in seconds, not more than 10 minutes in the future or more than 1 hour in the past.", }, "value": { "type": "number", - "description": "The numeric value format should be a 64bit float gauge-type value." - } - } - } + "description": "The numeric value format should be a 64bit float gauge-type value.", + }, + }, + }, }, "tags": { "type": "array", - "items": { - "type": "string" - }, - "description": "A list of tags associated with the metric." - } - } + "items": {"type": "string"}, + "description": "A list of tags associated with the metric.", + }, + }, } LOGS_SCHEMA = { @@ -75,30 +80,27 @@ "properties": { "message": { "type": "string", - "description": "The message reserved attribute of your log." + "description": "The message reserved attribute of your log.", }, "ddsource": { "type": "string", - "description": "The integration name associated with your log: the technology from which the log originated." - }, - "ddtags": { - "type": "string", - "description": "Tags associated with your logs." + "description": "The integration name associated with your log: the technology from which the log originated.", }, + "ddtags": {"type": "string", "description": "Tags associated with your logs."}, "timestamp": { "type": "integer", - "description": "Unix timestamp for the log entry." + "description": "Unix timestamp for the log entry.", }, "status": { "type": "string", - "description": "The status/level of the log entry." + "description": "The status/level of the log entry.", }, "service": { "type": "string", - "description": "The name of the application or service generating the log events." 
- } + "description": "The name of the application or service generating the log events.", + }, }, - "additionalProperties": True + "additionalProperties": True, } EVENTS_SCHEMA = { @@ -109,37 +111,30 @@ "type": "object", "required": ["type", "attributes"], "properties": { - "type": { - "type": "string", - "enum": ["event"] - }, + "type": {"type": "string", "enum": ["event"]}, "attributes": { "type": "object", - "required": ["category", "title", "message", "timestamp", "tags", "attributes"], + "required": [ + "category", + "title", + "message", + "timestamp", + "tags", + "attributes", + ], "properties": { - "category": { - "type": "string", - "enum": ["alert"] - }, - "title": { - "type": "string", - "maxLength": 500 - }, - "message": { - "type": "string", - "maxLength": 2000 - }, + "category": {"type": "string", "enum": ["alert"]}, + "title": {"type": "string", "maxLength": 500}, + "message": {"type": "string", "maxLength": 2000}, "timestamp": { "type": "string", "format": "date-time", - "description": "ISO 8601 timestamp, must be within 18 hours." + "description": "ISO 8601 timestamp, must be within 18 hours.", }, "tags": { "type": "array", "maxItems": 100, - "items": { - "type": "string" - } + "items": {"type": "string"}, }, "attributes": { "type": "object", @@ -147,28 +142,29 @@ "properties": { "status": { "type": "string", - "enum": ["warn", "error", "ok"] + "enum": ["warn", "error", "ok"], }, "custom": { "type": "object", "description": "Custom key-value attributes for the event.", "additionalProperties": { "type": ["string", "number", "boolean", "null"] - } - } - } - } - } - } - } + }, + }, + }, + }, + }, + }, + }, } - } + }, } # ================================================================================ # UTILITIES # ================================================================================ + def get_datadog_headers(api_key: str): """Get headers for the Datadog API.""" return {"Content-Type": "application/json", "DD-API-KEY": api_key} @@ -183,12 +179,14 @@ def initialize_global_config(spark_conf): _events_converter = DatadogEventsConverter() _metrics_converter = DatadogMetricsConverter() + def getParam(spark_conf, key: str, default=None): value = spark_conf.get(key, default) if value == "" or value is None: return None return value + def getThirdPartySinkConfigFromSparkConfig(spark_conf): """ Extract and merge configuration from Spark configuration and secret scope. 
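For orientation, this is roughly the shape of a single gauge entry accepted by METRICS_SCHEMA above (all names and values are illustrative); the converter further below batches such entries and posts them wrapped in a {"series": [...]} envelope, with get_datadog_headers supplying Content-Type and DD-API-KEY:

example_series_entry = {
    "metric": "dlt.table.throughput.output_rows",
    "type": 3,  # gauge
    "points": [{"timestamp": 1700000000, "value": 128.0}],  # seconds, per the schema description
    "tags": ["pipeline_id:example-pipeline", "metric_type:count"],
}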
@@ -253,7 +251,9 @@ def getThirdPartySinkConfigFromSparkConfig(spark_conf): if logs_endpoint is None: logs_endpoint = f"https://http-intake.logs.{host_name}/api/v2/logs" if events_endpoint is None: - events_endpoint = f"https://event-management-intake.{host_name}/api/v2/events" + events_endpoint = ( + f"https://event-management-intake.{host_name}/api/v2/events" + ) common_params["endpoints"] = { "metrics": metrics_endpoint, @@ -273,28 +273,28 @@ def unix_to_iso(timestamp: int) -> str: dt = datetime.fromtimestamp(ts, tz=timezone.utc) return dt.isoformat().replace("+00:00", "Z") + def timestamp_in_unix_milliseconds(timestamp) -> int: """Convert datetime to Unix timestamp in milliseconds.""" if isinstance(timestamp, datetime): return int(timestamp.timestamp() * 1000) return int(timestamp) + def get_status(status_display: str) -> str: """Map pipeline status to appropriate status level.""" status_lower = status_display.lower() - if status_lower in ['failed', 'error']: - return 'error' - elif status_lower in ['running', 'starting', 'completed', 'success']: - return 'ok' + if status_lower in ["failed", "error"]: + return "error" + elif status_lower in ["running", "starting", "completed", "success"]: + return "ok" else: - return 'warn' + return "warn" + def serialize_datetime(data): if isinstance(data, dict): - return { - key: serialize_datetime(value) - for key, value in data.items() - } + return {key: serialize_datetime(value) for key, value in data.items()} elif isinstance(data, list): return [serialize_datetime(item) for item in data] elif isinstance(data, datetime): @@ -302,6 +302,7 @@ def serialize_datetime(data): else: return data + def filter_null_fields(data): if isinstance(data, dict): return { @@ -314,7 +315,8 @@ def filter_null_fields(data): else: return data -def enforce_schema(data, schema, path = "root"): + +def enforce_schema(data, schema, path="root"): # Nothing to enforce. if schema is None or data is None: return data @@ -357,7 +359,9 @@ def enforce_schema(data, schema, path = "root"): schema_type = allowed_type break else: - raise ValueError(f"Value at {path} does not match any allowed types: {schema_type}") + raise ValueError( + f"Value at {path} does not match any allowed types: {schema_type}" + ) # Validate dictionary if isinstance(data, dict): @@ -402,7 +406,9 @@ def enforce_schema(data, schema, path = "root"): f"Additional property '{k}' at {path} does not match any oneOf schema" ) else: - data[k] = enforce_schema(v, additional_properties, f"{path}.{k}") + data[k] = enforce_schema( + v, additional_properties, f"{path}.{k}" + ) return data @@ -411,7 +417,10 @@ def enforce_schema(data, schema, path = "root"): if schema_type != "array": raise ValueError(f"Expected array at {path}, got {type(data).__name__}") items_schema = schema.get("items", {}) - return [enforce_schema(item, items_schema, f"{path}[{i}]") for i, item in enumerate(data)] + return [ + enforce_schema(item, items_schema, f"{path}[{i}]") + for i, item in enumerate(data) + ] # Handle string elif isinstance(data, str): @@ -419,7 +428,9 @@ def enforce_schema(data, schema, path = "root"): raise ValueError(f"Expected string at {path}, got {type(data).__name__}") acceptable_values = schema.get("enum", []) if acceptable_values and data not in acceptable_values: - raise ValueError(f"Invalid value at {path}: {data}. Allowed: {acceptable_values}") + raise ValueError( + f"Invalid value at {path}: {data}. 
Allowed: {acceptable_values}" + ) max_length = schema.get("maxLength") if max_length and len(data) > max_length: return data[:max_length] @@ -453,12 +464,14 @@ def enforce_schema(data, schema, path = "root"): return data return data + def create_valid_json_or_fail_with_error(data, schema): data = serialize_datetime(data) data = filter_null_fields(data) data = enforce_schema(data, schema) return json.dumps(data) + # ================================================================================ # HTTP Layer # ================================================================================ @@ -466,6 +479,7 @@ def create_valid_json_or_fail_with_error(data, schema): # Global session for connection pooling session: Optional[requests.Session] = None + class HTTPClient: """ HTTP client for batched POST requests using a persistent session. @@ -476,7 +490,9 @@ class HTTPClient: - payload (binary data): Serialized request body. """ - def __init__(self, max_retry_duration_sec: int = 300, request_timeout_sec: int = 30): + def __init__( + self, max_retry_duration_sec: int = 300, request_timeout_sec: int = 30 + ): """ Initialize the HTTP client. @@ -487,7 +503,6 @@ def __init__(self, max_retry_duration_sec: int = 300, request_timeout_sec: int = self.max_retry_duration_sec = max_retry_duration_sec self.request_timeout_sec = request_timeout_sec - def get_session(self) -> requests.Session: """ Get the global session instance. If not present, create a new one. @@ -500,7 +515,9 @@ def get_session(self) -> requests.Session: session = requests.Session() return session - def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: bytes): + def _make_request_with_retry( + self, url: str, headers: Dict[str, str], payload: bytes + ): """ Make a POST request to the provided url. 
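Both sinks delegate retries to tenacity rather than hand-rolling a loop; stripped to its core, the pattern used in HTTPClient.post looks roughly like this sketch (not the sink's exact code):

from tenacity import retry, stop_after_delay, wait_exponential

def flaky_post():
    ...  # stand-in for the real HTTP call

retry_wrapper = retry(
    stop=stop_after_delay(300),                          # give up after ~5 minutes
    wait=wait_exponential(multiplier=1, min=1, max=10),  # back off 1s..10s between attempts
    reraise=True,                                        # surface the last exception
)
retry_wrapper(flaky_post)()

Because reraise=True, a request that keeps failing still raises in the end; the surrounding loop in post() catches it, logs the failure, and moves on to the next batch.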
@@ -514,7 +531,7 @@ def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: b """ # Compress payload compressed_payload = gzip.compress(payload) - headers['Content-Encoding'] = 'gzip' + headers["Content-Encoding"] = "gzip" response = None try: @@ -522,22 +539,34 @@ def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: b url, headers=headers, data=compressed_payload, - timeout=self.request_timeout_sec + timeout=self.request_timeout_sec, ) if response.status_code >= 400 and response.status_code < 500: - logging.warning(f"Ignoring client-side error for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Response: {response.text}") + logging.warning( + f"Ignoring client-side error for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Response: {response.text}" + ) else: response.raise_for_status() - logging.debug(f"Successfully sent request to URL: {url}, Payload: {payload.decode('utf-8')}, Response: {response.text}") + logging.debug( + f"Successfully sent request to URL: {url}, Payload: {payload.decode('utf-8')}, Response: {response.text}" + ) except Exception as e: response_text = "No response" if response is not None: try: response_text = str(response.json()) except: - response_text = response.text if hasattr(response, 'text') else "Unable to read response" - logging.error(f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}") - raise type(e)(f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}") from e + response_text = ( + response.text + if hasattr(response, "text") + else "Unable to read response" + ) + logging.error( + f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}" + ) + raise type(e)( + f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}" + ) from e def post(self, http_request_specs_df) -> None: """ @@ -550,34 +579,45 @@ def post(self, http_request_specs_df) -> None: rows = http_request_specs_df.collect() total_requests = len(rows) logging.info(f"[HTTPClient] Starting to send {total_requests} HTTP request(s)") - + success_count = 0 failure_count = 0 - + for idx, row in enumerate(rows, 1): try: - logging.info(f"[HTTPClient] Sending request {idx}/{total_requests} to {row.endpoint}") - headers = json.loads(getattr(row, 'header', '{}')) + logging.info( + f"[HTTPClient] Sending request {idx}/{total_requests} to {row.endpoint}" + ) + headers = json.loads(getattr(row, "header", "{}")) retry_wrapper = retry( stop=stop_after_delay(self.max_retry_duration_sec), wait=wait_exponential(multiplier=1, min=1, max=10), - reraise=True + reraise=True, + ) + retry_wrapper(self._make_request_with_retry)( + row.endpoint, headers, row.payloadBytes ) - retry_wrapper(self._make_request_with_retry)(row.endpoint, headers, row.payloadBytes) success_count += 1 - logging.info(f"[HTTPClient] Successfully sent request {idx}/{total_requests}") + logging.info( + f"[HTTPClient] Successfully sent request {idx}/{total_requests}" + ) except Exception as e: failure_count += 1 - logging.error(f"[HTTPClient] Failed to send request {idx}/{total_requests}: {str(e)}") - continue # Continue with other requests regardless of success/failure - - logging.info(f"[HTTPClient] Completed sending requests: 
{success_count} succeeded, {failure_count} failed out of {total_requests} total") + logging.error( + f"[HTTPClient] Failed to send request {idx}/{total_requests}: {str(e)}" + ) + continue # Continue with other requests regardless of success/failure + + logging.info( + f"[HTTPClient] Completed sending requests: {success_count} succeeded, {failure_count} failed out of {total_requests} total" + ) # ================================================================================ # CONVERSION LAYER # ================================================================================ + class DatadogMetricsConverter: """Converter class to convert metrics to datadog format.""" @@ -587,7 +627,8 @@ def create_metric( metric_value: float, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """Create a Datadog Gauge metric in the proper format. Args: @@ -609,43 +650,55 @@ def create_metric( "metric": metric_name, "type": 3, # Gauge metric type "points": [{"timestamp": timestamp, "value": metric_value}], - "tags": [f"{k}:{v}" for k, v in tags.items()] + "tags": [f"{k}:{v}" for k, v in tags.items()], } # Add additional attributes if provided if additional_attributes: - metric["tags"].extend([f"{k}:{v}" for k, v in additional_attributes.items()]) + metric["tags"].extend( + [f"{k}:{v}" for k, v in additional_attributes.items()] + ) # Enforce the schema return create_valid_json_or_fail_with_error(metric, METRICS_SCHEMA) - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for metrics.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("metrics", regexp_replace(col("metrics"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("metrics").alias("batch_metrics")) \ - .withColumn("payload", concat(lit('{"series": ['), - expr("concat_ws(',', batch_metrics)"), - lit(']}'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("metrics", regexp_replace(col("metrics"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("metrics").alias("batch_metrics")) + .withColumn( + "payload", + concat( + lit('{"series": ['), + expr("concat_ws(',', batch_metrics)"), + lit("]}"), + ), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) class DatadogEventsConverter: """Converter class to convert events to datadog format.""" - def create_event( self, title: str, status: str, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """ Create a Datadog event in the proper format. 
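The batching logic shared by the create_http_requests_spec methods can be read in isolation: rows of pre-rendered JSON strings are grouped into fixed-size batches and each batch is concatenated into one JSON-array payload. A standalone sketch, assuming a local SparkSession and a column named "metrics":

from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, concat, expr, lit

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('{"metric": "a"}',), ('{"metric": "b"}',)], ["metrics"])
num_rows_per_batch = 100

batched = (
    df.withColumn(
        "batch_id",
        expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"),
    )
    .groupBy("batch_id")
    .agg(collect_list("metrics").alias("batch_metrics"))
    .withColumn(
        "payload", concat(lit("["), expr("concat_ws(',', batch_metrics)"), lit("]"))
    )
)
batched.select("payload").show(truncate=False)  # one JSON array string per batch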
@@ -673,30 +726,33 @@ def create_event( "tags": [f"{k}:{v}" for k, v in tags.items()], # Limit to 100 tags "attributes": { "status": status, - "custom": { - "source": SOURCE_NAME, - "service": SERVICE_NAME - } - } - } + "custom": {"source": SOURCE_NAME, "service": SERVICE_NAME}, + }, + }, } } # Add additional attributes if provided if additional_attributes: - event["data"]["attributes"]["attributes"]["custom"].update(additional_attributes) + event["data"]["attributes"]["attributes"]["custom"].update( + additional_attributes + ) # Enforce the schema return create_valid_json_or_fail_with_error(event, EVENTS_SCHEMA) - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for events.""" - return df \ - .withColumn("events", regexp_replace(col("events"), "\n", "")) \ - .withColumn("payloadBytes", col("events").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + return ( + df.withColumn("events", regexp_replace(col("events"), "\n", "")) + .withColumn("payloadBytes", col("events").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) + class DatadogLogsConverter: """Converter class to convert metrics to datadog format.""" @@ -707,7 +763,8 @@ def create_log( status: str, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """ Create a Datadog log in the proper format. @@ -732,7 +789,7 @@ def create_log( "ddtags": ",".join([f"{k}:{v}" for k, v in tags.items()]), "timestamp": timestamp, "status": status, - "service": SERVICE_NAME + "service": SERVICE_NAME, } # Add additional attributes if provided @@ -742,46 +799,54 @@ def create_log( # Enforce the schema return create_valid_json_or_fail_with_error(log, LOGS_SCHEMA) - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for logs.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("logs", regexp_replace(col("logs"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("logs").alias("batch_logs")) \ - .withColumn("payload", concat(lit('['), - expr("concat_ws(',', batch_logs)"), - lit(']'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("logs", regexp_replace(col("logs"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("logs").alias("batch_logs")) + .withColumn( + "payload", + concat(lit("["), expr("concat_ws(',', batch_logs)"), lit("]")), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) # 
================================================================================ # INFERENCE LAYER # ================================================================================ + def convert_row_to_error_log(row): """Convert a row to error log format.""" params = { "title": str(getattr(row, "error_message", "")), "status": "error", "tags": { - "pipeline_id": getattr(row, 'pipeline_id', ''), - "pipeline_run_id": getattr(row, 'pipeline_run_id', ''), - "table_name": getattr(row, 'table_name', ''), - "flow_name": getattr(row, 'flow_name', ''), - "level": "error" + "pipeline_id": getattr(row, "pipeline_id", ""), + "pipeline_run_id": getattr(row, "pipeline_run_id", ""), + "table_name": getattr(row, "table_name", ""), + "flow_name": getattr(row, "flow_name", ""), + "level": "error", }, "timestamp": timestamp_in_unix_milliseconds(row.event_timestamp), "additional_attributes": { "pipeline_run_link": getattr(row, "pipeline_run_link", None), "error": getattr(row, "error_code", None), - } + }, } return _log_converter.create_log(**params) + def convert_row_to_table_metrics(row): """Convert a row to table metrics format.""" # Base tags for all metrics @@ -790,7 +855,7 @@ def convert_row_to_table_metrics(row): "pipeline_run_id": getattr(row, "pipeline_run_id", ""), "table_name": getattr(row, "table_name", ""), "flow_name": getattr(row, "flow_name", ""), - "source": SOURCE_NAME + "source": SOURCE_NAME, } # Timestamp for all metrics @@ -802,28 +867,29 @@ def convert_row_to_table_metrics(row): metric_value=getattr(row, "num_upserted_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), _metrics_converter.create_metric( metric_name="dlt.table.throughput.deleted_rows", metric_value=getattr(row, "num_deleted_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), _metrics_converter.create_metric( metric_name="dlt.table.throughput.output_rows", metric_value=getattr(row, "num_output_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), ] + def convert_row_to_pipeline_status_event(row): """Convert a row to pipeline status event format.""" # Determine pipeline status for title - status_display = row.latest_state.upper() if row.latest_state else 'UNKNOWN' + status_display = row.latest_state.upper() if row.latest_state else "UNKNOWN" pipeline_id = getattr(row, "pipeline_id", "") params = { @@ -834,7 +900,7 @@ def convert_row_to_pipeline_status_event(row): "latest_run_id": getattr(row, "pipeline_run_id", ""), "status": status_display.lower(), "source": SOURCE_NAME, - "service": SERVICE_NAME + "service": SERVICE_NAME, }, "timestamp": timestamp_in_unix_milliseconds(row.updated_at), "additional_attributes": { @@ -843,15 +909,17 @@ def convert_row_to_pipeline_status_event(row): "is_complete": getattr(row, "is_complete", None), "running_start_time": getattr(row, "running_start_time", None), "end_time": getattr(row, "end_time", None), - "updated_at": getattr(row, "updated_at", None) , + "updated_at": getattr(row, "updated_at", None), "latest_error_log_message": getattr(row, "latest_error_log_message", None), "latest_error_message": getattr(row, "latest_error_message", None), - } + }, } return _events_converter.create_event(**params) + def convert_row_to_pipeline_metrics(row): """Convert a row to pipeline metrics format.""" + def has_attr(obj, attr): return hasattr(obj, 
attr) and getattr(obj, attr) is not None @@ -861,64 +929,79 @@ def has_attr(obj, attr): base_tags = { "pipeline_id": getattr(row, "pipeline_id", ""), "pipeline_run_id": getattr(row, "pipeline_run_id", ""), - "source": SOURCE_NAME + "source": SOURCE_NAME, } metrics = [] timestamp = timestamp_in_unix_milliseconds(getattr(row, "create_time", None)) - end_time = getattr(row, "end_time", None) or datetime.now(timezone.utc).replace(tzinfo=None) + end_time = getattr(row, "end_time", None) or datetime.now(timezone.utc).replace( + tzinfo=None + ) # Starting seconds: queued_time - create_time starting_seconds = (row.queued_time - row.create_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.starting_seconds", - metric_value=starting_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "starting"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.starting_seconds", + metric_value=starting_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "starting"}, + timestamp=timestamp, + ) + ) # Seconds waiting for resources: initialization_start_time - queued_time if not has_attr(row, "initialization_start_time"): return metrics waiting_seconds = (row.initialization_start_time - row.queued_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.waiting_for_resources_seconds", - metric_value=waiting_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "waiting"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.waiting_for_resources_seconds", + metric_value=waiting_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "waiting"}, + timestamp=timestamp, + ) + ) # Initialization seconds: running_start_time - initialization_start_time if not has_attr(row, "running_start_time"): return metrics - initialization_seconds = (row.running_start_time - row.initialization_start_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.initialization_seconds", - metric_value=initialization_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "initialization"}, - timestamp=timestamp - )) + initialization_seconds = ( + row.running_start_time - row.initialization_start_time + ).total_seconds() + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.initialization_seconds", + metric_value=initialization_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "initialization"}, + timestamp=timestamp, + ) + ) # Running seconds: end_time - running_start_time running_seconds = (end_time - row.running_start_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.running_seconds", - metric_value=running_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "running"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.running_seconds", + metric_value=running_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "running"}, + timestamp=timestamp, + ) + ) # Total seconds: end_time - create_time total_seconds = (end_time - row.create_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.total_seconds", - metric_value=total_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "total"}, - 
timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.total_seconds", + metric_value=total_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "total"}, + timestamp=timestamp, + ) + ) return metrics + # ================================================================================ # MAIN # ================================================================================ @@ -929,41 +1012,57 @@ def has_attr(obj, attr): http_client = None + + def getClient(config): """Global HTTP client getter.""" global http_client if http_client is None: http_client = HTTPClient( max_retry_duration_sec=config["max_retry_duration_sec"], - request_timeout_sec=config["request_timeout_sec"] + request_timeout_sec=config["request_timeout_sec"], ) return http_client + def register_sink_for_pipeline_events(): @dlt.foreach_batch_sink(name="send_pipeline_status_to_3p_monitoring") def send_pipeline_status_to_3p_monitoring(batch_df, batch_id): input_count = batch_df.count() - logging.info(f"[Pipeline Events] Processing batch {batch_id} with {input_count} rows") - + logging.info( + f"[Pipeline Events] Processing batch {batch_id} with {input_count} rows" + ) + destination_format_udf = udf(convert_row_to_pipeline_status_event, StringType()) - events_df = batch_df.withColumn("events", destination_format_udf(struct("*"))).select("events").filter(col("events").isNotNull()).cache() - + events_df = ( + batch_df.withColumn("events", destination_format_udf(struct("*"))) + .select("events") + .filter(col("events").isNotNull()) + .cache() + ) + events_count = events_df.count() - logging.info(f"[Pipeline Events] Converted {events_count} events from {input_count} input rows") - + logging.info( + f"[Pipeline Events] Converted {events_count} events from {input_count} input rows" + ) + if events_count == 0: - logging.info(f"[Pipeline Events] Skipping batch {batch_id} - no events to send") + logging.info( + f"[Pipeline Events] Skipping batch {batch_id} - no events to send" + ) return - + http_request_spec = _events_converter.create_http_requests_spec( events_df, _global_config["num_rows_per_batch"], get_datadog_headers(_global_config["api_key"]), - _global_config["endpoints"]["events"] + _global_config["endpoints"]["events"], ).cache() - + request_count = http_request_spec.count() - logging.info(f"[Pipeline Events] Sending {request_count} HTTP requests for batch {batch_id}") + logging.info( + f"[Pipeline Events] Sending {request_count} HTTP requests for batch {batch_id}" + ) getClient(_global_config).post(http_request_spec) logging.info(f"[Pipeline Events] Completed batch {batch_id}") @@ -977,58 +1076,87 @@ def register_sink_for_errors(): def send_errors_to_3p_monitoring(batch_df, batch_id): input_count = batch_df.count() logging.info(f"[Errors] Processing batch {batch_id} with {input_count} rows") - + destination_format_udf = udf(convert_row_to_error_log, StringType()) - logs_df = batch_df.withColumn("logs", destination_format_udf(struct("*"))).select("logs").filter(col("logs").isNotNull()).cache() - + logs_df = ( + batch_df.withColumn("logs", destination_format_udf(struct("*"))) + .select("logs") + .filter(col("logs").isNotNull()) + .cache() + ) + logs_count = logs_df.count() - logging.info(f"[Errors] Converted {logs_count} error logs from {input_count} input rows") - + logging.info( + f"[Errors] Converted {logs_count} error logs from {input_count} input rows" + ) + if logs_count == 0: logging.info(f"[Errors] Skipping batch {batch_id} - no error logs to send") return 
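            # logs_df and http_request_spec are cached because each is evaluated
            # twice: once for the count that feeds the log lines, and again when
            # the requests are built and posted. Batches with nothing to send
            # return early above, so no request is constructed for them.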
- + http_request_spec = _log_converter.create_http_requests_spec( logs_df, _global_config["num_rows_per_batch"], get_datadog_headers(_global_config["api_key"]), - _global_config["endpoints"]["logs"] + _global_config["endpoints"]["logs"], ).cache() - + request_count = http_request_spec.count() - logging.info(f"[Errors] Sending {request_count} HTTP requests for batch {batch_id}") + logging.info( + f"[Errors] Sending {request_count} HTTP requests for batch {batch_id}" + ) getClient(_global_config).post(http_request_spec) logging.info(f"[Errors] Completed batch {batch_id}") @dlt.append_flow(target="send_errors_to_3p_monitoring") def send_errors_to_sink(): - return spark.readStream.option("skipChangeCommits", "true").table(event_logs_bronze).filter("error_message IS NOT NULL OR level = 'ERROR'") + return ( + spark.readStream.option("skipChangeCommits", "true") + .table(event_logs_bronze) + .filter("error_message IS NOT NULL OR level = 'ERROR'") + ) + def register_sink_for_pipeline_metrics(): @dlt.foreach_batch_sink(name="send_pipeline_metrics_to_3p_monitoring") def send_pipeline_metrics_to_3p_monitoring(batch_df, batch_id): input_count = batch_df.count() - logging.info(f"[Pipeline Metrics] Processing batch {batch_id} with {input_count} rows") - - destination_format_udf = udf(convert_row_to_pipeline_metrics, ArrayType(StringType())) - metrics_df = batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))).select(explode("metrics_array").alias("metrics")).filter(col("metrics").isNotNull()).cache() - + logging.info( + f"[Pipeline Metrics] Processing batch {batch_id} with {input_count} rows" + ) + + destination_format_udf = udf( + convert_row_to_pipeline_metrics, ArrayType(StringType()) + ) + metrics_df = ( + batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))) + .select(explode("metrics_array").alias("metrics")) + .filter(col("metrics").isNotNull()) + .cache() + ) + metrics_count = metrics_df.count() - logging.info(f"[Pipeline Metrics] Converted {metrics_count} metrics from {input_count} input rows") - + logging.info( + f"[Pipeline Metrics] Converted {metrics_count} metrics from {input_count} input rows" + ) + if metrics_count == 0: - logging.info(f"[Pipeline Metrics] Skipping batch {batch_id} - no metrics to send") + logging.info( + f"[Pipeline Metrics] Skipping batch {batch_id} - no metrics to send" + ) return - + http_request_spec = _metrics_converter.create_http_requests_spec( metrics_df, _global_config["num_rows_per_batch"], get_datadog_headers(_global_config["api_key"]), - _global_config["endpoints"]["metrics"] + _global_config["endpoints"]["metrics"], ).cache() - + request_count = http_request_spec.count() - logging.info(f"[Pipeline Metrics] Sending {request_count} HTTP requests for batch {batch_id}") + logging.info( + f"[Pipeline Metrics] Sending {request_count} HTTP requests for batch {batch_id}" + ) getClient(_global_config).post(http_request_spec) logging.info(f"[Pipeline Metrics] Completed batch {batch_id}") @@ -1036,38 +1164,58 @@ def send_pipeline_metrics_to_3p_monitoring(batch_df, batch_id): def send_pipeline_metrics_to_sink(): return spark.readStream.table(f"{pipeline_runs_status}_cdf") + def register_sink_for_table_metrics(): @dlt.foreach_batch_sink(name="send_table_metrics_to_3p_monitoring") def send_table_metrics_to_3p_monitoring(batch_df, batch_id): input_count = batch_df.count() - logging.info(f"[Table Metrics] Processing batch {batch_id} with {input_count} rows") - - destination_format_udf = udf(convert_row_to_table_metrics, 
ArrayType(StringType())) - metrics_df = batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))).select(explode("metrics_array").alias("metrics")).filter(col("metrics").isNotNull()).cache() - + logging.info( + f"[Table Metrics] Processing batch {batch_id} with {input_count} rows" + ) + + destination_format_udf = udf( + convert_row_to_table_metrics, ArrayType(StringType()) + ) + metrics_df = ( + batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))) + .select(explode("metrics_array").alias("metrics")) + .filter(col("metrics").isNotNull()) + .cache() + ) + metrics_count = metrics_df.count() - logging.info(f"[Table Metrics] Converted {metrics_count} metrics from {input_count} input rows") - + logging.info( + f"[Table Metrics] Converted {metrics_count} metrics from {input_count} input rows" + ) + if metrics_count == 0: - logging.info(f"[Table Metrics] Skipping batch {batch_id} - no metrics to send") + logging.info( + f"[Table Metrics] Skipping batch {batch_id} - no metrics to send" + ) return - + http_request_spec = _metrics_converter.create_http_requests_spec( metrics_df, _global_config["num_rows_per_batch"], get_datadog_headers(_global_config["api_key"]), - _global_config["endpoints"]["metrics"] + _global_config["endpoints"]["metrics"], ).cache() - + request_count = http_request_spec.count() - logging.info(f"[Table Metrics] Sending {request_count} HTTP requests for batch {batch_id}") + logging.info( + f"[Table Metrics] Sending {request_count} HTTP requests for batch {batch_id}" + ) getClient(_global_config).post(http_request_spec) logging.info(f"[Table Metrics] Completed batch {batch_id}") @dlt.append_flow(target="send_table_metrics_to_3p_monitoring") def send_table_metrics_to_sink(): - return spark.readStream.option("skipChangeCommits", "true").table(event_logs_bronze) \ - .filter("table_name is not null AND details:flow_progress.metrics is not null AND event_type = 'flow_progress'") \ + return ( + spark.readStream.option("skipChangeCommits", "true") + .table(event_logs_bronze) + .filter( + "table_name is not null AND details:flow_progress.metrics is not null AND event_type = 'flow_progress'" + ) .selectExpr( "pipeline_id", "pipeline_run_id", @@ -1076,9 +1224,13 @@ def send_table_metrics_to_sink(): "event_timestamp", "details:flow_progress.metrics.num_upserted_rows::bigint as num_upserted_rows", "details:flow_progress.metrics.num_deleted_rows::bigint as num_deleted_rows", - "(details:flow_progress.metrics.num_upserted_rows::bigint + details:flow_progress.metrics.num_deleted_rows::bigint) as num_output_rows" - ) \ - .filter("num_upserted_rows is not null OR num_deleted_rows is not null OR num_output_rows is not null") + "(details:flow_progress.metrics.num_upserted_rows::bigint + details:flow_progress.metrics.num_deleted_rows::bigint) as num_output_rows", + ) + .filter( + "num_upserted_rows is not null OR num_deleted_rows is not null OR num_output_rows is not null" + ) + ) + # ================================================================================ # MAIN INITIALIZATION @@ -1090,4 +1242,4 @@ def send_table_metrics_to_sink(): register_sink_for_errors() register_sink_for_pipeline_events() register_sink_for_table_metrics() - register_sink_for_pipeline_metrics() \ No newline at end of file + register_sink_for_pipeline_metrics() diff --git a/contrib/databricks_ingestion_monitoring/common/third_party_sinks/newrelic_sink.py b/contrib/databricks_ingestion_monitoring/common/third_party_sinks/newrelic_sink.py index 940a8e3..68a456d 100644 --- 
a/contrib/databricks_ingestion_monitoring/common/third_party_sinks/newrelic_sink.py +++ b/contrib/databricks_ingestion_monitoring/common/third_party_sinks/newrelic_sink.py @@ -12,7 +12,17 @@ from datetime import datetime, timezone from pyspark.sql import SparkSession from pyspark.sql.types import StringType, ArrayType -from pyspark.sql.functions import lit, col, collect_list, concat, expr, udf, struct, explode, regexp_replace +from pyspark.sql.functions import ( + lit, + col, + collect_list, + concat, + expr, + udf, + struct, + explode, + regexp_replace, +) import dlt # Global Configuration. @@ -31,72 +41,47 @@ "type": "object", "required": ["name", "value", "timestamp"], "properties": { - "name": { - "type": "string", - "maxLength": 255 - }, - "value": { - "oneOf": [ - {"type": "number"}, - {"type": "object"} - ] - }, + "name": {"type": "string", "maxLength": 255}, + "value": {"oneOf": [{"type": "number"}, {"type": "object"}]}, "timestamp": {"type": "integer"}, - "interval.ms": { - "type": "integer", - "minimum": 1 - }, - "type": { - "type": "string", - "enum": ["gauge"] - }, + "interval.ms": {"type": "integer", "minimum": 1}, + "type": {"type": "string", "enum": ["gauge"]}, "attributes": { "type": "object", "additionalProperties": { - "oneOf": [ - {"type": "string"}, - {"type": "number"}, - {"type": "boolean"} - ] - } - } + "oneOf": [{"type": "string"}, {"type": "number"}, {"type": "boolean"}] + }, + }, }, - "additionalProperties": False + "additionalProperties": False, } LOGS_SCHEMA = { "type": "object", "required": ["timestamp", "message"], - "properties": { - "timestamp": {"type": "integer"}, - "message": {"type": "string"} - }, - "additionalProperties": True + "properties": {"timestamp": {"type": "integer"}, "message": {"type": "string"}}, + "additionalProperties": True, } EVENTS_SCHEMA = { "type": "object", "required": ["timestamp"], "maxProperties": 255, - "properties": { - "timestamp": {"type": "integer"} - }, + "properties": {"timestamp": {"type": "integer"}}, "additionalProperties": { "oneOf": [ - { - "type": "string", - "maxLength": 4096 - }, + {"type": "string", "maxLength": 4096}, {"type": "number"}, - {"type": "boolean"} + {"type": "boolean"}, ] - } + }, } # ================================================================================ # UTILITIES # ================================================================================ + def get_newrelic_headers(api_key: str): """Get headers for the NewRelic API.""" return {"Content-Type": "application/json", "Api-key": api_key} @@ -111,12 +96,14 @@ def initialize_global_config(spark_conf): _events_converter = NewRelicEventsConverter() _metrics_converter = NewRelicMetricsConverter() + def getParam(spark_conf, key: str, default=None): value = spark_conf.get(key, default) if value == "" or value is None: return None return value + def getThirdPartySinkConfigFromSparkConfig(spark_conf): """ Extract and merge configuration from Spark configuration and secret scope. 
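METRICS_SCHEMA above describes the shape of a single New Relic gauge record. For orientation, a minimal record that satisfies it looks roughly like the following; every value is made up and only illustrates the schema, it is not data produced by this sink:

import json

# Hypothetical example of a record METRICS_SCHEMA accepts: a gauge with a name,
# numeric value, millisecond timestamp, and flat string attributes.
sample_gauge = {
    "name": "pipeline.run.total_seconds",
    "type": "gauge",
    "value": 42.5,
    "timestamp": 1734620400000,  # Unix epoch in milliseconds
    "attributes": {"pipeline_id": "example-pipeline", "metric_type": "duration"},
}
print(json.dumps(sample_gauge))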
@@ -208,30 +195,30 @@ def unix_to_iso(timestamp: int) -> str: dt = datetime.fromtimestamp(ts, tz=timezone.utc) return dt.isoformat().replace("+00:00", "Z") + def timestamp_in_unix_milliseconds(timestamp) -> int: """Convert datetime to Unix timestamp in milliseconds.""" if isinstance(timestamp, datetime): return int(timestamp.timestamp() * 1000) return int(timestamp) + def get_status(status_display: str) -> str: """Map pipeline status to appropriate status level.""" status_lower = status_display.lower() - if status_lower in ['failed', 'error']: - return 'error' - elif status_lower in ['running', 'starting']: - return 'info' - elif status_lower in ['completed', 'success']: - return 'ok' + if status_lower in ["failed", "error"]: + return "error" + elif status_lower in ["running", "starting"]: + return "info" + elif status_lower in ["completed", "success"]: + return "ok" else: - return 'warn' + return "warn" + def serialize_datetime(data): if isinstance(data, dict): - return { - key: serialize_datetime(value) - for key, value in data.items() - } + return {key: serialize_datetime(value) for key, value in data.items()} elif isinstance(data, list): return [serialize_datetime(item) for item in data] elif isinstance(data, datetime): @@ -239,6 +226,7 @@ def serialize_datetime(data): else: return data + def filter_null_fields(data): if isinstance(data, dict): return { @@ -251,7 +239,8 @@ def filter_null_fields(data): else: return data -def enforce_schema(data, schema, path = "root"): + +def enforce_schema(data, schema, path="root"): # Nothing to enforce. if schema is None or data is None: return data @@ -294,7 +283,9 @@ def enforce_schema(data, schema, path = "root"): schema_type = allowed_type break else: - raise ValueError(f"Value at {path} does not match any allowed types: {schema_type}") + raise ValueError( + f"Value at {path} does not match any allowed types: {schema_type}" + ) # Validate dictionary if isinstance(data, dict): @@ -339,7 +330,9 @@ def enforce_schema(data, schema, path = "root"): f"Additional property '{k}' at {path} does not match any oneOf schema" ) else: - data[k] = enforce_schema(v, additional_properties, f"{path}.{k}") + data[k] = enforce_schema( + v, additional_properties, f"{path}.{k}" + ) return data @@ -348,7 +341,10 @@ def enforce_schema(data, schema, path = "root"): if schema_type != "array": raise ValueError(f"Expected array at {path}, got {type(data).__name__}") items_schema = schema.get("items", {}) - return [enforce_schema(item, items_schema, f"{path}[{i}]") for i, item in enumerate(data)] + return [ + enforce_schema(item, items_schema, f"{path}[{i}]") + for i, item in enumerate(data) + ] # Handle string elif isinstance(data, str): @@ -356,12 +352,14 @@ def enforce_schema(data, schema, path = "root"): raise ValueError(f"Expected string at {path}, got {type(data).__name__}") acceptable_values = schema.get("enum", []) if acceptable_values and data not in acceptable_values: - raise ValueError(f"Invalid value at {path}: {data}. Allowed: {acceptable_values}") + raise ValueError( + f"Invalid value at {path}: {data}. 
Allowed: {acceptable_values}" + ) max_length = schema.get("maxLength") if max_length and len(data) > max_length: return data[:max_length] return data - + # Handle datetime elif isinstance(data, datetime): if schema_type == "string": @@ -370,7 +368,7 @@ def enforce_schema(data, schema, path = "root"): return data.timestamp() else: raise ValueError(f"Cannot convert datetime to {schema_type}") - + # Handle integer elif isinstance(data, int): if schema_type == "integer" or schema_type == "number": @@ -390,12 +388,14 @@ def enforce_schema(data, schema, path = "root"): return data return data + def create_valid_json_or_fail_with_error(data, schema): data = serialize_datetime(data) data = filter_null_fields(data) data = enforce_schema(data, schema) return json.dumps(data) + # ================================================================================ # HTTP Layer # ================================================================================ @@ -403,6 +403,7 @@ def create_valid_json_or_fail_with_error(data, schema): # Global session for connection pooling session: Optional[requests.Session] = None + class HTTPClient: """ HTTP client for batched POST requests using a persistent session. @@ -413,7 +414,9 @@ class HTTPClient: - payload (binary data): Serialized request body. """ - def __init__(self, max_retry_duration_sec: int = 300, request_timeout_sec: int = 30): + def __init__( + self, max_retry_duration_sec: int = 300, request_timeout_sec: int = 30 + ): """ Initialize the HTTP client. @@ -424,7 +427,6 @@ def __init__(self, max_retry_duration_sec: int = 300, request_timeout_sec: int = self.max_retry_duration_sec = max_retry_duration_sec self.request_timeout_sec = request_timeout_sec - def get_session(self) -> requests.Session: """ Get the global session instance. If not present, create a new one. @@ -437,7 +439,9 @@ def get_session(self) -> requests.Session: session = requests.Session() return session - def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: bytes): + def _make_request_with_retry( + self, url: str, headers: Dict[str, str], payload: bytes + ): """ Make a POST request to the provided url. 
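The HTTPClient here compresses each payload with gzip and retries the POST with exponential backoff via tenacity. A self-contained sketch of that send pattern, assuming a placeholder endpoint and headers that are not taken from this sink's configuration:

import gzip

import requests
from tenacity import retry, stop_after_delay, wait_exponential


def post_with_retry(url: str, headers: dict, payload: bytes,
                    max_retry_duration_sec: int = 60,
                    request_timeout_sec: int = 10) -> None:
    """Gzip the payload and POST it, retrying with exponential backoff."""
    headers = {**headers, "Content-Encoding": "gzip"}
    body = gzip.compress(payload)

    @retry(stop=stop_after_delay(max_retry_duration_sec),
           wait=wait_exponential(multiplier=1, min=1, max=10),
           reraise=True)
    def _send():
        response = requests.post(url, headers=headers, data=body,
                                 timeout=request_timeout_sec)
        response.raise_for_status()

    _send()


# Hypothetical call; the endpoint is a placeholder, and the real headers would
# also carry whatever auth header the destination requires:
# post_with_retry("https://example.com/v1/logs",
#                 {"Content-Type": "application/json"},
#                 b'[{"message": "hello", "timestamp": 0}]')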
@@ -451,7 +455,7 @@ def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: b """ # Compress payload compressed_payload = gzip.compress(payload) - headers['Content-Encoding'] = 'gzip' + headers["Content-Encoding"] = "gzip" response = None try: @@ -459,19 +463,29 @@ def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: b url, headers=headers, data=compressed_payload, - timeout=self.request_timeout_sec + timeout=self.request_timeout_sec, ) response.raise_for_status() - print(f"Successfully sent request to URL: {url}, Payload: {payload.decode('utf-8')}, Response: {response.text}") + print( + f"Successfully sent request to URL: {url}, Payload: {payload.decode('utf-8')}, Response: {response.text}" + ) except Exception as e: response_text = "No response" if response is not None: try: response_text = str(response.json()) except: - response_text = response.text if hasattr(response, 'text') else "Unable to read response" - print(f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}") - raise type(e)(f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}") from e + response_text = ( + response.text + if hasattr(response, "text") + else "Unable to read response" + ) + print( + f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}" + ) + raise type(e)( + f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}" + ) from e def post(self, http_request_specs_df) -> None: """ @@ -484,22 +498,25 @@ def post(self, http_request_specs_df) -> None: for row in http_request_specs_df.collect(): try: - headers = json.loads(getattr(row, 'header', '{}')) + headers = json.loads(getattr(row, "header", "{}")) retry_wrapper = retry( stop=stop_after_delay(self.max_retry_duration_sec), wait=wait_exponential(multiplier=1, min=1, max=10), - reraise=True + reraise=True, + ) + retry_wrapper(self._make_request_with_retry)( + row.endpoint, headers, row.payloadBytes ) - retry_wrapper(self._make_request_with_retry)(row.endpoint, headers, row.payloadBytes) except Exception as e: print(f"ERROR: {str(e)}") - continue # Continue with other requests regardless of success/failure + continue # Continue with other requests regardless of success/failure # ================================================================================ # CONVERSION LAYER # ================================================================================ + class NewRelicMetricsConverter: """Converter class to convert metrics to New Relic format.""" @@ -509,7 +526,8 @@ def create_metric( metric_value: float, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """Create a New Relic Gauge metric in the proper format. 
Args: @@ -537,25 +555,31 @@ def create_metric( "type": "gauge", "value": metric_value, "timestamp": timestamp, - "attributes": attributes + "attributes": attributes, } # Enforce the schema return create_valid_json_or_fail_with_error(metric, METRICS_SCHEMA) - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for metrics.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("metrics", regexp_replace(col("metrics"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("metrics").alias("batch_metrics")) \ - .withColumn("payload", concat(lit('['), - expr("concat_ws(',', batch_metrics)"), - lit(']'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("metrics", regexp_replace(col("metrics"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("metrics").alias("batch_metrics")) + .withColumn( + "payload", + concat(lit("["), expr("concat_ws(',', batch_metrics)"), lit("]")), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) class NewRelicEventsConverter: @@ -567,7 +591,8 @@ def create_event( status: str, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """ Create a New Relic event in the proper format. 
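create_http_requests_spec above assigns a batch id with a window row number and then folds each group into one JSON-array payload. A standalone sketch of that same batching idea, assuming a toy metrics column and a batch size of 3, neither of which comes from the sink's configuration:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, concat, concat_ws, expr, lit

spark = SparkSession.builder.getOrCreate()
num_rows_per_batch = 3  # illustrative; the sink reads this value from its config

# Seven serialized metric strings -> batch ids 0,0,0,1,1,1,2 -> three payloads.
df = spark.createDataFrame([(f'{{"name": "m{i}", "value": {i}}}',) for i in range(7)],
                           ["metrics"])
batched = (
    df.withColumn(
        "batch_id",
        expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"),
    )
    .groupBy("batch_id")
    .agg(collect_list("metrics").alias("batch_metrics"))
    .withColumn("payload",
                concat(lit("["), concat_ws(",", col("batch_metrics")), lit("]")))
)
batched.select("batch_id", "payload").show(truncate=False)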
@@ -591,7 +616,7 @@ def create_event( "status": status, "message": f"Event: {title}", "source": SOURCE_NAME, - "service": SERVICE_NAME + "service": SERVICE_NAME, } event.update(tags) @@ -601,20 +626,27 @@ def create_event( # Enforce the schema return create_valid_json_or_fail_with_error(event, EVENTS_SCHEMA) - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for events.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("events", regexp_replace(col("events"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("events").alias("batch_events")) \ - .withColumn("payload", concat(lit('['), - expr("concat_ws(',', batch_events)"), - lit(']'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("events", regexp_replace(col("events"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("events").alias("batch_events")) + .withColumn( + "payload", + concat(lit("["), expr("concat_ws(',', batch_events)"), lit("]")), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) + class NewRelicLogsConverter: """Converter class to convert logs to New Relic format.""" @@ -625,7 +657,8 @@ def create_log( status: str, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """ Create a New Relic log in the proper format. 
@@ -648,7 +681,7 @@ def create_log( "timestamp": timestamp, "level": status.upper(), "service": SERVICE_NAME, - "source": SOURCE_NAME + "source": SOURCE_NAME, } # Add tag attributes @@ -661,45 +694,54 @@ def create_log( # Enforce the schema return create_valid_json_or_fail_with_error(log, LOGS_SCHEMA) - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for logs.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("logs", regexp_replace(col("logs"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("logs").alias("batch_logs")) \ - .withColumn("payload", concat(lit('['), - expr("concat_ws(',', batch_logs)"), - lit(']'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("logs", regexp_replace(col("logs"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("logs").alias("batch_logs")) + .withColumn( + "payload", + concat(lit("["), expr("concat_ws(',', batch_logs)"), lit("]")), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) + # ================================================================================ # INFERENCE LAYER # ================================================================================ + def convert_row_to_error_log(row): """Convert a row to error log format.""" params = { "title": str(getattr(row, "message", "")), "status": "error", "tags": { - "pipeline_id": getattr(row, 'pipeline_id', ''), - "pipeline_run_id": getattr(row, 'pipeline_run_id', ''), - "table_name": getattr(row, 'table_name', ''), - "flow_name": getattr(row, 'flow_name', ''), - "level": "error" + "pipeline_id": getattr(row, "pipeline_id", ""), + "pipeline_run_id": getattr(row, "pipeline_run_id", ""), + "table_name": getattr(row, "table_name", ""), + "flow_name": getattr(row, "flow_name", ""), + "level": "error", }, "timestamp": timestamp_in_unix_milliseconds(row.event_timestamp), "additional_attributes": { "pipeline_run_link": getattr(row, "pipeline_run_link", None), "error": getattr(row, "error", None), - } + }, } return _log_converter.create_log(**params) + def convert_row_to_table_metrics(row): """Convert a row to table metrics format.""" # Base tags for all metrics @@ -708,7 +750,7 @@ def convert_row_to_table_metrics(row): "pipeline_run_id": getattr(row, "pipeline_run_id", ""), "table_name": getattr(row, "table_name", ""), "flow_name": getattr(row, "flow_name", ""), - "source": SOURCE_NAME + "source": SOURCE_NAME, } # Timestamp for all metrics @@ -720,28 +762,29 @@ def convert_row_to_table_metrics(row): metric_value=getattr(row, "num_upserted_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), _metrics_converter.create_metric( metric_name="dlt.table.throughput.deleted_rows", metric_value=getattr(row, "num_deleted_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, 
timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), _metrics_converter.create_metric( metric_name="dlt.table.throughput.output_rows", metric_value=getattr(row, "num_output_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), ] + def convert_row_to_pipeline_status_event(row): """Convert a row to pipeline status event format.""" # Determine pipeline status for title - status_display = row.latest_state.upper() if row.latest_state else 'UNKNOWN' + status_display = row.latest_state.upper() if row.latest_state else "UNKNOWN" pipeline_id = getattr(row, "pipeline_id", "") params = { @@ -752,7 +795,7 @@ def convert_row_to_pipeline_status_event(row): "latest_run_id": getattr(row, "pipeline_run_id", ""), "status": status_display.lower(), "source": SOURCE_NAME, - "service": SERVICE_NAME + "service": SERVICE_NAME, }, "timestamp": timestamp_in_unix_milliseconds(row.updated_at), "additional_attributes": { @@ -761,15 +804,17 @@ def convert_row_to_pipeline_status_event(row): "is_complete": getattr(row, "is_complete", None), "running_start_time": getattr(row, "running_start_time", None), "end_time": getattr(row, "end_time", None), - "updated_at": getattr(row, "updated_at", None) , + "updated_at": getattr(row, "updated_at", None), "latest_error_log_message": getattr(row, "latest_error_log_message", None), "latest_error_message": getattr(row, "latest_error_message", None), - } + }, } return _events_converter.create_event(**params) + def convert_row_to_pipeline_metrics(row): """Convert a row to pipeline metrics format.""" + def has_attr(obj, attr): return hasattr(obj, attr) and getattr(obj, attr) is not None @@ -779,7 +824,7 @@ def has_attr(obj, attr): base_tags = { "pipeline_id": getattr(row, "pipeline_id", ""), "pipeline_run_id": getattr(row, "pipeline_run_id", ""), - "source": SOURCE_NAME + "source": SOURCE_NAME, } metrics = [] timestamp = timestamp_in_unix_milliseconds(getattr(row, "create_time", None)) @@ -788,55 +833,68 @@ def has_attr(obj, attr): # Starting seconds: queued_time - create_time starting_seconds = (row.queued_time - row.create_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.starting_seconds", - metric_value=starting_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "starting"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.starting_seconds", + metric_value=starting_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "starting"}, + timestamp=timestamp, + ) + ) # Seconds waiting for resources: initialization_start_time - queued_time if not has_attr(row, "initialization_start_time"): return metrics waiting_seconds = (row.initialization_start_time - row.queued_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.waiting_for_resources_seconds", - metric_value=waiting_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "waiting"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.waiting_for_resources_seconds", + metric_value=waiting_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "waiting"}, + timestamp=timestamp, + ) + ) # Initialization seconds: running_start_time - initialization_start_time if not has_attr(row, "running_start_time"): return metrics - initialization_seconds = (row.running_start_time - 
row.initialization_start_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.initialization_seconds", - metric_value=initialization_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "initialization"}, - timestamp=timestamp - )) + initialization_seconds = ( + row.running_start_time - row.initialization_start_time + ).total_seconds() + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.initialization_seconds", + metric_value=initialization_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "initialization"}, + timestamp=timestamp, + ) + ) # Running seconds: end_time - running_start_time running_seconds = (end_time - row.running_start_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.running_seconds", - metric_value=running_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "running"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.running_seconds", + metric_value=running_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "running"}, + timestamp=timestamp, + ) + ) # Total seconds: end_time - create_time total_seconds = (end_time - row.create_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.total_seconds", - metric_value=total_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "total"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.total_seconds", + metric_value=total_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "total"}, + timestamp=timestamp, + ) + ) return metrics + # ================================================================================ # MAIN # ================================================================================ @@ -847,26 +905,33 @@ def has_attr(obj, attr): http_client = None + + def getClient(config): """Global HTTP client getter.""" global http_client if http_client is None: http_client = HTTPClient( max_retry_duration_sec=config["max_retry_duration_sec"], - request_timeout_sec=config["request_timeout_sec"] + request_timeout_sec=config["request_timeout_sec"], ) return http_client + def register_sink_for_pipeline_events(): @dlt.foreach_batch_sink(name="send_pipeline_status_to_3p_monitoring") def send_pipeline_status_to_3p_monitoring(batch_df, batch_id): destination_format_udf = udf(convert_row_to_pipeline_status_event, StringType()) - events_df = batch_df.withColumn("events", destination_format_udf(struct("*"))).select("events").filter(col("events").isNotNull()) + events_df = ( + batch_df.withColumn("events", destination_format_udf(struct("*"))) + .select("events") + .filter(col("events").isNotNull()) + ) http_request_spec = _events_converter.create_http_requests_spec( events_df, _global_config["num_rows_per_batch"], get_newrelic_headers(_global_config["api_key"]), - _global_config["endpoints"]["events"] + _global_config["endpoints"]["events"], ) getClient(_global_config).post(http_request_spec) @@ -879,29 +944,44 @@ def register_sink_for_errors(): @dlt.foreach_batch_sink(name="send_errors_to_3p_monitoring") def send_errors_to_3p_monitoring(batch_df, batch_id): destination_format_udf = udf(convert_row_to_error_log, StringType()) - logs_df = batch_df.withColumn("logs", destination_format_udf(struct("*"))).select("logs").filter(col("logs").isNotNull()) + logs_df = ( + 
batch_df.withColumn("logs", destination_format_udf(struct("*"))) + .select("logs") + .filter(col("logs").isNotNull()) + ) http_request_spec = _log_converter.create_http_requests_spec( logs_df, _global_config["num_rows_per_batch"], get_newrelic_headers(_global_config["api_key"]), - _global_config["endpoints"]["logs"] + _global_config["endpoints"]["logs"], ) getClient(_global_config).post(http_request_spec) @dlt.append_flow(target="send_errors_to_3p_monitoring") def send_errors_to_sink(): - return spark.readStream.option("skipChangeCommits", "true").table(event_logs_bronze).filter("error IS NOT NULL OR level = 'ERROR'") + return ( + spark.readStream.option("skipChangeCommits", "true") + .table(event_logs_bronze) + .filter("error IS NOT NULL OR level = 'ERROR'") + ) + def register_sink_for_pipeline_metrics(): @dlt.foreach_batch_sink(name="send_pipeline_metrics_to_3p_monitoring") def send_pipeline_metrics_to_3p_monitoring(batch_df, batch_id): - destination_format_udf = udf(convert_row_to_pipeline_metrics, ArrayType(StringType())) - metrics_df = batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))).select(explode("metrics_array").alias("metrics")).filter(col("metrics").isNotNull()) + destination_format_udf = udf( + convert_row_to_pipeline_metrics, ArrayType(StringType()) + ) + metrics_df = ( + batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))) + .select(explode("metrics_array").alias("metrics")) + .filter(col("metrics").isNotNull()) + ) http_request_spec = _metrics_converter.create_http_requests_spec( metrics_df, _global_config["num_rows_per_batch"], get_newrelic_headers(_global_config["api_key"]), - _global_config["endpoints"]["metrics"] + _global_config["endpoints"]["metrics"], ) getClient(_global_config).post(http_request_spec) @@ -909,23 +989,34 @@ def send_pipeline_metrics_to_3p_monitoring(batch_df, batch_id): def send_pipeline_metrics_to_sink(): return spark.readStream.table(f"{pipeline_runs_status}_cdf") + def register_sink_for_table_metrics(): @dlt.foreach_batch_sink(name="send_table_metrics_to_3p_monitoring") def send_table_metrics_to_3p_monitoring(batch_df, batch_id): - destination_format_udf = udf(convert_row_to_table_metrics, ArrayType(StringType())) - metrics_df = batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))).select(explode("metrics_array").alias("metrics")).filter(col("metrics").isNotNull()) + destination_format_udf = udf( + convert_row_to_table_metrics, ArrayType(StringType()) + ) + metrics_df = ( + batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))) + .select(explode("metrics_array").alias("metrics")) + .filter(col("metrics").isNotNull()) + ) http_request_spec = _metrics_converter.create_http_requests_spec( metrics_df, _global_config["num_rows_per_batch"], get_newrelic_headers(_global_config["api_key"]), - _global_config["endpoints"]["metrics"] + _global_config["endpoints"]["metrics"], ) getClient(_global_config).post(http_request_spec) @dlt.append_flow(target="send_table_metrics_to_3p_monitoring") def send_table_metrics_to_sink(): - return spark.readStream.option("skipChangeCommits", "true").table(event_logs_bronze) \ - .filter("table_name is not null AND details:flow_progress.metrics is not null AND event_type = 'flow_progress'") \ + return ( + spark.readStream.option("skipChangeCommits", "true") + .table(event_logs_bronze) + .filter( + "table_name is not null AND details:flow_progress.metrics is not null AND event_type = 'flow_progress'" + ) .selectExpr( "pipeline_id", 
"pipeline_run_id", @@ -934,9 +1025,13 @@ def send_table_metrics_to_sink(): "event_timestamp", "details:flow_progress.metrics.num_upserted_rows::bigint as num_upserted_rows", "details:flow_progress.metrics.num_deleted_rows::bigint as num_deleted_rows", - "(details:flow_progress.metrics.num_upserted_rows::bigint + details:flow_progress.metrics.num_deleted_rows::bigint) as num_output_rows" - ) \ - .filter("num_upserted_rows is not null OR num_deleted_rows is not null OR num_output_rows is not null") + "(details:flow_progress.metrics.num_upserted_rows::bigint + details:flow_progress.metrics.num_deleted_rows::bigint) as num_output_rows", + ) + .filter( + "num_upserted_rows is not null OR num_deleted_rows is not null OR num_output_rows is not null" + ) + ) + # ================================================================================ # MAIN INITIALIZATION @@ -948,4 +1043,4 @@ def send_table_metrics_to_sink(): register_sink_for_errors() register_sink_for_pipeline_events() register_sink_for_table_metrics() - register_sink_for_pipeline_metrics() \ No newline at end of file + register_sink_for_pipeline_metrics() diff --git a/contrib/databricks_ingestion_monitoring/common/third_party_sinks/splunk_observability_sink.py b/contrib/databricks_ingestion_monitoring/common/third_party_sinks/splunk_observability_sink.py index 8f7a943..579d9e5 100644 --- a/contrib/databricks_ingestion_monitoring/common/third_party_sinks/splunk_observability_sink.py +++ b/contrib/databricks_ingestion_monitoring/common/third_party_sinks/splunk_observability_sink.py @@ -12,7 +12,17 @@ from datetime import datetime, timezone from pyspark.sql import SparkSession from pyspark.sql.types import StringType, ArrayType -from pyspark.sql.functions import lit, col, collect_list, concat, expr, udf, struct, explode, regexp_replace +from pyspark.sql.functions import ( + lit, + col, + collect_list, + concat, + expr, + udf, + struct, + explode, + regexp_replace, +) import dlt # Global Configuration. @@ -32,68 +42,57 @@ "type": "object", "required": ["metric", "value", "timestamp"], "properties": { - "metric": { - "type": "string", - "description": "The name of the metric." - }, - "value": { - "type": "number", - "description": "The numeric value for the metric." - }, + "metric": {"type": "string", "description": "The name of the metric."}, + "value": {"type": "number", "description": "The numeric value for the metric."}, "timestamp": { "type": "integer", - "description": "Unix timestamp in milliseconds." + "description": "Unix timestamp in milliseconds.", }, "dimensions": { "type": "object", "description": "Key-value pairs for metric dimensions/tags.", - "additionalProperties": { - "type": "string" - } - } - } + "additionalProperties": {"type": "string"}, + }, + }, } EVENTS_SCHEMA = { "type": "object", "required": ["eventType", "category", "timestamp"], "properties": { - "eventType": { - "type": "string", - "description": "The type of event." - }, + "eventType": {"type": "string", "description": "The type of event."}, "category": { "type": "string", "enum": ["USER_DEFINED", "ALERT", "AUDIT", "JOB"], - "description": "The category of the event." + "description": "The category of the event.", }, "timestamp": { "type": "integer", - "description": "Unix timestamp in milliseconds." 
+ "description": "Unix timestamp in milliseconds.", }, "dimensions": { "type": "object", "description": "Key-value pairs for event dimensions.", - "additionalProperties": { - "type": "string" - } + "additionalProperties": {"type": "string"}, }, "properties": { "type": "object", "description": "Additional event properties.", - "additionalProperties": True - } - } + "additionalProperties": True, + }, + }, } # ================================================================================ # UTILITIES # ================================================================================ + def get_signalfx_headers(access_token: str): """Get headers for the SignalFx/Splunk Observability API.""" return {"Content-Type": "application/json", "X-SF-TOKEN": access_token} + def initialize_global_config(spark_conf): """Initialize global configuration from Spark configuration.""" global _global_config, _log_converter, _events_converter, _metrics_converter @@ -103,12 +102,14 @@ def initialize_global_config(spark_conf): _events_converter = SplunkEventsConverter() _metrics_converter = SplunkMetricsConverter() + def getParam(spark_conf, key: str, default=None): value = spark_conf.get(key, default) if value == "" or value is None: return None return value + def getThirdPartySinkConfigFromSparkConfig(spark_conf): """ Extract and merge configuration from Spark configuration and secret scope. @@ -137,7 +138,7 @@ def getThirdPartySinkConfigFromSparkConfig(spark_conf): "max_retry_duration_sec": int(spark_conf.get("max_retry_duration_sec", "300")), "request_timeout_sec": int(spark_conf.get("request_timeout_sec", "30")), "splunk_access_token": getParam(spark_conf, "splunk_access_token"), - "host_name": getParam(spark_conf, "host_name") + "host_name": getParam(spark_conf, "host_name"), } # Merge secrets from a scope if scope is provided. 
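In the configuration helper above, the step after the merge comment lets secret-scope values take precedence over Spark-conf values and then fails fast if the access token is still missing. A minimal, self-contained sketch of that precedence and check; the inputs are hypothetical, and the mention of dbutils.secrets.get is only an assumption about where such a secrets dict would typically come from on Databricks:

def merge_and_validate(conf_params: dict, secrets: dict) -> dict:
    """Secret values override Spark-conf values; fail fast if the token is missing."""
    merged = {**conf_params, **secrets}
    if merged.get("splunk_access_token") is None:
        raise ValueError("Splunk access token is required")
    return merged


# Hypothetical inputs; on Databricks the secrets dict would usually be assembled
# from dbutils.secrets.get(scope=..., key=...) lookups.
conf_params = {"splunk_access_token": None, "request_timeout_sec": 30}
merged = merge_and_validate(conf_params, {"splunk_access_token": "token-from-scope"})
assert merged["splunk_access_token"] == "token-from-scope"
assert merged["request_timeout_sec"] == 30  # keys absent from secrets keep conf values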
@@ -150,8 +151,10 @@ def getThirdPartySinkConfigFromSparkConfig(spark_conf): common_params.update(secrets) # Validate required credentials - if common_params['splunk_access_token'] is None: - raise ValueError(f"Splunk access token is required for {destination} destination") + if common_params["splunk_access_token"] is None: + raise ValueError( + f"Splunk access token is required for {destination} destination" + ) # Get endpoints (allow override) metrics_endpoint = getParam(spark_conf, "endpoints.metrics") @@ -160,7 +163,7 @@ def getThirdPartySinkConfigFromSparkConfig(spark_conf): # Auto-generate endpoints if not provided if not all([metrics_endpoint, logs_endpoint, events_endpoint]): - if common_params['host_name'] is None: + if common_params["host_name"] is None: raise ValueError( "Either 'host_name' must be provided to auto-generate SignalFx endpoints, " "or all three endpoints (endpoints.metrics, endpoints.logs, endpoints.events) " @@ -202,36 +205,37 @@ def unix_to_iso(timestamp: int) -> str: dt = datetime.fromtimestamp(ts, tz=timezone.utc) return dt.isoformat().replace("+00:00", "Z") + def timestamp_in_unix_milliseconds(timestamp) -> int: """Convert datetime to Unix timestamp in milliseconds.""" if isinstance(timestamp, datetime): return int(timestamp.timestamp() * 1000) return int(timestamp) + def timestamp_in_unix_seconds(timestamp) -> float: """Convert datetime to Unix timestamp in seconds.""" if isinstance(timestamp, datetime): return timestamp.timestamp() return float(timestamp) / 1000.0 + def get_status(status_display: str) -> str: """Map pipeline status to appropriate status level.""" status_lower = status_display.lower() - if status_lower in ['failed', 'error']: - return 'error' - elif status_lower in ['running', 'starting']: - return 'info' - elif status_lower in ['completed', 'success']: - return 'ok' + if status_lower in ["failed", "error"]: + return "error" + elif status_lower in ["running", "starting"]: + return "info" + elif status_lower in ["completed", "success"]: + return "ok" else: - return 'warn' + return "warn" + def serialize_datetime(data): if isinstance(data, dict): - return { - key: serialize_datetime(value) - for key, value in data.items() - } + return {key: serialize_datetime(value) for key, value in data.items()} elif isinstance(data, list): return [serialize_datetime(item) for item in data] elif isinstance(data, datetime): @@ -239,6 +243,7 @@ def serialize_datetime(data): else: return data + def filter_null_fields(data): if isinstance(data, dict): return { @@ -251,12 +256,12 @@ def filter_null_fields(data): else: return data -def enforce_schema(data, schema, path = "root"): + +def enforce_schema(data, schema, path="root"): # Nothing to enforce. 
if schema is None or data is None: return data - schema_type = schema.get("type") if not schema_type: raise ValueError(f"Failed to get type of the object at {path}.") @@ -304,7 +309,9 @@ def enforce_schema(data, schema, path = "root"): f"Additional property '{k}' at {path} does not match any oneOf schema" ) else: - data[k] = enforce_schema(v, additional_properties, f"{path}.{k}") + data[k] = enforce_schema( + v, additional_properties, f"{path}.{k}" + ) return data @@ -313,7 +320,10 @@ def enforce_schema(data, schema, path = "root"): if schema_type != "array": raise ValueError(f"Expected array at {path}, got {type(data).__name__}") items_schema = schema.get("items", {}) - return [enforce_schema(item, items_schema, f"{path}[{i}]") for i, item in enumerate(data)] + return [ + enforce_schema(item, items_schema, f"{path}[{i}]") + for i, item in enumerate(data) + ] # Handle string elif isinstance(data, str): @@ -321,7 +331,9 @@ def enforce_schema(data, schema, path = "root"): raise ValueError(f"Expected string at {path}, got {type(data).__name__}") acceptable_values = schema.get("enum", []) if acceptable_values and data not in acceptable_values: - raise ValueError(f"Invalid value at {path}: {data}. Allowed: {acceptable_values}") + raise ValueError( + f"Invalid value at {path}: {data}. Allowed: {acceptable_values}" + ) max_length = schema.get("maxLength") if max_length and len(data) > max_length: return data[:max_length] @@ -357,12 +369,14 @@ def enforce_schema(data, schema, path = "root"): return data return data + def create_valid_json_or_fail_with_error(data, schema): data = serialize_datetime(data) data = filter_null_fields(data) data = enforce_schema(data, schema) return json.dumps(data) + # ================================================================================ # HTTP Layer # ================================================================================ @@ -370,6 +384,7 @@ def create_valid_json_or_fail_with_error(data, schema): # Global session for connection pooling session: Optional[requests.Session] = None + class HTTPClient: """ HTTP client for batched POST requests using a persistent session. @@ -380,7 +395,9 @@ class HTTPClient: - payload (binary data): Serialized request body. """ - def __init__(self, max_retry_duration_sec: int = 300, request_timeout_sec: int = 30): + def __init__( + self, max_retry_duration_sec: int = 300, request_timeout_sec: int = 30 + ): """ Initialize the HTTP client. @@ -391,7 +408,6 @@ def __init__(self, max_retry_duration_sec: int = 300, request_timeout_sec: int = self.max_retry_duration_sec = max_retry_duration_sec self.request_timeout_sec = request_timeout_sec - def get_session(self) -> requests.Session: """ Get the global session instance. If not present, create a new one. @@ -404,7 +420,9 @@ def get_session(self) -> requests.Session: session = requests.Session() return session - def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: bytes): + def _make_request_with_retry( + self, url: str, headers: Dict[str, str], payload: bytes + ): """ Make a POST request to the provided url. 
@@ -418,7 +436,7 @@ def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: b """ # Compress payload compressed_payload = gzip.compress(payload) - headers['Content-Encoding'] = 'gzip' + headers["Content-Encoding"] = "gzip" response = None try: @@ -426,19 +444,29 @@ def _make_request_with_retry(self, url: str, headers: Dict[str, str], payload: b url, headers=headers, data=compressed_payload, - timeout=self.request_timeout_sec + timeout=self.request_timeout_sec, ) response.raise_for_status() - print(f"Successfully sent request to URL: {url}, Payload: {payload.decode('utf-8')}, Response: {response.text}") + print( + f"Successfully sent request to URL: {url}, Payload: {payload.decode('utf-8')}, Response: {response.text}" + ) except Exception as e: response_text = "No response" if response is not None: try: response_text = str(response.json()) except: - response_text = response.text if hasattr(response, 'text') else "Unable to read response" - print(f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}") - raise type(e)(f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}") from e + response_text = ( + response.text + if hasattr(response, "text") + else "Unable to read response" + ) + print( + f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}" + ) + raise type(e)( + f"Request failed for URL: {url}, headers: {str(headers)}, Payload: {payload.decode('utf-8')}, Error: {str(e)}, Response: {response_text}" + ) from e def post(self, http_request_specs_df) -> None: """ @@ -451,22 +479,25 @@ def post(self, http_request_specs_df) -> None: for row in http_request_specs_df.collect(): try: - headers = json.loads(getattr(row, 'header', '{}')) + headers = json.loads(getattr(row, "header", "{}")) retry_wrapper = retry( stop=stop_after_delay(self.max_retry_duration_sec), wait=wait_exponential(multiplier=1, min=1, max=10), - reraise=True + reraise=True, + ) + retry_wrapper(self._make_request_with_retry)( + row.endpoint, headers, row.payloadBytes ) - retry_wrapper(self._make_request_with_retry)(row.endpoint, headers, row.payloadBytes) except Exception as e: print(f"ERROR: {str(e)}") - continue # Continue with other requests regardless of success/failure + continue # Continue with other requests regardless of success/failure # ================================================================================ # CONVERSION LAYER # ================================================================================ + class SplunkMetricsConverter: """Converter class to convert metrics to Splunk Observability format.""" @@ -476,7 +507,8 @@ def create_metric( metric_value: float, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """Create a Splunk Observability metric in the proper format. 
Args: @@ -502,39 +534,47 @@ def create_metric( "metric": metric_name, "value": metric_value, "timestamp": timestamp, - "dimensions": dimensions + "dimensions": dimensions, } # Enforce the schema return create_valid_json_or_fail_with_error(metric, METRICS_SCHEMA) - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for metrics.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("metrics", regexp_replace(col("metrics"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("metrics").alias("batch_metrics")) \ - .withColumn("payload", concat(lit('{"gauge": ['), - expr("concat_ws(',', batch_metrics)"), - lit(']}'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("metrics", regexp_replace(col("metrics"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("metrics").alias("batch_metrics")) + .withColumn( + "payload", + concat( + lit('{"gauge": ['), expr("concat_ws(',', batch_metrics)"), lit("]}") + ), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) class SplunkEventsConverter: """Converter class to convert events to Splunk Observability format.""" - def create_event( self, title: str, status: str, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """ Create a Splunk Observability event in the proper format. 
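Unlike the New Relic converters, which post a bare JSON array, the Splunk metrics payload above is wrapped in a gauge envelope. A tiny pure-Python illustration of the envelope that the concat expression builds; the two datapoints are made up:

import json

# Two made-up datapoints, serialized the way create_metric returns them.
serialized = [
    json.dumps({"metric": "pipeline.run.total_seconds", "value": 42.5,
                "timestamp": 1734620400000, "dimensions": {"pipeline_id": "p1"}}),
    json.dumps({"metric": "dlt.table.throughput.output_rows", "value": 128,
                "timestamp": 1734620400000, "dimensions": {"table_name": "t1"}}),
]
payload = '{"gauge": [' + ",".join(serialized) + "]}"
print(payload)  # same envelope shape the converter above produces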
@@ -559,8 +599,8 @@ def create_event( "properties": { "status": status, "source": SOURCE_NAME, - "service": SERVICE_NAME - } + "service": SERVICE_NAME, + }, } # Add additional attributes if provided @@ -570,20 +610,27 @@ def create_event( # Enforce the schema return create_valid_json_or_fail_with_error(event, EVENTS_SCHEMA) - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for events.""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("events", regexp_replace(col("events"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("events").alias("batch_events")) \ - .withColumn("payload", concat(lit('['), - expr("concat_ws(',', batch_events)"), - lit(']'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("events", regexp_replace(col("events"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("events").alias("batch_events")) + .withColumn( + "payload", + concat(lit("["), expr("concat_ws(',', batch_events)"), lit("]")), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) + class SplunkLogsConverter: """Converter class to convert logs to Splunk Observability events format.""" @@ -594,7 +641,8 @@ def create_log( status: str, tags: Dict[str, str], timestamp: int, - additional_attributes: Optional[Dict[str, Any]] = None) -> str: + additional_attributes: Optional[Dict[str, Any]] = None, + ) -> str: """ Create a Splunk Observability event for log data. 
@@ -621,8 +669,8 @@ def create_log( "properties": { "status": status, "source": SOURCE_NAME, - "service": SERVICE_NAME - } + "service": SERVICE_NAME, + }, } # Add additional attributes if provided @@ -632,46 +680,54 @@ def create_log( # Enforce the schema return create_valid_json_or_fail_with_error(event, EVENTS_SCHEMA) - def create_http_requests_spec(self, df, num_rows_per_batch: int, headers: dict, endpoint: str): + def create_http_requests_spec( + self, df, num_rows_per_batch: int, headers: dict, endpoint: str + ): """Create HTTP request spec DataFrame for logs (sent as events).""" - df_with_batch_id = df.withColumn("batch_id", - expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})")) \ - .withColumn("logs", regexp_replace(col("logs"), "\n", "")) - return df_with_batch_id.groupBy("batch_id") \ - .agg(collect_list("logs").alias("batch_logs")) \ - .withColumn("payload", concat(lit('['), - expr("concat_ws(',', batch_logs)"), - lit(']'))) \ - .withColumn("payloadBytes", col("payload").cast("binary")) \ - .withColumn("endpoint", lit(endpoint)) \ - .withColumn("header", lit(json.dumps(headers))) \ + df_with_batch_id = df.withColumn( + "batch_id", + expr(f"int((row_number() over (order by 1) - 1) / {num_rows_per_batch})"), + ).withColumn("logs", regexp_replace(col("logs"), "\n", "")) + return ( + df_with_batch_id.groupBy("batch_id") + .agg(collect_list("logs").alias("batch_logs")) + .withColumn( + "payload", + concat(lit("["), expr("concat_ws(',', batch_logs)"), lit("]")), + ) + .withColumn("payloadBytes", col("payload").cast("binary")) + .withColumn("endpoint", lit(endpoint)) + .withColumn("header", lit(json.dumps(headers))) .select("endpoint", "header", "payloadBytes") + ) # ================================================================================ # INFERENCE LAYER # ================================================================================ + def convert_row_to_error_log(row): """Convert a row to error log format.""" params = { "title": str(getattr(row, "message", "")), "status": "error", "tags": { - "pipeline_id": getattr(row, 'pipeline_id', ''), - "pipeline_run_id": getattr(row, 'pipeline_run_id', ''), - "table_name": getattr(row, 'table_name', ''), - "flow_name": getattr(row, 'flow_name', ''), - "level": "error" + "pipeline_id": getattr(row, "pipeline_id", ""), + "pipeline_run_id": getattr(row, "pipeline_run_id", ""), + "table_name": getattr(row, "table_name", ""), + "flow_name": getattr(row, "flow_name", ""), + "level": "error", }, "timestamp": timestamp_in_unix_milliseconds(row.event_timestamp), "additional_attributes": { "pipeline_run_link": getattr(row, "pipeline_run_link", None), "error": getattr(row, "error", None), - } + }, } return _log_converter.create_log(**params) + def convert_row_to_table_metrics(row): """Convert a row to table metrics format.""" # Base tags for all metrics @@ -680,7 +736,7 @@ def convert_row_to_table_metrics(row): "pipeline_run_id": getattr(row, "pipeline_run_id", ""), "table_name": getattr(row, "table_name", ""), "flow_name": getattr(row, "flow_name", ""), - "source": SOURCE_NAME + "source": SOURCE_NAME, } # Timestamp for all metrics @@ -692,28 +748,29 @@ def convert_row_to_table_metrics(row): metric_value=getattr(row, "num_upserted_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), _metrics_converter.create_metric( metric_name="dlt.table.throughput.deleted_rows", metric_value=getattr(row, "num_deleted_rows", 0) or 0, tags={**base_tags, 
"metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), _metrics_converter.create_metric( metric_name="dlt.table.throughput.output_rows", metric_value=getattr(row, "num_output_rows", 0) or 0, tags={**base_tags, "metric_type": "count"}, timestamp=timestamp, - additional_attributes={} + additional_attributes={}, ), ] + def convert_row_to_pipeline_status_event(row): """Convert a row to pipeline status event format.""" # Determine pipeline status for title - status_display = row.latest_state.upper() if row.latest_state else 'UNKNOWN' + status_display = row.latest_state.upper() if row.latest_state else "UNKNOWN" pipeline_id = getattr(row, "pipeline_id", "") params = { @@ -724,7 +781,7 @@ def convert_row_to_pipeline_status_event(row): "latest_run_id": getattr(row, "pipeline_run_id", ""), "status": status_display.lower(), "source": SOURCE_NAME, - "service": SERVICE_NAME + "service": SERVICE_NAME, }, "timestamp": timestamp_in_unix_milliseconds(row.updated_at), "additional_attributes": { @@ -733,15 +790,17 @@ def convert_row_to_pipeline_status_event(row): "is_complete": getattr(row, "is_complete", None), "running_start_time": getattr(row, "running_start_time", None), "end_time": getattr(row, "end_time", None), - "updated_at": getattr(row, "updated_at", None) , + "updated_at": getattr(row, "updated_at", None), "latest_error_log_message": getattr(row, "latest_error_log_message", None), "latest_error_message": getattr(row, "latest_error_message", None), - } + }, } return _events_converter.create_event(**params) + def convert_row_to_pipeline_metrics(row): """Convert a row to pipeline metrics format.""" + def has_attr(obj, attr): return hasattr(obj, attr) and getattr(obj, attr) is not None @@ -751,7 +810,7 @@ def has_attr(obj, attr): base_tags = { "pipeline_id": getattr(row, "pipeline_id", ""), "pipeline_run_id": getattr(row, "pipeline_run_id", ""), - "source": SOURCE_NAME + "source": SOURCE_NAME, } metrics = [] timestamp = timestamp_in_unix_milliseconds(getattr(row, "create_time", None)) @@ -760,55 +819,68 @@ def has_attr(obj, attr): # Starting seconds: queued_time - create_time starting_seconds = (row.queued_time - row.create_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.starting_seconds", - metric_value=starting_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "starting"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.starting_seconds", + metric_value=starting_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "starting"}, + timestamp=timestamp, + ) + ) # Seconds waiting for resources: initialization_start_time - queued_time if not has_attr(row, "initialization_start_time"): return metrics waiting_seconds = (row.initialization_start_time - row.queued_time).total_seconds() - metrics.append(_metrics_converter.create_metric( - metric_name="pipeline.run.waiting_for_resources_seconds", - metric_value=waiting_seconds, - tags={**base_tags, "metric_type": "duration", "phase": "waiting"}, - timestamp=timestamp - )) + metrics.append( + _metrics_converter.create_metric( + metric_name="pipeline.run.waiting_for_resources_seconds", + metric_value=waiting_seconds, + tags={**base_tags, "metric_type": "duration", "phase": "waiting"}, + timestamp=timestamp, + ) + ) # Initialization seconds: running_start_time - initialization_start_time if not has_attr(row, "running_start_time"): return metrics - initialization_seconds = 
-    metrics.append(_metrics_converter.create_metric(
-        metric_name="pipeline.run.initialization_seconds",
-        metric_value=initialization_seconds,
-        tags={**base_tags, "metric_type": "duration", "phase": "initialization"},
-        timestamp=timestamp
-    ))
+    initialization_seconds = (
+        row.running_start_time - row.initialization_start_time
+    ).total_seconds()
+    metrics.append(
+        _metrics_converter.create_metric(
+            metric_name="pipeline.run.initialization_seconds",
+            metric_value=initialization_seconds,
+            tags={**base_tags, "metric_type": "duration", "phase": "initialization"},
+            timestamp=timestamp,
+        )
+    )
 
     # Running seconds: end_time - running_start_time
     running_seconds = (end_time - row.running_start_time).total_seconds()
-    metrics.append(_metrics_converter.create_metric(
-        metric_name="pipeline.run.running_seconds",
-        metric_value=running_seconds,
-        tags={**base_tags, "metric_type": "duration", "phase": "running"},
-        timestamp=timestamp
-    ))
+    metrics.append(
+        _metrics_converter.create_metric(
+            metric_name="pipeline.run.running_seconds",
+            metric_value=running_seconds,
+            tags={**base_tags, "metric_type": "duration", "phase": "running"},
+            timestamp=timestamp,
+        )
+    )
 
     # Total seconds: end_time - create_time
     total_seconds = (end_time - row.create_time).total_seconds()
-    metrics.append(_metrics_converter.create_metric(
-        metric_name="pipeline.run.total_seconds",
-        metric_value=total_seconds,
-        tags={**base_tags, "metric_type": "duration", "phase": "total"},
-        timestamp=timestamp
-    ))
+    metrics.append(
+        _metrics_converter.create_metric(
+            metric_name="pipeline.run.total_seconds",
+            metric_value=total_seconds,
+            tags={**base_tags, "metric_type": "duration", "phase": "total"},
+            timestamp=timestamp,
+        )
+    )
 
     return metrics
 
+
 # ================================================================================
 # MAIN
 # ================================================================================
@@ -819,26 +891,33 @@ def has_attr(obj, attr):
 
 
 http_client = None
+
+
 def getClient(config):
     """Global HTTP client getter."""
     global http_client
     if http_client is None:
         http_client = HTTPClient(
             max_retry_duration_sec=config["max_retry_duration_sec"],
-            request_timeout_sec=config["request_timeout_sec"]
+            request_timeout_sec=config["request_timeout_sec"],
         )
     return http_client
 
+
 def register_sink_for_pipeline_events():
     @dlt.foreach_batch_sink(name="send_pipeline_status_to_3p_monitoring")
     def send_pipeline_status_to_3p_monitoring(batch_df, batch_id):
         destination_format_udf = udf(convert_row_to_pipeline_status_event, StringType())
-        events_df = batch_df.withColumn("events", destination_format_udf(struct("*"))).select("events").filter(col("events").isNotNull())
+        events_df = (
+            batch_df.withColumn("events", destination_format_udf(struct("*")))
+            .select("events")
+            .filter(col("events").isNotNull())
+        )
         http_request_spec = _events_converter.create_http_requests_spec(
             events_df,
             _global_config["num_rows_per_batch"],
             get_signalfx_headers(_global_config["splunk_access_token"]),
-            _global_config["endpoints"]["events"]
+            _global_config["endpoints"]["events"],
         )
         getClient(_global_config).post(http_request_spec)
 
@@ -851,30 +930,45 @@ def register_sink_for_errors():
     @dlt.foreach_batch_sink(name="send_errors_to_3p_monitoring")
     def send_errors_to_3p_monitoring(batch_df, batch_id):
         destination_format_udf = udf(convert_row_to_error_log, StringType())
-        logs_df = batch_df.withColumn("logs", destination_format_udf(struct("*"))).select("logs").filter(col("logs").isNotNull())
+        logs_df = (
+            batch_df.withColumn("logs", destination_format_udf(struct("*")))
+            .select("logs")
+            .filter(col("logs").isNotNull())
+        )
         http_request_spec = _log_converter.create_http_requests_spec(
             logs_df,
             _global_config["num_rows_per_batch"],
             get_signalfx_headers(_global_config["splunk_access_token"]),
-            _global_config["endpoints"]["logs"]
+            _global_config["endpoints"]["logs"],
         )
         getClient(_global_config).post(http_request_spec)
 
     @dlt.append_flow(target="send_errors_to_3p_monitoring")
     def send_errors_to_sink():
-        return spark.readStream.option("skipChangeCommits", "true").table(event_logs_bronze).filter("error IS NOT NULL OR level = 'ERROR'")
+        return (
+            spark.readStream.option("skipChangeCommits", "true")
+            .table(event_logs_bronze)
+            .filter("error IS NOT NULL OR level = 'ERROR'")
+        )
+
 
 def register_sink_for_pipeline_metrics():
     @dlt.foreach_batch_sink(name="send_pipeline_metrics_to_3p_monitoring")
     def send_pipeline_metrics_to_3p_monitoring(batch_df, batch_id):
         # DataFrame conversion logic
-        destination_format_udf = udf(convert_row_to_pipeline_metrics, ArrayType(StringType()))
-        metrics_df = batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))).select(explode("metrics_array").alias("metrics")).filter(col("metrics").isNotNull())
+        destination_format_udf = udf(
+            convert_row_to_pipeline_metrics, ArrayType(StringType())
+        )
+        metrics_df = (
+            batch_df.withColumn("metrics_array", destination_format_udf(struct("*")))
+            .select(explode("metrics_array").alias("metrics"))
+            .filter(col("metrics").isNotNull())
+        )
         http_request_spec = _metrics_converter.create_http_requests_spec(
             metrics_df,
             _global_config["num_rows_per_batch"],
             get_signalfx_headers(_global_config["splunk_access_token"]),
-            _global_config["endpoints"]["metrics"]
+            _global_config["endpoints"]["metrics"],
         )
         getClient(_global_config).post(http_request_spec)
 
@@ -882,23 +976,34 @@ def send_pipeline_metrics_to_3p_monitoring(batch_df, batch_id):
     def send_pipeline_metrics_to_sink():
         return spark.readStream.table(f"{pipeline_runs_status}_cdf")
 
+
 def register_sink_for_table_metrics():
     @dlt.foreach_batch_sink(name="send_table_metrics_to_3p_monitoring")
     def send_table_metrics_to_3p_monitoring(batch_df, batch_id):
-        destination_format_udf = udf(convert_row_to_table_metrics, ArrayType(StringType()))
-        metrics_df = batch_df.withColumn("metrics_array", destination_format_udf(struct("*"))).select(explode("metrics_array").alias("metrics")).filter(col("metrics").isNotNull())
+        destination_format_udf = udf(
+            convert_row_to_table_metrics, ArrayType(StringType())
+        )
+        metrics_df = (
+            batch_df.withColumn("metrics_array", destination_format_udf(struct("*")))
+            .select(explode("metrics_array").alias("metrics"))
+            .filter(col("metrics").isNotNull())
+        )
         http_request_spec = _metrics_converter.create_http_requests_spec(
             metrics_df,
             _global_config["num_rows_per_batch"],
             get_signalfx_headers(_global_config["splunk_access_token"]),
-            _global_config["endpoints"]["metrics"]
+            _global_config["endpoints"]["metrics"],
        )
         getClient(_global_config).post(http_request_spec)
 
     @dlt.append_flow(target="send_table_metrics_to_3p_monitoring")
     def send_table_metrics_to_sink():
-        return spark.readStream.option("skipChangeCommits", "true").table(event_logs_bronze) \
-            .filter("table_name is not null AND details:flow_progress.metrics is not null AND event_type = 'flow_progress'") \
+        return (
+            spark.readStream.option("skipChangeCommits", "true")
+            .table(event_logs_bronze)
+            .filter(
+                "table_name is not null AND details:flow_progress.metrics is not null AND event_type = 'flow_progress'"
+            )
             .selectExpr(
                 "pipeline_id",
                 "pipeline_run_id",
@@ -907,9 +1012,13 @@ def send_table_metrics_to_sink():
                 "event_timestamp",
                 "details:flow_progress.metrics.num_upserted_rows::bigint as num_upserted_rows",
                 "details:flow_progress.metrics.num_deleted_rows::bigint as num_deleted_rows",
-                "(details:flow_progress.metrics.num_upserted_rows::bigint + details:flow_progress.metrics.num_deleted_rows::bigint) as num_output_rows"
-            ) \
-            .filter("num_upserted_rows is not null OR num_deleted_rows is not null OR num_output_rows is not null")
+                "(details:flow_progress.metrics.num_upserted_rows::bigint + details:flow_progress.metrics.num_deleted_rows::bigint) as num_output_rows",
+            )
+            .filter(
+                "num_upserted_rows is not null OR num_deleted_rows is not null OR num_output_rows is not null"
+            )
+        )
 
+
 # ================================================================================
 # MAIN INITIALIZATION
diff --git a/contrib/databricks_ingestion_monitoring/generic_sdp_monitoring_dab/monitoring_etl/sdp_monitoring_pipeline_main.py b/contrib/databricks_ingestion_monitoring/generic_sdp_monitoring_dab/monitoring_etl/sdp_monitoring_pipeline_main.py
index e7a839e..e58f595 100644
--- a/contrib/databricks_ingestion_monitoring/generic_sdp_monitoring_dab/monitoring_etl/sdp_monitoring_pipeline_main.py
+++ b/contrib/databricks_ingestion_monitoring/generic_sdp_monitoring_dab/monitoring_etl/sdp_monitoring_pipeline_main.py
@@ -4,12 +4,14 @@
 
 sys.path.append("../../common/lib")
 
-from databricks_ingestion_monitoring.common_ldp import Configuration, MonitoringEtlPipeline
+from databricks_ingestion_monitoring.common_ldp import (
+    Configuration,
+    MonitoringEtlPipeline,
+)
 
 # Configure logging
 logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger(__name__)
 logger.info("Starting Generic SDP Monitoring ETL Pipeline")
@@ -18,4 +20,4 @@
 conf = Configuration(spark.conf)
 
 pipeline = MonitoringEtlPipeline(conf, spark)
-pipeline.register_base_tables_and_views(spark)
\ No newline at end of file
+pipeline.register_base_tables_and_views(spark)