From b09557029a8353110e96883eabf22ba101850acb Mon Sep 17 00:00:00 2001
From: Malte Sander
Date: Wed, 29 Oct 2025 16:45:33 +0100
Subject: [PATCH 1/9] Clean up service monitors, add a new stackable-generic
 service monitor that works with all products.

---
 .../prometheus-service-monitors.yaml          | 159 +++++++-----------
 1 file changed, 61 insertions(+), 98 deletions(-)

diff --git a/stacks/monitoring/prometheus-service-monitors.yaml b/stacks/monitoring/prometheus-service-monitors.yaml
index b5231790..0abfd472 100644
--- a/stacks/monitoring/prometheus-service-monitors.yaml
+++ b/stacks/monitoring/prometheus-service-monitors.yaml
@@ -40,10 +40,12 @@ spec:
           - airflow
           - druid
           - hive
-          - nifi # This only works for NiFi 1, NiFi 2 has a special ServiceMonitor below
+          - kafka
+          - nifi # This only works for NiFi 1, NiFi 2 works via stackable-generic
          - opa
          - superset
          - trino
+          - zookeeper
   endpoints:
     - scheme: http
       port: metrics
@@ -55,10 +57,25 @@ spec:
     - app.kubernetes.io/role-group
     - app.kubernetes.io/version
 ---
+# Utilize `prometheus.io/scheme`, `prometheus.io/port`, `prometheus.io/path` annotations set by the operators
+# to scrape all Stackable products.
+# [x] Airflow - relabel drop filter on airflow container
+# [x] Druid
+# [x] HBase
+# [X] Hadoop HDFS - relabel drop filter on empty container
+# [x] Hive
+# [~] Kafka - TODO: listener services have metrics?
+# [x] NiFi 1 + 2
+# [ ] OpenSearch
+# [x] Spark: Connect, HistoryServer
+# [x] Superset - relabel drop filter on superset container
+# [x] Trino
+# [x] ZooKeeper
+# [x] OPA
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
-  name: stackable-native-metrics
+  name: stackable-generic
   labels:
     stackable.tech/vendor: Stackable
     release: prometheus
@@ -69,46 +86,49 @@ spec:
     matchLabels:
       stackable.tech/vendor: Stackable
       prometheus.io/scrape: "true"
-    matchExpressions:
-      - key: app.kubernetes.io/name
-        operator: In
-        values:
-          - zookeeper
   endpoints:
-    - scheme: http
-      port: native-metrics
-      path: /metrics
-  podTargetLabels:
-    - app.kubernetes.io/name
-    - app.kubernetes.io/instance
-    - app.kubernetes.io/component
-    - app.kubernetes.io/role-group
-    - app.kubernetes.io/version
----
-# Kafka is special in that the operator totally messes up services:
-# 1. The metrics Service is missing
-# 2. The role level simple-kafka-broker-default has the prometheus.io/scrape label, but exposes no ports...
-# 3. The role level simple-kafka-broker-default is labeled with app.kubernetes.io/name: listener???
-# So we have a dedicated config for it
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: stackable-kafka
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      stackable.tech/vendor: Stackable
-      app.kubernetes.io/name: listener # Dafuq?
-      app.kubernetes.io/component: broker # We need to filter on brokers instead, as the app.kubernetes.io/name is messed up
-  endpoints:
-    - scheme: http
-      port: metrics
-      path: /metrics
+    - relabelings:
+        - sourceLabels:
+            - __meta_kubernetes_pod_container_name
+          # Pods show up twice due to multiple containers, we only keep the main / product container.
+          # Except for Airflow and Superset, where we chose the metrics container (otherwise scheduler, worker etc.
+          # which only have the metrics container are not getting picked up).
+ # - airflow: airflow + # - superset: superset + # - empty: filter when container label does not exist: hdfs + regex: ^(airflow|superset|)$ + action: drop + - sourceLabels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + targetLabel: __scheme__ + regex: (https?) + - sourceLabels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + targetLabel: __metrics_path__ + regex: (.+) + - sourceLabels: + - __meta_kubernetes_service_name + - __meta_kubernetes_namespace + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + targetLabel: __address__ + regex: (.+);(.+);(\d+) + # TODO: We could set the cluster domain via annotation as well and pick it up here. + replacement: $1.$2.svc.cluster.local:$3 + tlsConfig: + ca: + secret: + name: prometheus-tls-certificate + key: ca.crt + cert: + secret: + name: prometheus-tls-certificate + key: tls.crt + keySecret: + name: prometheus-tls-certificate + key: tls.key podTargetLabels: - app.kubernetes.io/name - app.kubernetes.io/instance @@ -219,63 +239,6 @@ spec: - app.kubernetes.io/role-group - app.kubernetes.io/version --- -# NiFI 2 is a beast on it's own... -# We need to use mTLS (otherwise we get a 401) and can not use the PodIP -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: stackable-nifi-2 - labels: - stackable.tech/vendor: Stackable - release: prometheus -spec: - namespaceSelector: - any: true - selector: - matchLabels: - stackable.tech/vendor: Stackable - prometheus.io/scrape: "true" - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: - - nifi - endpoints: - - scheme: https - port: https - path: /nifi-api/flow/metrics/prometheus - # See https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#monitoring.coreos.com/v1.TLSConfig - tlsConfig: - ca: - secret: - name: prometheus-tls-certificate - key: ca.crt - cert: - secret: - name: prometheus-tls-certificate - key: tls.crt - keySecret: - name: prometheus-tls-certificate - key: tls.key - # We need to talk to the Pod via the FQDN of the Pod because of the stupid SNI check of NiFi. - # We can not use the typical PodIP, as it is not contained in the NiFi certificate, - # see https://github.com/stackabletech/secret-operator/issues/620 - relabelings: - - sourceLabels: - - __meta_kubernetes_pod_name - - __meta_kubernetes_service_name - - __meta_kubernetes_namespace - - __meta_kubernetes_pod_container_port_number - targetLabel: __address__ - replacement: ${1}.${2}-headless.${3}.svc.cluster.local:${4} - regex: (.+);(.+?)(?:-metrics)?;(.+);(.+) - podTargetLabels: - - app.kubernetes.io/name - - app.kubernetes.io/instance - - app.kubernetes.io/component - - app.kubernetes.io/role-group - - app.kubernetes.io/version ---- # spark-k8s-operator does not deploy any Services at all (at least for SparkApplications). # We currently only scrape the driver, going forward we might want to scrape the executors as well. # In the future we might also want to scrape SparkConnect and HistoryServers. 
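For illustration, the kind of Service the new stackable-generic ServiceMonitor consumes would look roughly like the sketch below. The service name, namespace and port value are hypothetical; the two labels are what the selector matches on, and the prometheus.io/* annotations are what the relabelings read:

  apiVersion: v1
  kind: Service
  metadata:
    name: simple-trino-coordinator-metrics  # hypothetical name
    namespace: default
    labels:
      stackable.tech/vendor: Stackable
      prometheus.io/scrape: "true"          # required by the label selector
    annotations:
      prometheus.io/scheme: http            # copied into __scheme__
      prometheus.io/path: /metrics          # copied into __metrics_path__
      prometheus.io/port: "9090"            # used to build __address__
  spec:
    ports:
      - name: metrics
        port: 9090

With these values the address relabeling would rewrite the scrape target to simple-trino-coordinator-metrics.default.svc.cluster.local:9090, while the container relabeling drops the duplicate per-container targets.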
From 51aed32faad6784d6b663e4ecaf914d0739dbe33 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Thu, 30 Oct 2025 09:47:14 +0100 Subject: [PATCH 2/9] cleanup obsolete service monitors --- .../prometheus-service-monitors.yaml | 187 +----------------- 1 file changed, 6 insertions(+), 181 deletions(-) diff --git a/stacks/monitoring/prometheus-service-monitors.yaml b/stacks/monitoring/prometheus-service-monitors.yaml index 0abfd472..135c19af 100644 --- a/stacks/monitoring/prometheus-service-monitors.yaml +++ b/stacks/monitoring/prometheus-service-monitors.yaml @@ -1,62 +1,11 @@ +--- # Use something like this to check for metrics: # count by (app_kubernetes_io_name, app_kubernetes_io_instance, pod) ({app_kubernetes_io_name!="",pod!=""}) # # Products metrics covered by the ServiceMonitors below. The list also includes whether the # ServiceMonitor scrapes native metrics or a statsd/JMX exporter. # -# [x] Airflow - exporter -# [x] Druid - native -# [x] HBase - native -# [x] Hadoop HDFS - native -# [x] Hive - exporter -# [x] Kafka - exporter -# [x] NiFi 1 - native -# [x] NiFi 2 - native -# [x] OpenSearch - native -# [ ] Spark - native - partially done, see comment on it below -# [x] Superset - exporter -# [x] Trino - native -# [x] ZooKeeper - native -# [x] OPA - native ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: stackable - labels: - stackable.tech/vendor: Stackable - release: prometheus -spec: - namespaceSelector: - any: true - selector: - matchLabels: - stackable.tech/vendor: Stackable - prometheus.io/scrape: "true" - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: - - airflow - - druid - - hive - - kafka - - nifi # This only works for NiFi 1, NiFi 2 works via stackable-generic - - opa - - superset - - trino - - zookeeper - endpoints: - - scheme: http - port: metrics - path: /metrics - podTargetLabels: - - app.kubernetes.io/name - - app.kubernetes.io/instance - - app.kubernetes.io/component - - app.kubernetes.io/role-group - - app.kubernetes.io/version ---- +# # Utilize `prometheus.io/scheme`, `prometheus.io/port`, `prometheus.io/path` annotations set by the operators # to scrape all Stackable products. # [x] Airflow - relabel drop filter on airflow container @@ -64,9 +13,9 @@ spec: # [x] HBase # [X] Hadoop HDFS - relabel drop filter on empty container # [x] Hive -# [~] Kafka - TODO: listener services have metrics? +# [x] Kafka - TODO: listener services have metrics? # [x] NiFi 1 + 2 -# [ ] OpenSearch +# [x] OpenSearch # [x] Spark: Connect, HistoryServer # [x] Superset - relabel drop filter on superset container # [x] Trino @@ -75,7 +24,7 @@ spec: apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: stackable-generic + name: stackable labels: stackable.tech/vendor: Stackable release: prometheus @@ -91,8 +40,7 @@ spec: - sourceLabels: - __meta_kubernetes_pod_container_name # Pods show up twice due to multiple containers, we only keep the main / product container. - # Except for Airflow and Superset, where we chose the metrics container (otherwise scheduler, worker etc. - # which only have the metrics container are not getting picked up). + # Except for Airflow and Superset, where we chose the metrics container. 
# - airflow: airflow # - superset: superset # - empty: filter when container label does not exist: hdfs @@ -136,109 +84,6 @@ spec: - app.kubernetes.io/role-group - app.kubernetes.io/version --- -# We prefer the native metrics over the statsd-exporter -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: stackable-hdfs - labels: - stackable.tech/vendor: Stackable - release: prometheus -spec: - namespaceSelector: - any: true - selector: - matchLabels: - stackable.tech/vendor: Stackable - prometheus.io/scrape: "true" - app.kubernetes.io/name: hdfs - endpoints: - - scheme: http - port: http # Don't use the "metrics" exporter port, we want native metrics instead - path: /prom - podTargetLabels: - - app.kubernetes.io/name - - app.kubernetes.io/instance - - app.kubernetes.io/component - - app.kubernetes.io/role-group - - app.kubernetes.io/version ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: stackable-hbase - labels: - stackable.tech/vendor: Stackable - release: prometheus -spec: - namespaceSelector: - any: true - selector: - matchLabels: - stackable.tech/vendor: Stackable - prometheus.io/scrape: "true" - app.kubernetes.io/name: hbase - endpoints: - - scheme: http - port: metrics - path: /prometheus - podTargetLabels: - - app.kubernetes.io/name - - app.kubernetes.io/instance - - app.kubernetes.io/component - - app.kubernetes.io/role-group - - app.kubernetes.io/version ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: stackable-opensearch - labels: - stackable.tech/vendor: Stackable - release: prometheus -spec: - namespaceSelector: - any: true - selector: - matchLabels: - stackable.tech/vendor: Stackable - prometheus.io/scrape: "true" - app.kubernetes.io/name: opensearch - endpoints: - - relabelings: - - sourceLabels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - action: replace - targetLabel: __scheme__ - regex: (https?) - - sourceLabels: - - __meta_kubernetes_service_annotation_prometheus_io_path - action: replace - targetLabel: __metrics_path__ - regex: (.+) - # Use the FQDN instead of the IP address because the IP address - # is not contained in the certificate. - - sourceLabels: - - __meta_kubernetes_pod_name - - __meta_kubernetes_service_name - - __meta_kubernetes_namespace - - __meta_kubernetes_service_annotation_prometheus_io_port - action: replace - targetLabel: __address__ - regex: (.+);(.+);(.+);(\d+) - replacement: $1.$2.$3.svc.cluster.local:$4 - tlsConfig: - ca: - secret: - name: prometheus-tls-certificate - key: ca.crt - podTargetLabels: - - app.kubernetes.io/name - - app.kubernetes.io/instance - - app.kubernetes.io/component - - app.kubernetes.io/role-group - - app.kubernetes.io/version ---- # spark-k8s-operator does not deploy any Services at all (at least for SparkApplications). # We currently only scrape the driver, going forward we might want to scrape the executors as well. # In the future we might also want to scrape SparkConnect and HistoryServers. @@ -271,26 +116,6 @@ spec: --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor -metadata: - name: stackable-minio-http - labels: - stackable.tech/vendor: Stackable - release: prometheus -spec: - namespaceSelector: - any: true - selector: - matchLabels: - # stackable.tech/vendor: Stackable # This is not always set, e.g. 
missing in the nifi-kafka-druid-water-level-data demo - app: minio - monitoring: "true" - endpoints: - - scheme: http - port: http - path: /minio/v2/metrics/cluster ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor metadata: name: stackable-minio-https labels: From 6d16630747232bd13dd6f150b2a8f5fe6d4b2f75 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Thu, 30 Oct 2025 09:49:17 +0100 Subject: [PATCH 3/9] add spark connect / history comment --- stacks/monitoring/prometheus-service-monitors.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stacks/monitoring/prometheus-service-monitors.yaml b/stacks/monitoring/prometheus-service-monitors.yaml index 135c19af..45d6587b 100644 --- a/stacks/monitoring/prometheus-service-monitors.yaml +++ b/stacks/monitoring/prometheus-service-monitors.yaml @@ -84,9 +84,8 @@ spec: - app.kubernetes.io/role-group - app.kubernetes.io/version --- -# spark-k8s-operator does not deploy any Services at all (at least for SparkApplications). # We currently only scrape the driver, going forward we might want to scrape the executors as well. -# In the future we might also want to scrape SparkConnect and HistoryServers. +# SparkConnect and HistoryServers are scraped via the `stackable` ServiceMonitor. apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: From 8fafbe10cb62a79a54c30e0493c1a77ed2050e1c Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Thu, 30 Oct 2025 10:53:01 +0100 Subject: [PATCH 4/9] make cluster domain configurable via annotation --- stacks/monitoring/prometheus-service-monitors.yaml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/stacks/monitoring/prometheus-service-monitors.yaml b/stacks/monitoring/prometheus-service-monitors.yaml index 45d6587b..0a33ff9d 100644 --- a/stacks/monitoring/prometheus-service-monitors.yaml +++ b/stacks/monitoring/prometheus-service-monitors.yaml @@ -40,12 +40,17 @@ spec: - sourceLabels: - __meta_kubernetes_pod_container_name # Pods show up twice due to multiple containers, we only keep the main / product container. - # Except for Airflow and Superset, where we chose the metrics container. + # Except for Airflow and Superset, where we choose the metrics container. # - airflow: airflow # - superset: superset # - empty: filter when container label does not exist: hdfs regex: ^(airflow|superset|)$ action: drop + - sourceLabels: + - __meta_kubernetes_service_annotation_prometheus_io_clusterdomain + targetLabel: __cluster_domain__ + regex: ^$ + replacement: "cluster.local" - sourceLabels: - __meta_kubernetes_service_annotation_prometheus_io_scheme action: replace @@ -59,12 +64,12 @@ spec: - sourceLabels: - __meta_kubernetes_service_name - __meta_kubernetes_namespace + - __cluster_domain__ - __meta_kubernetes_service_annotation_prometheus_io_port action: replace targetLabel: __address__ - regex: (.+);(.+);(\d+) - # TODO: We could set the cluster domain via annotation as well and pick it up here. 
-          replacement: $1.$2.svc.cluster.local:$3
+          regex: (.+);(.+);(.+);(\d+)
+          replacement: $1.$2.svc.$3:$4
       tlsConfig:
         ca:
           secret:

From c748fade96ce70fa120fc867e9fe9243eb48e3ab Mon Sep 17 00:00:00 2001
From: Malte Sander
Date: Thu, 30 Oct 2025 10:57:35 +0100
Subject: [PATCH 5/9] improve comments

---
 stacks/monitoring/prometheus-service-monitors.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/stacks/monitoring/prometheus-service-monitors.yaml b/stacks/monitoring/prometheus-service-monitors.yaml
index 0a33ff9d..48499556 100644
--- a/stacks/monitoring/prometheus-service-monitors.yaml
+++ b/stacks/monitoring/prometheus-service-monitors.yaml
@@ -6,8 +6,9 @@
 # ServiceMonitor scrapes native metrics or a statsd/JMX exporter.
 #
 #
-# Utilize `prometheus.io/scheme`, `prometheus.io/port`, `prometheus.io/path` annotations set by the operators
-# to scrape all Stackable products.
+# Utilize `prometheus.io/scheme`, `prometheus.io/port`, `prometheus.io/path` (and optionally `prometheus.io/clusterdomain`)
+# annotations set by the operators to scrape all Stackable products.
+#
 # [x] Airflow - relabel drop filter on airflow container
 # [x] Druid
 # [x] HBase

From a05dac9a0506b59408db009b39380be5304d4c77 Mon Sep 17 00:00:00 2001
From: Malte Sander
Date: Thu, 30 Oct 2025 14:15:09 +0100
Subject: [PATCH 6/9] use tmp label and improve label defaulting

---
 .../monitoring/prometheus-service-monitors.yaml | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/stacks/monitoring/prometheus-service-monitors.yaml b/stacks/monitoring/prometheus-service-monitors.yaml
index 48499556..8cf8f254 100644
--- a/stacks/monitoring/prometheus-service-monitors.yaml
+++ b/stacks/monitoring/prometheus-service-monitors.yaml
@@ -47,27 +47,32 @@ spec:
           # - empty: filter when container label does not exist: hdfs
           regex: ^(airflow|superset|)$
           action: drop
+        # Add an empty label if none exists, or pass through the existing value
         - sourceLabels:
             - __meta_kubernetes_service_annotation_prometheus_io_clusterdomain
-          targetLabel: __cluster_domain__
+          targetLabel: __tmp_cluster_domain__
+          replacement: ""
+        # Use default value if empty
+        - sourceLabels:
+            - __tmp_cluster_domain__
+          targetLabel: __tmp_cluster_domain__
           regex: ^$
           replacement: "cluster.local"
+        # Scheme and Port
         - sourceLabels:
             - __meta_kubernetes_service_annotation_prometheus_io_scheme
-          action: replace
           targetLabel: __scheme__
           regex: (https?)
        - sourceLabels:
            - __meta_kubernetes_service_annotation_prometheus_io_path
-          action: replace
           targetLabel: __metrics_path__
           regex: (.+)
+        # Build metrics service address
         - sourceLabels:
            - __meta_kubernetes_service_name
            - __meta_kubernetes_namespace
-            - __cluster_domain__
+            - __tmp_cluster_domain__
            - __meta_kubernetes_service_annotation_prometheus_io_port
-          action: replace
           targetLabel: __address__
           regex: (.+);(.+);(.+);(\d+)
           replacement: $1.$2.svc.$3:$4

From 5479ba7e7c0ceb66c7c4f705e35cf3fa88ef2f61 Mon Sep 17 00:00:00 2001
From: Malte Sander
Date: Thu, 30 Oct 2025 14:41:11 +0100
Subject: [PATCH 7/9] fix cluster domain default

---
 stacks/monitoring/prometheus-service-monitors.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/stacks/monitoring/prometheus-service-monitors.yaml b/stacks/monitoring/prometheus-service-monitors.yaml
index 8cf8f254..099b755c 100644
--- a/stacks/monitoring/prometheus-service-monitors.yaml
+++ b/stacks/monitoring/prometheus-service-monitors.yaml
@@ -50,12 +50,12 @@ spec:
         # Add an empty label if none exists, or pass through the existing value
         - sourceLabels:
             - __meta_kubernetes_service_annotation_prometheus_io_clusterdomain
-          targetLabel: __tmp_cluster_domain__
-          replacement: ""
+          targetLabel: __tmp_clusterdomain__
+          replacement: $1
         # Use default value if empty
         - sourceLabels:
-            - __tmp_cluster_domain__
-          targetLabel: __tmp_cluster_domain__
+            - __tmp_clusterdomain__
+          targetLabel: __tmp_clusterdomain__
           regex: ^$
           replacement: "cluster.local"
         # Scheme and Port
@@ -71,7 +71,7 @@ spec:
         - sourceLabels:
            - __meta_kubernetes_service_name
            - __meta_kubernetes_namespace
-            - __tmp_cluster_domain__
+            - __tmp_clusterdomain__
            - __meta_kubernetes_service_annotation_prometheus_io_port
           targetLabel: __address__
           regex: (.+);(.+);(.+);(\d+)

From afbd36cdaaf87724432204b943569b362833bafb Mon Sep 17 00:00:00 2001
From: Malte Sander
Date: Thu, 30 Oct 2025 14:48:55 +0100
Subject: [PATCH 8/9] improve docs

---
 stacks/monitoring/prometheus-service-monitors.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stacks/monitoring/prometheus-service-monitors.yaml b/stacks/monitoring/prometheus-service-monitors.yaml
index 099b755c..1fe6f98d 100644
--- a/stacks/monitoring/prometheus-service-monitors.yaml
+++ b/stacks/monitoring/prometheus-service-monitors.yaml
@@ -58,7 +58,7 @@ spec:
           targetLabel: __tmp_clusterdomain__
           regex: ^$
           replacement: "cluster.local"
-        # Scheme and Port
+        # Scheme and port extraction
         - sourceLabels:
            - __meta_kubernetes_service_annotation_prometheus_io_scheme
          targetLabel: __scheme__
@@ -75,6 +75,7 @@ spec:
            - __meta_kubernetes_service_annotation_prometheus_io_port
           targetLabel: __address__
           regex: (.+);(.+);(.+);(\d+)
+          # <service>.<namespace>.svc.<cluster-domain>:<port>
           replacement: $1.$2.svc.$3:$4
       tlsConfig:
         ca:

From d65304bd052392b8f5be0a0f5ff8f26d64787405 Mon Sep 17 00:00:00 2001
From: Malte Sander
Date: Thu, 30 Oct 2025 14:51:39 +0100
Subject: [PATCH 9/9] fix comment

---
 stacks/monitoring/prometheus-service-monitors.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stacks/monitoring/prometheus-service-monitors.yaml b/stacks/monitoring/prometheus-service-monitors.yaml
index 1fe6f98d..0b1acdd2 100644
--- a/stacks/monitoring/prometheus-service-monitors.yaml
+++ b/stacks/monitoring/prometheus-service-monitors.yaml
@@ -58,7 +58,7 @@ spec:
           targetLabel: __tmp_clusterdomain__
           regex: ^$
           replacement: "cluster.local"
-        # Scheme and port extraction
+        # Scheme and path extraction
         - sourceLabels:
            - __meta_kubernetes_service_annotation_prometheus_io_scheme
          targetLabel:
__scheme__
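Taken together, patches 4 through 7 make the cluster domain overridable per Service. A minimal sketch of the resulting behaviour, assuming a hypothetical Service annotated like this (the domain value is made up; `prometheus.io/clusterdomain` is the annotation this series introduces):

  metadata:
    annotations:
      prometheus.io/clusterdomain: my-domain.local  # optional override, hypothetical value

With the annotation present, the first relabeling passes my-domain.local into __tmp_clusterdomain__ (replacement: $1), the ^$ default rule does not match, and the scrape address becomes <service>.<namespace>.svc.my-domain.local:<port>. Without it, __tmp_clusterdomain__ stays empty, the ^$ rule substitutes cluster.local, and the address falls back to <service>.<namespace>.svc.cluster.local:<port>.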