
Commit 05a5fb6

Add stackable-generic ServiceMonitor (#324)
* Cleanup service monitors, add new stackable-generic service monitor that works with all products
* Cleanup obsolete service monitors
* Add spark connect / history comment
* Make cluster domain configurable via annotation
* Improve comments
* Use tmp label and improve label defaulting
* Fix cluster domain default
* Improve docs
* Fix comment
1 parent 78ae85d commit 05a5fb6

File tree

1 file changed
stacks/monitoring/prometheus-service-monitors.yaml

Lines changed: 44 additions & 245 deletions
@@ -1,24 +1,27 @@
+---
 # Use something like this to check for metrics:
 # count by (app_kubernetes_io_name, app_kubernetes_io_instance, pod) ({app_kubernetes_io_name!="",pod!=""})
 #
 # Products metrics covered by the ServiceMonitors below. The list also includes whether the
 # ServiceMonitor scrapes native metrics or a statsd/JMX exporter.
 #
-# [x] Airflow - exporter
-# [x] Druid - native
-# [x] HBase - native
-# [x] Hadoop HDFS - native
-# [x] Hive - exporter
-# [x] Kafka - exporter
-# [x] NiFi 1 - native
-# [x] NiFi 2 - native
-# [x] OpenSearch - native
-# [ ] Spark - native - partially done, see comment on it below
-# [x] Superset - exporter
-# [x] Trino - native
-# [x] ZooKeeper - native
-# [x] OPA - native
----
+#
+# Utilize `prometheus.io/scheme`, `prometheus.io/port`, `prometheus.io/path` (and optionally `prometheus.io/clusterdomain`)
+# annotations set by the operators to scrape all Stackable products.
+#
+# [x] Airflow - relabel drop filter on airflow container
+# [x] Druid
+# [x] HBase
+# [x] Hadoop HDFS - relabel drop filter on empty container
+# [x] Hive
+# [x] Kafka - TODO: do the listener services have metrics?
+# [x] NiFi 1 + 2
+# [x] OpenSearch
+# [x] Spark: Connect, HistoryServer
+# [x] Superset - relabel drop filter on superset container
+# [x] Trino
+# [x] ZooKeeper
+# [x] OPA
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
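
For context, here is a minimal sketch of a Service that the new generic ServiceMonitor would match, assuming the labels and annotations named in the header comments above. The Service name, port name, and port number are illustrative only, not taken from this commit:

apiVersion: v1
kind: Service
metadata:
  name: simple-trino-coordinator-metrics  # hypothetical name
  labels:
    # Both labels are required by the ServiceMonitor's selector.
    stackable.tech/vendor: Stackable
    prometheus.io/scrape: "true"
  annotations:
    # Read by the relabelings in the hunk below to build the scrape target.
    prometheus.io/scheme: https
    prometheus.io/path: /metrics
    prometheus.io/port: "8443"
    # Optional; the relabelings default this to cluster.local when it is absent.
    prometheus.io/clusterdomain: cluster.local
spec:
  ports:
    - name: metrics
      port: 8443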
@@ -33,218 +36,47 @@ spec:
     matchLabels:
       stackable.tech/vendor: Stackable
       prometheus.io/scrape: "true"
-    matchExpressions:
-      - key: app.kubernetes.io/name
-        operator: In
-        values:
-          - airflow
-          - druid
-          - hive
-          - nifi # This only works for NiFi 1, NiFi 2 has a special ServiceMonitor below
-          - opa
-          - superset
-          - trino
-  endpoints:
-    - scheme: http
-      port: metrics
-      path: /metrics
-  podTargetLabels:
-    - app.kubernetes.io/name
-    - app.kubernetes.io/instance
-    - app.kubernetes.io/component
-    - app.kubernetes.io/role-group
-    - app.kubernetes.io/version
----
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: stackable-native-metrics
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      stackable.tech/vendor: Stackable
-      prometheus.io/scrape: "true"
-    matchExpressions:
-      - key: app.kubernetes.io/name
-        operator: In
-        values:
-          - zookeeper
-  endpoints:
-    - scheme: http
-      port: native-metrics
-      path: /metrics
-  podTargetLabels:
-    - app.kubernetes.io/name
-    - app.kubernetes.io/instance
-    - app.kubernetes.io/component
-    - app.kubernetes.io/role-group
-    - app.kubernetes.io/version
----
-# Kafka is special in that the operator totally messes up services:
-# 1. The metrics Service is missing
-# 2. The role level simple-kafka-broker-default has the prometheus.io/scrape label, but exposes no ports...
-# 3. The role level simple-kafka-broker-default is labeled with app.kubernetes.io/name: listener???
-# So we have a dedicated config for it
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: stackable-kafka
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      stackable.tech/vendor: Stackable
-      app.kubernetes.io/name: listener # Dafuq?
-      app.kubernetes.io/component: broker # We need to filter on brokers instead, as the app.kubernetes.io/name is messed up
-  endpoints:
-    - scheme: http
-      port: metrics
-      path: /metrics
-  podTargetLabels:
-    - app.kubernetes.io/name
-    - app.kubernetes.io/instance
-    - app.kubernetes.io/component
-    - app.kubernetes.io/role-group
-    - app.kubernetes.io/version
----
-# We prefer the native metrics over the statsd-exporter
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: stackable-hdfs
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      stackable.tech/vendor: Stackable
-      prometheus.io/scrape: "true"
-      app.kubernetes.io/name: hdfs
-  endpoints:
-    - scheme: http
-      port: http # Don't use the "metrics" exporter port, we want native metrics instead
-      path: /prom
-  podTargetLabels:
-    - app.kubernetes.io/name
-    - app.kubernetes.io/instance
-    - app.kubernetes.io/component
-    - app.kubernetes.io/role-group
-    - app.kubernetes.io/version
----
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: stackable-hbase
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      stackable.tech/vendor: Stackable
-      prometheus.io/scrape: "true"
-      app.kubernetes.io/name: hbase
-  endpoints:
-    - scheme: http
-      port: metrics
-      path: /prometheus
-  podTargetLabels:
-    - app.kubernetes.io/name
-    - app.kubernetes.io/instance
-    - app.kubernetes.io/component
-    - app.kubernetes.io/role-group
-    - app.kubernetes.io/version
----
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: stackable-opensearch
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      stackable.tech/vendor: Stackable
-      prometheus.io/scrape: "true"
-      app.kubernetes.io/name: opensearch
   endpoints:
     - relabelings:
+        - sourceLabels:
+            - __meta_kubernetes_pod_container_name
+          # Pods show up twice due to multiple containers, we only keep the main / product container.
+          # Except for Airflow and Superset, where we choose the metrics container.
+          # - airflow: airflow
+          # - superset: superset
+          # - empty: filter when container label does not exist: hdfs
+          regex: ^(airflow|superset|)$
+          action: drop
+        # Add empty label if not existing or pass-through existing value
+        - sourceLabels:
+            - __meta_kubernetes_service_annotation_prometheus_io_clusterdomain
+          targetLabel: __tmp_clusterdomain__
+          replacement: $1
+        # Use default value if empty
+        - sourceLabels:
+            - __tmp_clusterdomain__
+          targetLabel: __tmp_clusterdomain__
+          regex: ^$
+          replacement: "cluster.local"
+        # Scheme and path extraction
         - sourceLabels:
             - __meta_kubernetes_service_annotation_prometheus_io_scheme
-          action: replace
           targetLabel: __scheme__
           regex: (https?)
         - sourceLabels:
             - __meta_kubernetes_service_annotation_prometheus_io_path
-          action: replace
           targetLabel: __metrics_path__
           regex: (.+)
-        # Use the FQDN instead of the IP address because the IP address
-        # is not contained in the certificate.
+        # Build metrics service address
         - sourceLabels:
-            - __meta_kubernetes_pod_name
             - __meta_kubernetes_service_name
             - __meta_kubernetes_namespace
+            - __tmp_clusterdomain__
             - __meta_kubernetes_service_annotation_prometheus_io_port
-          action: replace
           targetLabel: __address__
           regex: (.+);(.+);(.+);(\d+)
-          replacement: $1.$2.$3.svc.cluster.local:$4
-      tlsConfig:
-        ca:
-          secret:
-            name: prometheus-tls-certificate
-            key: ca.crt
-  podTargetLabels:
-    - app.kubernetes.io/name
-    - app.kubernetes.io/instance
-    - app.kubernetes.io/component
-    - app.kubernetes.io/role-group
-    - app.kubernetes.io/version
----
-# NiFI 2 is a beast on it's own...
-# We need to use mTLS (otherwise we get a 401) and can not use the PodIP
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: stackable-nifi-2
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      stackable.tech/vendor: Stackable
-      prometheus.io/scrape: "true"
-    matchExpressions:
-      - key: app.kubernetes.io/name
-        operator: In
-        values:
-          - nifi
-  endpoints:
-    - scheme: https
-      port: https
-      path: /nifi-api/flow/metrics/prometheus
-      # See https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#monitoring.coreos.com/v1.TLSConfig
+          # <service-name>.<namespace>.svc.<cluster-domain>:<port>
+          replacement: $1.$2.svc.$3:$4
       tlsConfig:
         ca:
           secret:
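
A worked example of the relabel chain above, with illustrative values: Prometheus joins sourceLabels with ";" before matching, so for the hypothetical Service from the sketch earlier the address rule sees

  simple-trino-coordinator-metrics;default;cluster.local;8443

The regex (.+);(.+);(.+);(\d+) captures the four fields, and the replacement $1.$2.svc.$3:$4 produces

  __address__ = simple-trino-coordinator-metrics.default.svc.cluster.local:8443

The two __tmp_clusterdomain__ rules guarantee the third field is never empty: the first copies the prometheus.io/clusterdomain annotation (empty when unset), the second overwrites the label with cluster.local only when it matches ^$. The drop rule at the top removes duplicate targets whose container name matches ^(airflow|superset|)$, i.e. the airflow and superset product containers and targets with no container name at all.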
@@ -257,28 +89,15 @@ spec:
         keySecret:
           name: prometheus-tls-certificate
           key: tls.key
-      # We need to talk to the Pod via the FQDN of the Pod because of the stupid SNI check of NiFi.
-      # We can not use the typical PodIP, as it is not contained in the NiFi certificate,
-      # see https://github.com/stackabletech/secret-operator/issues/620
-      relabelings:
-        - sourceLabels:
-            - __meta_kubernetes_pod_name
-            - __meta_kubernetes_service_name
-            - __meta_kubernetes_namespace
-            - __meta_kubernetes_pod_container_port_number
-          targetLabel: __address__
-          replacement: ${1}.${2}-headless.${3}.svc.cluster.local:${4}
-          regex: (.+);(.+?)(?:-metrics)?;(.+);(.+)
   podTargetLabels:
     - app.kubernetes.io/name
     - app.kubernetes.io/instance
     - app.kubernetes.io/component
     - app.kubernetes.io/role-group
     - app.kubernetes.io/version
 ---
-# spark-k8s-operator does not deploy any Services at all (at least for SparkApplications).
 # We currently only scrape the driver, going forward we might want to scrape the executors as well.
-# In the future we might also want to scrape SparkConnect and HistoryServers.
+# SparkConnect and HistoryServers are scraped via the `stackable` ServiceMonitor.
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
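
The PodMonitor spec itself sits in lines elided from this diff. Purely as an illustration of the resource type used for the Spark driver, a PodMonitor could look like the following; the name, selector label, and port are assumptions, not taken from this commit:

apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: stackable-spark  # hypothetical name
  labels:
    release: prometheus
spec:
  namespaceSelector:
    any: true
  selector:
    matchLabels:
      spark-role: driver  # assumption: the label Spark on Kubernetes sets on driver pods
  podMetricsEndpoints:
    - port: metrics  # assumption: a named container port on the driver pod
      path: /metrics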
@@ -308,26 +127,6 @@ spec:
 ---
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
-metadata:
-  name: stackable-minio-http
-  labels:
-    stackable.tech/vendor: Stackable
-    release: prometheus
-spec:
-  namespaceSelector:
-    any: true
-  selector:
-    matchLabels:
-      # stackable.tech/vendor: Stackable # This is not always set, e.g. missing in the nifi-kafka-druid-water-level-data demo
-      app: minio
-      monitoring: "true"
-  endpoints:
-    - scheme: http
-      port: http
-      path: /minio/v2/metrics/cluster
----
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
 metadata:
   name: stackable-minio-https
   labels:
