Skip to content

Commit 987cefe

Browse files
authored
[Feature] OPS Alerts (#1119)
1 parent 7a416e5 commit 987cefe

File tree

12 files changed

+115
-1
lines changed

12 files changed

+115
-1
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
# Change Log
22

33
## [master](https://github.com/arangodb/kube-arangodb/tree/master) (N/A)
4-
- (Feature) Add new field to DeploymentReplicationStatus with details on DC2DC sync status
4+
- (Feature) Add new field to DeploymentReplicationStatus with details on DC2DC sync status
55
- (Feature) Early connections support
66
- (Bugfix) Fix and document action timeouts
77
- (Feature) Propagate sidecars' ports to a member's service
88
- (Debug Package) Initial commit
99
- (Feature) Detach PVC from deployment in Ordered indexing method
10+
- (Feature) OPS Alerts
1011

1112
## [1.2.16](https://github.com/arangodb/kube-arangodb/tree/1.2.16) (2022-09-14)
1213
- (Feature) Add ArangoDeployment ServerGroupStatus

docs/generated/metrics/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
1515
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
1616
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
17+
| [arangodb_operator_engine_ops_alerts](./arangodb_operator_engine_ops_alerts.md) | arangodb_operator | engine | Counter | Counter for actions which require ops attention |
1718
| [arangodb_operator_engine_panics_recovered](./arangodb_operator_engine_panics_recovered.md) | arangodb_operator | engine | Counter | Number of Panics recovered inside Operator reconciliation loop |
1819
| [arangodb_operator_members_unexpected_container_exit_codes](./arangodb_operator_members_unexpected_container_exit_codes.md) | arangodb_operator | members | Counter | Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers) |
1920
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# arangodb_operator_engine_ops_alerts (Counter)
2+
3+
## Description
4+
5+
Counter for actions which require ops attention
6+
7+
## Labels
8+
9+
| Label | Description |
10+
|:---------:|:---------------------|
11+
| namespace | Deployment Namespace |
12+
| name | Deployment Name |
13+
14+
15+
## Alerting
16+
17+
| Priority | Query | Description |
18+
|:--------:|:--------------------------------------------------:|:--------------------------------------------|
19+
| Warning | irate(arangodb_operator_engine_ops_alerts[1m]) > 1 | Trigger an alert if OPS attention is needed |

internal/metrics.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,4 +221,18 @@ namespaces:
221221
labels:
222222
- key: section
223223
description: "Panic Section"
224+
ops_alerts:
225+
shortDescription: "Counter for actions which require ops attention"
226+
description: "Counter for actions which require ops attention"
227+
type: "Counter"
228+
labels:
229+
- key: namespace
230+
description: "Deployment Namespace"
231+
- key: name
232+
description: "Deployment Name"
233+
alertingRules:
234+
- priority: Warning
235+
query: irate(arangodb_operator_engine_ops_alerts[1m]) > 1
236+
description: "Trigger an alert if OPS attention is needed"
237+
224238

pkg/deployment/context_impl.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ package deployment
2323
import (
2424
"context"
2525
"crypto/tls"
26+
"fmt"
2627
"net"
2728
nhttp "net/http"
2829
"strconv"
@@ -651,3 +652,13 @@ func (d *Deployment) GenerateMemberEndpoint(group api.ServerGroup, member api.Me
651652
func (d *Deployment) ACS() sutil.ACS {
652653
return d.acs
653654
}
655+
656+
func (d *Deployment) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) {
657+
if d == nil {
658+
return
659+
}
660+
661+
d.metrics.ArangodbOperatorEngineOpsAlerts++
662+
663+
d.CreateEvent(k8sutil.NewOperatorEngineOpsAlertEvent(fmt.Sprintf(message, args...), d.GetAPIObject()))
664+
}

pkg/deployment/metrics.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ type Metrics struct {
3636
DeploymentValidationErrors, DeploymentImmutableErrors, StatusRestores uint64
3737
}
3838

39+
ArangodbOperatorEngineOpsAlerts int
40+
3941
Deployment struct {
4042
Accepted, UpToDate bool
4143
}

pkg/deployment/reconcile/action_cleanout_member.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,12 @@ func (a *actionCleanOutMember) CheckProgress(ctx context.Context) (bool, bool, e
141141
}
142142
}
143143

144+
if cache.PlanServers().Contains(agency.Server(m.ID)) {
145+
// Something is wrong, server is CleanedOut but still exists in the Plan
146+
a.actionCtx.CreateOperatorEngineOpsAlertEvent("DBServer %s still exists in Plan after CleanOut", m.ID)
147+
return false, true, nil
148+
}
149+
144150
// Cleanout completed
145151
return true, false, nil
146152
}

pkg/deployment/reconcile/action_context.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ type ActionContext interface {
5151
reconciler.ArangoAgencyGet
5252
reconciler.DeploymentInfoGetter
5353
reconciler.DeploymentDatabaseClient
54+
reconciler.KubernetesEventGenerator
5455

5556
member.StateInspectorGetter
5657

@@ -130,6 +131,10 @@ type actionContext struct {
130131
metrics *Metrics
131132
}
132133

134+
func (ac *actionContext) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) {
135+
ac.context.CreateOperatorEngineOpsAlertEvent(message, args...)
136+
}
137+
133138
func (ac *actionContext) Metrics() *Metrics {
134139
return ac.metrics
135140
}

pkg/deployment/reconcile/plan_builder_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@ type testContext struct {
8686
state member.StateInspector
8787
}
8888

89+
func (c *testContext) CreateOperatorEngineOpsAlertEvent(message string, args ...interface{}) {
90+
//TODO implement me
91+
panic("implement me")
92+
}
93+
8994
func (c *testContext) GetAgencyHealth() (agencyCache.Health, bool) {
9095
//TODO implement me
9196
panic("implement me")

pkg/deployment/reconciler/context.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ type KubernetesEventGenerator interface {
138138
// CreateEvent creates a given event.
139139
// On error, the error is logged.
140140
CreateEvent(evt *k8sutil.Event)
141+
142+
CreateOperatorEngineOpsAlertEvent(message string, args ...interface{})
141143
}
142144

143145
// DeploymentClient provides functionalities to get deployment's clients.

0 commit comments

Comments
 (0)