diff --git a/helm/blueapi/README.md b/helm/blueapi/README.md index 3862290fb..17e506607 100644 --- a/helm/blueapi/README.md +++ b/helm/blueapi/README.md @@ -32,8 +32,12 @@ A Helm chart deploying a worker pod that runs Bluesky plans | podAnnotations | object | `{}` | | | podLabels | object | `{}` | | | podSecurityContext | object | `{}` | | +| pvcAutoDeletion.enabled | bool | `true` | | | readinessProbe | object | `{"failureThreshold":2,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | Readiness probe, if configured kubernetes will not route traffic to this pod if failed consecutively. This could allow the service time to recover if it is being overwhelmed by traffic, but without the to ability to load balance or scale up/outwards, upstream services will need to know to back off. This is automatically disabled when in debug mode. | -| resources | object | `{"limits":{"cpu":"2000m","memory":"4000Mi"},"requests":{"cpu":"200m","memory":"400Mi"}}` | Sets the compute resources available to the pod. These defaults are appropriate when using debug mode or an internal PVC and therefore running VS Code server in the pod. In the Diamond cluster, requests must be >= 0.1*limits When not using either of the above, the limits may be lowered. When idle but connected, blueapi consumes ~400MB of memory and 1% cpu and may struggle when allocated less. 
| +| resources.limits.cpu | string | `"2000m"` | | +| resources.limits.memory | string | `"4000Mi"` | | +| resources.requests.cpu | string | `"200m"` | | +| resources.requests.memory | string | `"400Mi"` | | | restartOnConfigChange | bool | `true` | If enabled the blueapi pod will restart on changes to `worker` | | securityContext.runAsNonRoot | bool | `true` | | | securityContext.runAsUser | int | `1000` | | @@ -44,6 +48,7 @@ A Helm chart deploying a worker pod that runs Bluesky plans | serviceAccount.create | bool | `false` | | | serviceAccount.name | string | `""` | | | startupProbe | object | `{"failureThreshold":5,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | A more lenient livenessProbe to allow the service to start fully. This is automatically disabled when in debug mode. | +| timeStampCron.enabled | bool | `true` | | | tolerations | list | `[]` | May be required to run on specific nodes (e.g. the control machine) | | tracing | object | `{"fastapi":{"excludedURLs":"/healthz"},"otlp":{"enabled":false,"protocol":"http/protobuf","server":{"host":"http://opentelemetry-collector.tracing","port":4318}}}` | Exclude health probe requests from tracing by default to prevent spamming | | volumeMounts | list | `[{"mountPath":"/config","name":"worker-config","readOnly":true}]` | Additional volumeMounts on the output StatefulSet definition. Define how volumes are mounted to the container referenced by using the same name. 
| @@ -51,6 +56,5 @@ A Helm chart deploying a worker pod that runs Bluesky plans | worker | object | `{"api":{"url":"http://0.0.0.0:8000/"},"env":{"sources":[{"kind":"planFunctions","module":"dodal.plans"},{"kind":"planFunctions","module":"dodal.plan_stubs.wrapped"}]},"logging":{"graylog":{"enabled":false,"url":"tcp://graylog-log-target.diamond.ac.uk:12231/"},"level":"INFO"},"scratch":{"repositories":[],"root":"/workspace"},"stomp":{"auth":{"password":"guest","username":"guest"},"enabled":false,"url":"tcp://rabbitmq:61613/"}}` | Config for the worker goes here, will be mounted into a config file | | worker.api.url | string | `"http://0.0.0.0:8000/"` | 0.0.0.0 required to allow non-loopback traffic If using hostNetwork, the port must be free on the host | | worker.env.sources | list | `[{"kind":"planFunctions","module":"dodal.plans"},{"kind":"planFunctions","module":"dodal.plan_stubs.wrapped"}]` | modules (must be installed in the venv) to fetch devices/plans from | -| worker.logging | object | `{"graylog":{"enabled":false,"url":"tcp://graylog-log-target.diamond.ac.uk:12231/"},"level":"INFO"}` | Configures logging. Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` | | worker.scratch | object | `{"repositories":[],"root":"/workspace"}` | If initContainer is enabled the default branch of python projects in this section are installed into the venv *without their dependencies* | | worker.stomp | object | `{"auth":{"password":"guest","username":"guest"},"enabled":false,"url":"tcp://rabbitmq:61613/"}` | Message bus configuration for returning status to GDA/forwarding documents downstream Password may be in the form ${ENV_VAR} to be fetched from an environment variable e.g. 
mounted from a SealedSecret |
diff --git a/helm/blueapi/files/scripts/pvc-deletion.sh b/helm/blueapi/files/scripts/pvc-deletion.sh
new file mode 100644
index 000000000..fc4f2f941
--- /dev/null
+++ b/helm/blueapi/files/scripts/pvc-deletion.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+# Deletes PVCs in $RELEASE_NAMESPACE whose last-used annotation is more than three months old.
+# last-used holds epoch seconds (written by time-stamper.sh for PVCs referenced by pods).
+# PVCs without the annotation, or annotated protected=true, are never deleted.
+
+# Three months in seconds: 3 * 2628000 (one average month)
+MAX_AGE_SECONDS=7884000
+
+ALL_PVCS=$(kubectl get pvc -n "$RELEASE_NAMESPACE" -o=jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | sort -u)
+NOW=$(date +%s)
+
+for pvc in $ALL_PVCS; do
+  # Read the annotation once; kubectl prints an empty string when the key is missing
+  LAST_USED=$(kubectl get pvc "$pvc" -n "$RELEASE_NAMESPACE" -o=jsonpath='{.metadata.annotations.last-used}')
+  if [ -z "$LAST_USED" ]; then
+    echo "PVC $pvc does not have last-used annotation, skipping deletion"
+    continue
+  fi
+  # A non-numeric value would make the arithmetic below fail, so skip it explicitly
+  case "$LAST_USED" in
+    *[!0-9]*)
+      echo "PVC $pvc has a non-numeric last-used annotation, skipping deletion"
+      continue
+      ;;
+  esac
+  # Used recently enough: nothing to do
+  if [ $((NOW - LAST_USED)) -le "$MAX_AGE_SECONDS" ]; then
+    continue
+  fi
+  # Honour the opt-out annotation before deleting anything
+  if [ "$(kubectl get pvc "$pvc" -n "$RELEASE_NAMESPACE" -o=jsonpath='{.metadata.annotations.protected}')" = "true" ]; then
+    echo "PVC $pvc is protected, skipping deletion"
+    continue
+  fi
+  # PVC has not been used for more than three months, delete it
+  kubectl delete pvc "$pvc" -n "$RELEASE_NAMESPACE"
+done
diff --git a/helm/blueapi/files/scripts/time-stamper.sh b/helm/blueapi/files/scripts/time-stamper.sh
new file mode 100644
index 000000000..514cd01d2
--- /dev/null
+++ b/helm/blueapi/files/scripts/time-stamper.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+# Get all PVCs referenced by pods in the release namespace
+MOUNTED_PVCS=$(kubectl get pods -n $RELEASE_NAMESPACE \
+  -o=jsonpath='{.items[*].spec.volumes[*].persistentVolumeClaim.claimName}' | tr ' ' '\n' | sort -u)
+NOW=$(date +%s)
+# Stamp every referenced PVC with the current time (epoch seconds)
+for pvc in $MOUNTED_PVCS; do
+  kubectl annotate --overwrite pvc "$pvc" -n 
$RELEASE_NAMESPACE last-used="$NOW" +done diff --git a/helm/blueapi/templates/configmap.yaml b/helm/blueapi/templates/configmap.yaml index aa813e648..93ba1447e 100644 --- a/helm/blueapi/templates/configmap.yaml +++ b/helm/blueapi/templates/configmap.yaml @@ -31,6 +31,6 @@ data: init_config.yaml: |- scratch: {{- toYaml .Values.worker.scratch | nindent 6 }} -{{- end }} ---- +--- +{{- end }} diff --git a/helm/blueapi/templates/cronjob-configmaps.yaml b/helm/blueapi/templates/cronjob-configmaps.yaml new file mode 100644 index 000000000..188bb1a5f --- /dev/null +++ b/helm/blueapi/templates/cronjob-configmaps.yaml @@ -0,0 +1,22 @@ +{{- if .Values.timeStampCron.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name : {{include "blueapi.fullname" . }}-pvc-stamper-script +data: + {{- $files := .Files }} + time-stamper.sh: |- +{{ $files.Get "files/scripts/time-stamper.sh" | indent 4 }} +--- +{{- end }} + +{{- if .Values.pvcAutoDeletion.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name : {{include "blueapi.fullname" . }}-pvc-auto-deletion-script +data: + {{- $files := .Files }} + pvc-deletion.sh: |- +{{ $files.Get "files/scripts/pvc-deletion.sh" | indent 4 }} +{{- end }} diff --git a/helm/blueapi/templates/cronjob.yaml b/helm/blueapi/templates/cronjob.yaml new file mode 100644 index 000000000..717fa6bf4 --- /dev/null +++ b/helm/blueapi/templates/cronjob.yaml @@ -0,0 +1,159 @@ +{{- if .Values.timeStampCron.enabled }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +automountServiceAccountToken: true +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "blueapi.fullname" . 
}}-last-used-stamper
+  namespace: {{ .Release.Namespace }}
+rules:
+- apiGroups: [""]
+  resources: ["pods", "persistentvolumeclaims"]
+  verbs: ["get", "list", "patch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  namespace: {{ .Release.Namespace }}
+subjects:
+- kind: ServiceAccount
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: Role
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: {{ include "blueapi.fullname" . }}-last-used-stamper
+  namespace: {{ .Release.Namespace }}
+spec:
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 3
+  failedJobsHistoryLimit: 1
+  schedule: "*/5 * * * *"
+
+  jobTemplate:
+    spec:
+      # number of retries before the annotation job is marked as failed
+      backoffLimit: 3
+      # job is terminated if it is still active after 180 seconds
+      activeDeadlineSeconds: 180
+      template:
+        spec:
+          serviceAccountName: {{ include "blueapi.fullname" . }}-last-used-stamper
+          {{- with .Values.tolerations }}
+          tolerations:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+
+          volumes:
+            - name : {{include "blueapi.fullname" . }}-pvc-stamper-script
+              configMap:
+                name: {{include "blueapi.fullname" . }}-pvc-stamper-script
+                defaultMode: 0555
+
+
+          containers:
+            - name: last-used-stamper
+              env:
+                - name: RELEASE_NAME
+                  value: {{ .Release.Name }}
+                - name: RELEASE_NAMESPACE
+                  value: {{ .Release.Namespace }}
+              volumeMounts:
+                - name: {{include "blueapi.fullname" . }}-pvc-stamper-script
+                  mountPath: /scripts
+              image: bitnami/kubectl:latest
+              imagePullPolicy: IfNotPresent
+              command: ["/scripts/time-stamper.sh"]
+          restartPolicy: OnFailure
+{{- end }}
+{{- if .Values.pvcAutoDeletion.enabled }}
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+automountServiceAccountToken: true
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+rules:
+- apiGroups: [""]
+  resources: ["persistentvolumeclaims"]
+  verbs: ["get", "list", "delete"] # pvc-deletion.sh only reads and deletes PVCs
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+subjects:
+- kind: ServiceAccount
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: Role
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+  namespace: {{ .Release.Namespace }}
+spec:
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 3
+  failedJobsHistoryLimit: 1
+  schedule: "@weekly"
+
+  jobTemplate:
+    spec:
+      # number of retries before the deletion job is marked as failed
+      backoffLimit: 3
+      # job is terminated if it is still active after 300 seconds
+      activeDeadlineSeconds: 300
+      template:
+        spec:
+          serviceAccountName: {{ include "blueapi.fullname" . }}-pvc-auto-deletion
+          {{- with .Values.tolerations }}
+          tolerations:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+
+          volumes:
+            - name : {{include "blueapi.fullname" . }}-pvc-auto-deletion-script
+              configMap:
+                name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script
+                defaultMode: 0555
+
+
+          containers:
+            - name: pvc-auto-deletion
+              env:
+                - name: RELEASE_NAME
+                  value: {{ .Release.Name }}
+                - name: RELEASE_NAMESPACE
+                  value: {{ .Release.Namespace }}
+              volumeMounts:
+                - name: {{include "blueapi.fullname" . 
}}-pvc-auto-deletion-script + mountPath: /scripts + image: bitnami/kubectl:latest + imagePullPolicy: IfNotPresent + command: ["/scripts/pvc-deletion.sh"] + restartPolicy: OnFailure +{{- end }} diff --git a/helm/blueapi/values.schema.json b/helm/blueapi/values.schema.json index 3159f6713..654e1178d 100644 --- a/helm/blueapi/values.schema.json +++ b/helm/blueapi/values.schema.json @@ -174,6 +174,14 @@ "podSecurityContext": { "type": "object" }, + "pvcAutoDeletion": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + }, "readinessProbe": { "description": "Readiness probe, if configured kubernetes will not route traffic to this pod if failed consecutively. This could allow the service time to recover if it is being overwhelmed by traffic, but without the to ability to load balance or scale up/outwards, upstream services will need to know to back off. This is automatically disabled when in debug mode.", "type": "object", @@ -198,7 +206,6 @@ } }, "resources": { - "description": "Sets the compute resources available to the pod. These defaults are appropriate when using debug mode or an internal PVC and therefore running VS Code server in the pod. In the Diamond cluster, requests must be \u003e= 0.1*limits When not using either of the above, the limits may be lowered. When idle but connected, blueapi consumes ~400MB of memory and 1% cpu and may struggle when allocated less.", "type": "object", "properties": { "limits": { @@ -292,6 +299,14 @@ } } }, + "timeStampCron": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + }, "tolerations": { "description": "May be required to run on specific nodes (e.g. the control machine)", "type": "array" @@ -389,7 +404,6 @@ } }, "logging": { - "description": "Configures logging. 
Port 12231 is the `dodal` input on graylog which will be renamed `blueapi`", "type": "object", "properties": { "graylog": { diff --git a/helm/blueapi/values.yaml b/helm/blueapi/values.yaml index 876b37a98..c7e6e2fa1 100644 --- a/helm/blueapi/values.yaml +++ b/helm/blueapi/values.yaml @@ -36,8 +36,7 @@ podAnnotations: {} # For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ podLabels: {} -podSecurityContext: {} - # fsGroup: 2000 +podSecurityContext: {} # fsGroup: 2000 securityContext: # https://github.com/DiamondLightSource/blueapi/issues/1096 @@ -48,7 +47,7 @@ securityContext: # drop: # - ALL -# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ + # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ service: # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types # -- To make blueapi available on an IP outside of the cluster prior to an Ingress being created, change this to LoadBalancer @@ -76,13 +75,13 @@ ingress: # hosts: # - chart-example.local -# -- Sets the compute resources available to the pod. -# These defaults are appropriate when using debug mode or an internal PVC and therefore -# running VS Code server in the pod. -# In the Diamond cluster, requests must be >= 0.1*limits -# When not using either of the above, the limits may be lowered. -# When idle but connected, blueapi consumes ~400MB of memory and 1% cpu -# and may struggle when allocated less. + # -- Sets the compute resources available to the pod. + # These defaults are appropriate when using debug mode or an internal PVC and therefore + # running VS Code server in the pod. 
+ # In the Diamond cluster, requests must be >= 0.1*limits + # When not using either of the above, the limits may be lowered. + # When idle but connected, blueapi consumes ~400MB of memory and 1% cpu + # and may struggle when allocated less. resources: # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases chances charts run on environments with little @@ -205,7 +204,7 @@ worker: repositories: [] # - name: "dodal" # remote_url: https://github.com/DiamondLightSource/dodal.git - # -- Configures logging. Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` + # -- Configures logging. Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` logging: level: "INFO" graylog: @@ -224,6 +223,12 @@ initContainer: # -- Size of persistent volume size: "1Gi" +timeStampCron: + enabled: true + +pvcAutoDeletion: + enabled: true + debug: # -- If enabled, runs debugpy, allowing port-forwarding to expose port 5678 or attached vscode instance enabled: false