From f452f572878aebaf197585e5da1b45bb937ba0c1 Mon Sep 17 00:00:00 2001 From: Alex J Date: Thu, 9 Apr 2026 09:24:30 +0100 Subject: [PATCH 1/2] chore: making new branch (#1478) merging personal branch to a new branch. Add conditional cronjob and increase backoff limit Refactor time-stamper.sh script inclusion in ConfigMap Time stamper cron (#1479) Change chart version from 1.13.0 to 0.1.0 removing not fit for purpose test Update time-stamper.sh removing the logic for if the pvc wasn't mounted. now only annotates mounted pvcs. Base logic for pvc auto deletion Sorting out the weekly cronjob for pvc auto deletion, also adding something to value yaml to turn it off Added Del perm del s changing name to be more readable fix: if there isn't a last_used check if not null Fix for time-stamper.sh as it was annotating all PVCs pvc deletion test fix removing test yaml added affinity and tolerations to cronjobs added an affinity to the cronjob. --- helm/blueapi/README.md | 8 +- helm/blueapi/files/scripts/pvc-deletion.sh | 29 +++ helm/blueapi/files/scripts/time-stamper.sh | 10 ++ helm/blueapi/templates/configmap.yaml | 4 +- .../blueapi/templates/cronjob-configmaps.yaml | 22 +++ helm/blueapi/templates/cronjob.yaml | 169 ++++++++++++++++++ helm/blueapi/values.schema.json | 18 +- helm/blueapi/values.yaml | 27 +-- 8 files changed, 270 insertions(+), 17 deletions(-) create mode 100644 helm/blueapi/files/scripts/pvc-deletion.sh create mode 100644 helm/blueapi/files/scripts/time-stamper.sh create mode 100644 helm/blueapi/templates/cronjob-configmaps.yaml create mode 100644 helm/blueapi/templates/cronjob.yaml diff --git a/helm/blueapi/README.md b/helm/blueapi/README.md index 3862290fb8..17e5066071 100644 --- a/helm/blueapi/README.md +++ b/helm/blueapi/README.md @@ -32,8 +32,12 @@ A Helm chart deploying a worker pod that runs Bluesky plans | podAnnotations | object | `{}` | | | podLabels | object | `{}` | | | podSecurityContext | object | `{}` | | +| pvcAutoDeletion.enabled | bool | `true` 
| | | readinessProbe | object | `{"failureThreshold":2,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | Readiness probe, if configured kubernetes will not route traffic to this pod if failed consecutively. This could allow the service time to recover if it is being overwhelmed by traffic, but without the to ability to load balance or scale up/outwards, upstream services will need to know to back off. This is automatically disabled when in debug mode. | -| resources | object | `{"limits":{"cpu":"2000m","memory":"4000Mi"},"requests":{"cpu":"200m","memory":"400Mi"}}` | Sets the compute resources available to the pod. These defaults are appropriate when using debug mode or an internal PVC and therefore running VS Code server in the pod. In the Diamond cluster, requests must be >= 0.1*limits When not using either of the above, the limits may be lowered. When idle but connected, blueapi consumes ~400MB of memory and 1% cpu and may struggle when allocated less. | +| resources.limits.cpu | string | `"2000m"` | | +| resources.limits.memory | string | `"4000Mi"` | | +| resources.requests.cpu | string | `"200m"` | | +| resources.requests.memory | string | `"400Mi"` | | | restartOnConfigChange | bool | `true` | If enabled the blueapi pod will restart on changes to `worker` | | securityContext.runAsNonRoot | bool | `true` | | | securityContext.runAsUser | int | `1000` | | @@ -44,6 +48,7 @@ A Helm chart deploying a worker pod that runs Bluesky plans | serviceAccount.create | bool | `false` | | | serviceAccount.name | string | `""` | | | startupProbe | object | `{"failureThreshold":5,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | A more lenient livenessProbe to allow the service to start fully. This is automatically disabled when in debug mode. | +| timeStampCron.enabled | bool | `true` | | | tolerations | list | `[]` | May be required to run on specific nodes (e.g. 
the control machine) | | tracing | object | `{"fastapi":{"excludedURLs":"/healthz"},"otlp":{"enabled":false,"protocol":"http/protobuf","server":{"host":"http://opentelemetry-collector.tracing","port":4318}}}` | Exclude health probe requests from tracing by default to prevent spamming | | volumeMounts | list | `[{"mountPath":"/config","name":"worker-config","readOnly":true}]` | Additional volumeMounts on the output StatefulSet definition. Define how volumes are mounted to the container referenced by using the same name. | @@ -51,6 +56,5 @@ A Helm chart deploying a worker pod that runs Bluesky plans | worker | object | `{"api":{"url":"http://0.0.0.0:8000/"},"env":{"sources":[{"kind":"planFunctions","module":"dodal.plans"},{"kind":"planFunctions","module":"dodal.plan_stubs.wrapped"}]},"logging":{"graylog":{"enabled":false,"url":"tcp://graylog-log-target.diamond.ac.uk:12231/"},"level":"INFO"},"scratch":{"repositories":[],"root":"/workspace"},"stomp":{"auth":{"password":"guest","username":"guest"},"enabled":false,"url":"tcp://rabbitmq:61613/"}}` | Config for the worker goes here, will be mounted into a config file | | worker.api.url | string | `"http://0.0.0.0:8000/"` | 0.0.0.0 required to allow non-loopback traffic If using hostNetwork, the port must be free on the host | | worker.env.sources | list | `[{"kind":"planFunctions","module":"dodal.plans"},{"kind":"planFunctions","module":"dodal.plan_stubs.wrapped"}]` | modules (must be installed in the venv) to fetch devices/plans from | -| worker.logging | object | `{"graylog":{"enabled":false,"url":"tcp://graylog-log-target.diamond.ac.uk:12231/"},"level":"INFO"}` | Configures logging. 
Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` | | worker.scratch | object | `{"repositories":[],"root":"/workspace"}` | If initContainer is enabled the default branch of python projects in this section are installed into the venv *without their dependencies* | | worker.stomp | object | `{"auth":{"password":"guest","username":"guest"},"enabled":false,"url":"tcp://rabbitmq:61613/"}` | Message bus configuration for returning status to GDA/forwarding documents downstream Password may be in the form ${ENV_VAR} to be fetched from an environment variable e.g. mounted from a SealedSecret | diff --git a/helm/blueapi/files/scripts/pvc-deletion.sh b/helm/blueapi/files/scripts/pvc-deletion.sh new file mode 100644 index 0000000000..d912a1542d --- /dev/null +++ b/helm/blueapi/files/scripts/pvc-deletion.sh @@ -0,0 +1,29 @@ +#!/bin/sh +# Get all PVCs by running pods +ALL_PVCS=$(kubectl get pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | sort -u) +BLUEAPI_PVCS=$( echo $ALL_PVCS | tr ' ' '\n' | grep blueapi-scratch) +NOW=$(date +%s) +#loop through all pvcs. 
+for pvc in $BLUEAPI_PVCS; do + #check if pvc has last-used annotation + if kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}' + then + #get last used annotation + LAST_USED=$(kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}') + #checking if its not null + if [ -n "$LAST_USED" ]; then + #check if last_used is older than 3 months + if [ $(($NOW - LAST_USED)) -gt 7884000 ]; then + #checking if the pvc is protected, if it is protected skip deletion + if [ "$(kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.protected}')" = "true" ]; then + echo "PVC $pvc is protected, skipping deletion" + continue + fi + #PVC has not been used for more than three months, delete it + kubectl delete pvc "$pvc" -n $RELEASE_NAMESPACE + fi + fi + else + echo "PVC $pvc does not have last-used annotation, skipping deletion" + fi +done diff --git a/helm/blueapi/files/scripts/time-stamper.sh b/helm/blueapi/files/scripts/time-stamper.sh new file mode 100644 index 0000000000..40de8d006f --- /dev/null +++ b/helm/blueapi/files/scripts/time-stamper.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# Get all PVCs currently mounted by running pods +MOUNTED_PVCS=$(kubectl get pods -n $RELEASE_NAMESPACE \ + -o=jsonpath='{.items[*].spec.volumes[*].persistentVolumeClaim.claimName}' | tr ' ' '\n' | sort -u) +BLUEAPI_PVCS=$( echo $MOUNTED_PVCS | tr ' ' '\n' | grep blueapi-scratch) +#loop through all the pvcs annotating ones that are mounted +NOW=$(date +%s) +for pvc in $BLUEAPI_PVCS; do + kubectl annotate --overwrite pvc "$pvc" -n $RELEASE_NAMESPACE last-used="$NOW" +done diff --git a/helm/blueapi/templates/configmap.yaml b/helm/blueapi/templates/configmap.yaml index aa813e6485..93ba1447ea 100644 --- a/helm/blueapi/templates/configmap.yaml +++ b/helm/blueapi/templates/configmap.yaml @@ -31,6 +31,6 @@ data: init_config.yaml: |- scratch: {{- toYaml .Values.worker.scratch | nindent 6 }} -{{- end }} ---- +--- +{{- end 
}} diff --git a/helm/blueapi/templates/cronjob-configmaps.yaml b/helm/blueapi/templates/cronjob-configmaps.yaml new file mode 100644 index 0000000000..188bb1a5f7 --- /dev/null +++ b/helm/blueapi/templates/cronjob-configmaps.yaml @@ -0,0 +1,22 @@ +{{- if .Values.timeStampCron.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name : {{include "blueapi.fullname" . }}-pvc-stamper-script +data: + {{- $files := .Files }} + time-stamper.sh: |- +{{ $files.Get "files/scripts/time-stamper.sh" | indent 4 }} +--- +{{- end }} + +{{- if .Values.pvcAutoDeletion.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name : {{include "blueapi.fullname" . }}-pvc-auto-deletion-script +data: + {{- $files := .Files }} + pvc-deletion.sh: |- +{{ $files.Get "files/scripts/pvc-deletion.sh" | indent 4 }} +{{- end }} diff --git a/helm/blueapi/templates/cronjob.yaml b/helm/blueapi/templates/cronjob.yaml new file mode 100644 index 0000000000..3dea62180b --- /dev/null +++ b/helm/blueapi/templates/cronjob.yaml @@ -0,0 +1,169 @@ +{{- if .Values.timeStampCron.enabled }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +automountServiceAccountToken: true +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +rules: +- apiGroups: [""] + resources: ["pods", "persistentvolumeclaims"] + verbs: ["get", "list", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +subjects: +- kind: ServiceAccount + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +roleRef: + kind: Role + name: {{ include "blueapi.fullname" . 
}}-last-used-stamper + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +spec: + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + schedule: "*/5 * * * *" + + jobTemplate: + spec: + # amount of attempts of labeling a pvc + backoffLimit: 3 + # job stops after 180 seconds + activeDeadlineSeconds: 180 + template: + spec: + serviceAccountName: {{ include "blueapi.fullname" . }}-last-used-stamper + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: {{include "blueapi.fullname" . }}-pvc-stamper-script + configMap: + name: {{include "blueapi.fullname" . }}-pvc-stamper-script + defaultMode: 0555 + containers: + - name: last-used-stamper + env: + - name: RELEASE_NAME + value: {{ .Release.Name }} + - name: RELEASE_NAMESPACE + value: {{ .Release.Namespace }} + volumeMounts: + - name: {{include "blueapi.fullname" . }}-pvc-stamper-script + mountPath: /scripts + image: bitnami/kubectl:latest + imagePullPolicy: IfNotPresent + command: ["/scripts/time-stamper.sh"] + restartPolicy: OnFailure +{{- end }} +{{- if .Values.pvcAutoDeletion.enabled }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +automountServiceAccountToken: true +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "blueapi.fullname" . 
}}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +rules: +- apiGroups: [""] + resources: ["pods", "persistentvolumeclaims"] + verbs: ["get", "list", "patch","delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +subjects: +- kind: ServiceAccount + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +roleRef: + kind: Role + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +spec: + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + schedule: "@weekly" + + jobTemplate: + spec: + # amount of attempts of labeling a pvc + backoffLimit: 3 + # job stops after 300 seconds + activeDeadlineSeconds: 300 + template: + spec: + serviceAccountName: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script + configMap: + name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script + defaultMode: 0555 + containers: + - name: pvc-auto-deletion + env: + - name: RELEASE_NAME + value: {{ .Release.Name }} + - name: RELEASE_NAMESPACE + value: {{ .Release.Namespace }} + volumeMounts: + - name: {{include "blueapi.fullname" . 
}}-pvc-auto-deletion-script + mountPath: /scripts + image: bitnami/kubectl:latest + imagePullPolicy: IfNotPresent + command: ["/scripts/pvc-deletion.sh"] + restartPolicy: OnFailure +{{- end }} diff --git a/helm/blueapi/values.schema.json b/helm/blueapi/values.schema.json index 3159f6713e..654e1178d4 100644 --- a/helm/blueapi/values.schema.json +++ b/helm/blueapi/values.schema.json @@ -174,6 +174,14 @@ "podSecurityContext": { "type": "object" }, + "pvcAutoDeletion": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + }, "readinessProbe": { "description": "Readiness probe, if configured kubernetes will not route traffic to this pod if failed consecutively. This could allow the service time to recover if it is being overwhelmed by traffic, but without the to ability to load balance or scale up/outwards, upstream services will need to know to back off. This is automatically disabled when in debug mode.", "type": "object", @@ -198,7 +206,6 @@ } }, "resources": { - "description": "Sets the compute resources available to the pod. These defaults are appropriate when using debug mode or an internal PVC and therefore running VS Code server in the pod. In the Diamond cluster, requests must be \u003e= 0.1*limits When not using either of the above, the limits may be lowered. When idle but connected, blueapi consumes ~400MB of memory and 1% cpu and may struggle when allocated less.", "type": "object", "properties": { "limits": { @@ -292,6 +299,14 @@ } } }, + "timeStampCron": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + }, "tolerations": { "description": "May be required to run on specific nodes (e.g. the control machine)", "type": "array" @@ -389,7 +404,6 @@ } }, "logging": { - "description": "Configures logging. 
Port 12231 is the `dodal` input on graylog which will be renamed `blueapi`", "type": "object", "properties": { "graylog": { diff --git a/helm/blueapi/values.yaml b/helm/blueapi/values.yaml index 876b37a989..c7e6e2fa1e 100644 --- a/helm/blueapi/values.yaml +++ b/helm/blueapi/values.yaml @@ -36,8 +36,7 @@ podAnnotations: {} # For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ podLabels: {} -podSecurityContext: {} - # fsGroup: 2000 +podSecurityContext: {} # fsGroup: 2000 securityContext: # https://github.com/DiamondLightSource/blueapi/issues/1096 @@ -48,7 +47,7 @@ securityContext: # drop: # - ALL -# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ + # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ service: # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types # -- To make blueapi available on an IP outside of the cluster prior to an Ingress being created, change this to LoadBalancer @@ -76,13 +75,13 @@ ingress: # hosts: # - chart-example.local -# -- Sets the compute resources available to the pod. -# These defaults are appropriate when using debug mode or an internal PVC and therefore -# running VS Code server in the pod. -# In the Diamond cluster, requests must be >= 0.1*limits -# When not using either of the above, the limits may be lowered. -# When idle but connected, blueapi consumes ~400MB of memory and 1% cpu -# and may struggle when allocated less. + # -- Sets the compute resources available to the pod. + # These defaults are appropriate when using debug mode or an internal PVC and therefore + # running VS Code server in the pod. 
+ # In the Diamond cluster, requests must be >= 0.1*limits + # When not using either of the above, the limits may be lowered. + # When idle but connected, blueapi consumes ~400MB of memory and 1% cpu + # and may struggle when allocated less. resources: # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases chances charts run on environments with little @@ -205,7 +204,7 @@ worker: repositories: [] # - name: "dodal" # remote_url: https://github.com/DiamondLightSource/dodal.git - # -- Configures logging. Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` + # -- Configures logging. Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` logging: level: "INFO" graylog: @@ -224,6 +223,12 @@ initContainer: # -- Size of persistent volume size: "1Gi" +timeStampCron: + enabled: true + +pvcAutoDeletion: + enabled: true + debug: # -- If enabled, runs debugpy, allowing port-forwarding to expose port 5678 or attached vscode instance enabled: false From db3dc7a76f8ca9e0af70e4321b049094e55d9a0f Mon Sep 17 00:00:00 2001 From: alexj9837 <52531949+Alexj9837@users.noreply.github.com> Date: Wed, 29 Apr 2026 16:03:43 +0000 Subject: [PATCH 2/2] reduced the history of successful jobs, updated deletion script --- helm/blueapi/files/scripts/pvc-deletion.sh | 13 +++++-------- helm/blueapi/templates/cronjob.yaml | 6 +++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/helm/blueapi/files/scripts/pvc-deletion.sh b/helm/blueapi/files/scripts/pvc-deletion.sh index d912a1542d..dfcfb4d22a 100644 --- a/helm/blueapi/files/scripts/pvc-deletion.sh +++ b/helm/blueapi/files/scripts/pvc-deletion.sh @@ -6,24 +6,21 @@ NOW=$(date +%s) #loop through all pvcs. 
for pvc in $BLUEAPI_PVCS; do #check if pvc has last-used annotation - if kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}' - then #get last used annotation - LAST_USED=$(kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}') + LAST_USED=$(kubectl get pvc "$pvc" -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}') #checking if its not null if [ -n "$LAST_USED" ]; then #check if last_used is older than 3 months if [ $(($NOW - LAST_USED)) -gt 7884000 ]; then #checking if the pvc is protected, if it is protected skip deletion - if [ "$(kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.protected}')" = "true" ]; then - echo "PVC $pvc is protected, skipping deletion" + if [ "$(kubectl get pvc "$pvc" -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.protected}')" = "true" ]; then + echo " PVC $pvc is protected, skipping deletion" continue fi #PVC has not been used for more than three months, delete it kubectl delete pvc "$pvc" -n $RELEASE_NAMESPACE fi - fi - else - echo "PVC $pvc does not have last-used annotation, skipping deletion" + else + echo " $pvc has no last-used annotation" fi done diff --git a/helm/blueapi/templates/cronjob.yaml b/helm/blueapi/templates/cronjob.yaml index 3dea62180b..630b897e31 100644 --- a/helm/blueapi/templates/cronjob.yaml +++ b/helm/blueapi/templates/cronjob.yaml @@ -37,7 +37,7 @@ metadata: namespace: {{ .Release.Namespace }} spec: concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 3 + successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 1 schedule: "*/5 * * * *" @@ -122,13 +122,13 @@ metadata: namespace: {{ .Release.Namespace }} spec: concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 3 + successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 1 schedule: "@weekly" jobTemplate: spec: - # amount of attempts of labeling a pvc + # amount of attempts for pvc deletion backoffLimit: 3 # job stops after 300 
seconds activeDeadlineSeconds: 300