From 5a50eacc1d25781aced1f4c9508093efb0e2441c Mon Sep 17 00:00:00 2001 From: Gerard Toonstra Date: Wed, 13 May 2026 20:39:11 +0200 Subject: [PATCH] [ENG-3846]: Add liveness probe for temporal workers Wires up the /livez HTTP endpoint (datafold/datafold#12364) into the worker-temporal Helm chart: exposes port 8091, sets TEMPORAL_LIVENESS_PORT, and configures a K8s livenessProbe that restarts the pod when any worker stops running, a fatal error occurs, or the process pool breaks. Co-Authored-By: Claude Sonnet 4.6 --- charts/datafold-manager/Chart.yaml | 2 +- charts/datafold-manager/values.yaml | 2 +- charts/datafold/Chart.yaml | 2 +- .../worker-temporal/templates/deployment.yaml | 23 ++++++++++++++++++- .../charts/worker-temporal/values.yaml | 11 +++++++++ 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/charts/datafold-manager/Chart.yaml b/charts/datafold-manager/Chart.yaml index ef9f08b..fc6b653 100644 --- a/charts/datafold-manager/Chart.yaml +++ b/charts/datafold-manager/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: datafold-manager description: Helm chart for Datafold Operator type: application -version: 0.1.103 +version: 0.1.104 appVersion: "1.0.0" icon: https://www.datafold.com/logo.png diff --git a/charts/datafold-manager/values.yaml b/charts/datafold-manager/values.yaml index a379909..661562d 100644 --- a/charts/datafold-manager/values.yaml +++ b/charts/datafold-manager/values.yaml @@ -18,7 +18,7 @@ operator: # Operator image configuration image: repository: us-docker.pkg.dev/datadiff-mm/datafold/datafold-operator - tag: "1.1.71" + tag: "1.1.72" pullPolicy: Always # Operator deployment configuration diff --git a/charts/datafold/Chart.yaml b/charts/datafold/Chart.yaml index 40abd49..b7d3348 100644 --- a/charts/datafold/Chart.yaml +++ b/charts/datafold/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: datafold description: Helm chart package to deploy Datafold on kubernetes. 
type: application -version: 0.10.87 +version: 0.10.88 appVersion: "1.0.0" icon: https://www.datafold.com/logo.png diff --git a/charts/datafold/charts/worker-temporal/templates/deployment.yaml b/charts/datafold/charts/worker-temporal/templates/deployment.yaml index f656847..bbb5cc1 100644 --- a/charts/datafold/charts/worker-temporal/templates/deployment.yaml +++ b/charts/datafold/charts/worker-temporal/templates/deployment.yaml @@ -103,14 +103,35 @@ spec: - name: TEMPORAL_METRICS_BIND_ADDRESS value: "0.0.0.0:{{ .Values.metrics.port }}" {{- end }} + {{- if .Values.liveness.enabled }} + - name: TEMPORAL_LIVENESS_PORT + value: {{ .Values.liveness.port | quote }} + {{- end }} {{- with .Values.extraEnv }} {{- toYaml . | nindent 12 }} {{- end }} - {{- if .Values.metrics.enabled }} + {{- if or .Values.metrics.enabled .Values.liveness.enabled }} ports: + {{- if .Values.metrics.enabled }} - name: metrics containerPort: {{ .Values.metrics.port }} protocol: TCP + {{- end }} + {{- if .Values.liveness.enabled }} + - name: liveness + containerPort: {{ .Values.liveness.port }} + protocol: TCP + {{- end }} + {{- end }} + {{- if .Values.liveness.enabled }} + livenessProbe: + httpGet: + path: /livez + port: {{ .Values.liveness.port }} + initialDelaySeconds: {{ .Values.liveness.initialDelaySeconds }} + periodSeconds: {{ .Values.liveness.periodSeconds }} + failureThreshold: {{ .Values.liveness.failureThreshold }} + timeoutSeconds: {{ .Values.liveness.timeoutSeconds }} {{- end }} resources: {{- toYaml .Values.resources | nindent 12 }} diff --git a/charts/datafold/charts/worker-temporal/values.yaml b/charts/datafold/charts/worker-temporal/values.yaml index 863fd4b..36cb00f 100644 --- a/charts/datafold/charts/worker-temporal/values.yaml +++ b/charts/datafold/charts/worker-temporal/values.yaml @@ -76,6 +76,17 @@ metrics: enabled: true port: 9090 +liveness: + # Starts a lightweight HTTP server inside the worker and wires up a K8s + # livenessProbe against /livez. 
The endpoint returns 503 when any Temporal worker + # stops running, a fatal error is raised, or the process pool becomes broken. + enabled: true + port: 8091 + initialDelaySeconds: 60 + periodSeconds: 20 + failureThreshold: 3 + timeoutSeconds: 5 + extraEnv: [] volumes: []