diff --git a/Makefile b/Makefile index 93581a28..f3454f53 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ LOCAL_NODE_BIN := $(LOCAL_NODE_DIR)/bin LOCAL_NODE := $(LOCAL_NODE_BIN)/node LOCAL_NPM := $(LOCAL_NODE_BIN)/npm -.PHONY: proto build test test-nested-modules tidy lint generate build-sdk docker-build docker-build-e2e-client docker-build-etcd-tools docker-clean ensure-minio start-minio stop-containers release-broker-ports test-produce-consume test-produce-consume-debug test-consumer-group test-ops-api test-mcp test-multi-segment-durability test-full test-operator test-acl demo demo-platform demo-platform-bootstrap iceberg-demo kafsql-demo platform-demo help clean-kind-all ensure-local-node check vet race fmt fmt-check test-fuzz code-ql code-ql-summary code-ql-gate commit-check +.PHONY: proto build test test-nested-modules tidy lint generate build-sdk docker-build docker-build-e2e-client docker-build-etcd-tools docker-clean ensure-minio start-minio stop-containers release-broker-ports test-produce-consume test-produce-consume-debug test-consumer-group test-ops-api test-mcp test-multi-segment-durability test-full test-operator test-acl demo demo-platform demo-platform-bootstrap iceberg-demo kafsql-demo platform-demo help clean-kind-all ensure-local-node check vet race fmt fmt-check test-fuzz test-chart-antiaffinity code-ql code-ql-summary code-ql-gate commit-check REGISTRY ?= ghcr.io/kafscale STAMP_DIR ?= .build @@ -205,6 +205,9 @@ fmt-check: ## Check formatting (fails if unformatted) test-fuzz: ## Run Go fuzz test(s) bash scripts/test_fuzz.sh +test-chart-antiaffinity: ## Run the proxy default podAntiAffinity chart template test (helm only, no cluster) + bash test/chart/proxy-antiaffinity_test.sh + code-ql: ensure-local-node ## Run local CodeQL and emit SARIF under .tmp/codeql/ bash scripts/codeql_local.sh diff --git a/deploy/helm/kafscale/Chart.yaml b/deploy/helm/kafscale/Chart.yaml index ded5f027..070963a9 100644 --- a/deploy/helm/kafscale/Chart.yaml +++ b/deploy/helm/kafscale/Chart.yaml @@ -19,5 +19,5 @@ description: Helm chart for the Kafscale operator and console home: https://github.com/KafScale/platform icon: https://raw.githubusercontent.com/KafScale/platform/main/docs/assets/icon.png type: application -version: 0.4.0 +version: 0.4.1 appVersion: "v1.5.0" diff --git a/deploy/helm/kafscale/templates/proxy-deployment.yaml b/deploy/helm/kafscale/templates/proxy-deployment.yaml index 7f984633..4e87eea1 100644 --- a/deploy/helm/kafscale/templates/proxy-deployment.yaml +++ b/deploy/helm/kafscale/templates/proxy-deployment.yaml @@ -115,8 +115,24 @@ spec: tolerations: {{ toYaml . | indent 8 }} {{- end }} -{{- with .Values.proxy.affinity }} +{{- if .Values.proxy.affinity }} affinity: -{{ toYaml . | indent 8 }} +{{ toYaml .Values.proxy.affinity | indent 8 }} +{{- else }} + # Default soft anti-affinity. The label selector is templated from the + # proxy's own pod labels (the same componentSelectorLabels helper used + # for the pod template above), so it always matches the pods it spreads, + # even when nameOverride/fullnameOverride changes the rendered name. Soft + # (preferredDuring) so single-node clusters still schedule every replica. + # Replaced by proxy.affinity when that value is set. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: +{{ include "kafscale.componentSelectorLabels" (dict "root" . "component" "proxy") | indent 20 }} + topologyKey: kubernetes.io/hostname {{- end }} {{- end }} diff --git a/deploy/helm/kafscale/values.yaml b/deploy/helm/kafscale/values.yaml index a5ae4f9e..d4ab8daf 100644 --- a/deploy/helm/kafscale/values.yaml +++ b/deploy/helm/kafscale/values.yaml @@ -145,6 +145,19 @@ proxy: resources: {} nodeSelector: {} tolerations: [] + # Pod affinity for the proxy Deployment. + # + # When left empty, proxy-deployment.yaml renders a default soft + # (preferredDuring) podAntiAffinity that prefers to place the proxy + # replicas on different nodes, with a label selector templated from the + # proxy's own pod labels so it always matches the pods it spreads, even + # under nameOverride/fullnameOverride. Soft so single-node KIND clusters + # still schedule every replica; flip to requiredDuring in multi-node + # production by setting an explicit affinity here. + # + # Backward-compat note: setting this key REPLACES the chart default + # rather than merging with it. If you want anti-affinity plus extra + # affinity rules, include the anti-affinity block in your override. affinity: {} service: type: LoadBalancer diff --git a/pkg/operator/antiaffinity_test.go b/pkg/operator/antiaffinity_test.go new file mode 100644 index 00000000..159cf001 --- /dev/null +++ b/pkg/operator/antiaffinity_test.go @@ -0,0 +1,98 @@ +// Copyright 2025 Alexander Alten (novatechflow), NovaTechflow (novatechflow.com). +// This project is supported and financed by Scalytics, Inc. (www.scalytics.io). +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operator + +import ( + "context" + "reflect" + "testing" + + appsv1 "k8s.io/api/apps/v1" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// assertSpreadsOwnPods asserts that the StatefulSet carries a single soft +// (preferred) podAntiAffinity term whose label selector EQUALS the pod-template +// labels and whose topology key is the node hostname. That equality is the +// property that makes the rule work: the selector matches the very pods the +// term is meant to spread. A mismatch (the proxy selector-drift class of bug) +// would leave the rule matching nothing and HA silently disabled. +func assertSpreadsOwnPods(t *testing.T, sts *appsv1.StatefulSet) { + t.Helper() + + aff := sts.Spec.Template.Spec.Affinity + if aff == nil || aff.PodAntiAffinity == nil { + t.Fatalf("expected podAntiAffinity on %s, got %+v", sts.Name, aff) + } + terms := aff.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution + if len(terms) != 1 { + t.Fatalf("expected exactly one preferred term on %s, got %d", sts.Name, len(terms)) + } + if aff.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil { + t.Fatalf("expected soft-only anti-affinity on %s, found a required term", sts.Name) + } + + term := terms[0].PodAffinityTerm + if term.LabelSelector == nil { + t.Fatalf("expected a label selector on %s", sts.Name) + } + if term.TopologyKey != "kubernetes.io/hostname" { + t.Fatalf("expected topologyKey kubernetes.io/hostname on %s, got %q", sts.Name, term.TopologyKey) + } + + podLabels := sts.Spec.Template.Labels + if len(podLabels) == 0 { + t.Fatalf("expected pod-template labels on %s", sts.Name) + } + if !reflect.DeepEqual(term.LabelSelector.MatchLabels, podLabels) { + t.Fatalf("anti-affinity selector on %s does not match its own pod labels:\n selector=%v\n podLabels=%v", + sts.Name, term.LabelSelector.MatchLabels, podLabels) + } +} + +func TestBrokerStatefulSetAntiAffinitySpreadsOwnPods(t *testing.T) { + cluster := testCluster("affinity-broker", nil) + scheme := testScheme(t) + c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cluster).Build() + r := &ClusterReconciler{Client: c, Scheme: scheme} + + if err := r.reconcileBrokerDeployment(context.Background(), cluster, []string{"http://etcd:2379"}); err != nil { + t.Fatalf("reconcileBrokerDeployment: %v", err) + } + + sts := &appsv1.StatefulSet{} + assertFound(t, c, sts, cluster.Namespace, cluster.Name+"-broker") + assertSpreadsOwnPods(t, sts) +} + +func TestEtcdStatefulSetAntiAffinitySpreadsOwnPods(t *testing.T) { + t.Setenv(operatorEtcdEndpointsEnv, "") + cluster := testCluster("affinity-etcd", nil) + scheme := testScheme(t) + c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cluster).Build() + + res, err := EnsureEtcd(context.Background(), c, scheme, cluster) + if err != nil { + t.Fatalf("EnsureEtcd: %v", err) + } + if !res.Managed { + t.Fatalf("expected managed etcd so a StatefulSet is created") + } + + sts := &appsv1.StatefulSet{} + assertFound(t, c, sts, cluster.Namespace, cluster.Name+"-etcd") + assertSpreadsOwnPods(t, sts) +} diff --git a/pkg/operator/cluster_controller.go b/pkg/operator/cluster_controller.go index 03688586..eb4e388e 100644 --- a/pkg/operator/cluster_controller.go +++ b/pkg/operator/cluster_controller.go @@ -151,6 +151,7 @@ func (r *ClusterReconciler) reconcileBrokerDeployment(ctx context.Context, clust sts.Spec.Selector = &metav1.LabelSelector{MatchLabels: labels} sts.Spec.Replicas = &replicas sts.Spec.Template.Labels = labels + sts.Spec.Template.Spec.Affinity = softPodAntiAffinity(labels) sts.Spec.Template.Spec.Containers = []corev1.Container{ r.brokerContainer(cluster, endpoints), } @@ -159,6 +160,28 @@ func (r *ClusterReconciler) reconcileBrokerDeployment(ctx context.Context, clust return err } +// softPodAntiAffinity returns a preferred (soft) pod anti-affinity that spreads +// replicas across nodes by hostname, so a single-node loss does not take the +// whole quorum (broker or etcd) at once. The selector reuses the pod-template +// label map, so it always matches the very pods it is meant to spread. Soft +// (not required) so a single-node cluster still schedules every replica instead +// of leaving them Pending; on a multi-node cluster the scheduler spreads them. +func softPodAntiAffinity(labels map[string]string) *corev1.Affinity { + return &corev1.Affinity{ + PodAntiAffinity: &corev1.PodAntiAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{ + { + Weight: 100, + PodAffinityTerm: corev1.PodAffinityTerm{ + LabelSelector: &metav1.LabelSelector{MatchLabels: labels}, + TopologyKey: "kubernetes.io/hostname", + }, + }, + }, + }, + } +} + func (r *ClusterReconciler) deleteLegacyBrokerDeployment(ctx context.Context, cluster *kafscalev1alpha1.KafscaleCluster) error { deploy := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ diff --git a/pkg/operator/etcd_resources.go b/pkg/operator/etcd_resources.go index 80714437..56d21d2d 100644 --- a/pkg/operator/etcd_resources.go +++ b/pkg/operator/etcd_resources.go @@ -183,6 +183,7 @@ func reconcileEtcdStatefulSet(ctx context.Context, c client.Client, scheme *runt sts.Spec.Replicas = &replicas sts.Spec.Selector = &metav1.LabelSelector{MatchLabels: labels} sts.Spec.Template.Labels = labels + sts.Spec.Template.Spec.Affinity = softPodAntiAffinity(labels) useMemory := parseBoolEnv(operatorEtcdStorageMemoryEnv) if useMemory { diff --git a/test/chart/proxy-antiaffinity_test.sh b/test/chart/proxy-antiaffinity_test.sh new file mode 100755 index 00000000..ec3b4a32 --- /dev/null +++ b/test/chart/proxy-antiaffinity_test.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +# Copyright 2026 KafScale team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Chart template test for the proxy Deployment default podAntiAffinity. +# +# The default anti-affinity rule only spreads replicas if its label selector +# matches the proxy's own pod labels. A selector hardcoded to "kafscale-proxy" +# silently matches nothing under nameOverride (the pod label becomes +# "-proxy"), disabling HA with no error. This test asserts that the +# rendered anti-affinity selector EQUALS the rendered pod-template labels, in +# the default case AND under --set nameOverride=foo (the case that would have +# caught the drift). It also asserts that an explicit proxy.affinity replaces +# the default. Self-contained: needs only helm and awk, no helm plugins. Run +# directly or via `make test-chart-antiaffinity`. + +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +CHART_DIR="${ROOT_DIR}/deploy/helm/kafscale" + +command -v helm >/dev/null 2>&1 || { echo "helm is required"; exit 1; } + +fail=0 + +render_proxy() { + helm template kafscale "${CHART_DIR}" \ + --show-only templates/proxy-deployment.yaml \ + --set proxy.enabled=true \ + "$@" +} + +# Extract the three app.kubernetes.io label values (name/instance/component) +# from a given YAML block. We scope to a section by streaming the lines that +# belong to it. The values are emitted as "name=..,instance=..,component=.." +# so two blocks can be compared as a single string. +labels_triplet() { + awk ' + /app\.kubernetes\.io\/name:/ { name=$2 } + /app\.kubernetes\.io\/instance:/ { inst=$2 } + /app\.kubernetes\.io\/component:/ { comp=$2 } + END { printf "name=%s,instance=%s,component=%s", name, inst, comp } + ' +} + +# The pod-template label block: lines after " template:" up to (not including) +# the " spec:" line. Within that we read the matchLabels-free label map. +pod_template_labels() { + render_proxy "$@" | awk ' + /^ template:/ { intpl=1; next } + intpl && /^ spec:/ { intpl=0 } + intpl { print } + ' | labels_triplet +} + +# The anti-affinity selector label block: lines after the podAntiAffinity +# "matchLabels:" up to the "topologyKey:" line. +antiaffinity_selector_labels() { + render_proxy "$@" | awk ' + /matchLabels:/ { insel=1; next } + insel && /topologyKey:/ { insel=0 } + insel { print } + ' | labels_triplet +} + +# Whether the render contains a soft (preferredDuring) podAntiAffinity at all. +has_soft_antiaffinity() { + render_proxy "$@" | grep -q "preferredDuringSchedulingIgnoredDuringExecution" +} + +assert_eq() { + local desc="$1" want="$2" got="$3" + if [ "${got}" = "${want}" ]; then + echo "PASS: ${desc}" + else + echo "FAIL: ${desc}" + echo " want: ${want}" + echo " got: ${got}" + fail=1 + fi +} + +assert_true() { + local desc="$1"; shift + if "$@"; then + echo "PASS: ${desc}" + else + echo "FAIL: ${desc}" + fail=1 + fi +} + +assert_false() { + local desc="$1"; shift + if "$@"; then + echo "FAIL: ${desc}" + fail=1 + else + echo "PASS: ${desc}" + fi +} + +echo "==> chart proxy anti-affinity template test" + +# Case 1: default install. Selector must equal the pod labels (kafscale-proxy). +pod="$(pod_template_labels)" +sel="$(antiaffinity_selector_labels)" +assert_eq "default: anti-affinity selector equals pod labels" "${pod}" "${sel}" +assert_eq "default: selector targets the proxy pods" "name=kafscale-proxy,instance=kafscale,component=proxy" "${sel}" + +# Case 2: nameOverride=foo. The pod label name becomes foo-proxy; the selector +# must move with it. The old hardcoded "kafscale-proxy" selector would fail here. +pod_ov="$(pod_template_labels --set nameOverride=foo)" +sel_ov="$(antiaffinity_selector_labels --set nameOverride=foo)" +assert_eq "nameOverride=foo: anti-affinity selector equals pod labels" "${pod_ov}" "${sel_ov}" +assert_eq "nameOverride=foo: selector tracks the overridden name" "name=foo-proxy,instance=kafscale,component=proxy" "${sel_ov}" + +# Case 3: soft-only. The default must be preferredDuring (never requiredDuring), +# so single-node clusters still schedule every replica. +assert_true "default anti-affinity is soft (preferredDuring present)" has_soft_antiaffinity +assert_false "default anti-affinity is not hard (requiredDuring absent)" \ + bash -c 'helm template kafscale "'"${CHART_DIR}"'" --show-only templates/proxy-deployment.yaml --set proxy.enabled=true | grep -q requiredDuringSchedulingIgnoredDuringExecution' + +# Case 4: explicit proxy.affinity replaces the default (no podAntiAffinity). +assert_false "explicit proxy.affinity replaces the default soft anti-affinity" \ + has_soft_antiaffinity --set 'proxy.affinity.nodeAffinity.foo=bar' + +if [ "${fail}" -ne 0 ]; then + echo "==> chart proxy anti-affinity template test FAILED" + exit 1 +fi +echo "==> chart proxy anti-affinity template test passed"