From 0ccac81ab55bb286ac5b0422ddff9fbc9255e06b Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:51:39 +0200 Subject: [PATCH 1/4] tests: spark connect/kerberos/iceberg integration test --- .../spark-connect-kerberos/00-assert.yaml | 9 + .../00-patch-ns.yaml.j2 | 9 + .../00-serviceaccount.yaml.j2 | 29 +++ .../spark-connect-kerberos/01-assert.yaml.j2 | 10 + ...tor-aggregator-discovery-configmap.yaml.j2 | 9 + .../spark-connect-kerberos/02-assert.yaml | 12 ++ .../02-install-krb5-kdc.yaml.j2 | 142 +++++++++++++ .../03-create-kerberos-secretclass.yaml.j2 | 42 ++++ .../03-minio-tls-ca-secret.yaml | 16 ++ .../04-minio-secrets.yaml.j2 | 70 +++++++ .../spark-connect-kerberos/05-assert.yaml | 11 + .../spark-connect-kerberos/05-minio.yaml | 11 + .../spark-connect-kerberos/06-assert.yaml | 12 ++ .../06-install-hive-postgres.yaml | 12 ++ .../06_helm-bitnami-postgresql-values.yaml.j2 | 42 ++++ .../spark-connect-kerberos/07-assert.yaml | 12 ++ .../07-install-hive.yaml.j2 | 6 + .../spark-connect-kerberos/07_hive.yaml.j2 | 46 ++++ .../spark-connect-kerberos/10-assert.yaml | 6 + .../10-install-spark-connect.yaml.j2 | 7 + .../10_spark-connect.yaml.j2 | 172 +++++++++++++++ .../spark-connect-kerberos/11-assert.yaml | 18 ++ .../spark-connect-kerberos/12-assert.yaml | 197 ++++++++++++++++++ .../spark-connect-kerberos/20-assert.yaml | 11 + .../20-run-connect-client.yaml.j2 | 120 +++++++++++ .../spark-connect-kerberos/certs/generate.sh | 132 ++++++++++++ .../helm-bitnami-minio-values.yaml.j2 | 74 +++++++ tests/test-definition.yaml | 15 ++ 28 files changed, 1252 insertions(+) create mode 100644 tests/templates/kuttl/spark-connect-kerberos/00-assert.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/00-patch-ns.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/00-serviceaccount.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/01-assert.yaml.j2 create mode 100644 
tests/templates/kuttl/spark-connect-kerberos/01-install-vector-aggregator-discovery-configmap.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/02-assert.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/02-install-krb5-kdc.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/03-create-kerberos-secretclass.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/03-minio-tls-ca-secret.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/04-minio-secrets.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/05-assert.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/05-minio.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/06-assert.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/06-install-hive-postgres.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/06_helm-bitnami-postgresql-values.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/07-assert.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/07-install-hive.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/07_hive.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/10-assert.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/10-install-spark-connect.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/10_spark-connect.yaml.j2 create mode 100644 tests/templates/kuttl/spark-connect-kerberos/11-assert.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/12-assert.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/20-assert.yaml create mode 100644 tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 create mode 100755 tests/templates/kuttl/spark-connect-kerberos/certs/generate.sh create mode 100644 
tests/templates/kuttl/spark-connect-kerberos/helm-bitnami-minio-values.yaml.j2 diff --git a/tests/templates/kuttl/spark-connect-kerberos/00-assert.yaml b/tests/templates/kuttl/spark-connect-kerberos/00-assert.yaml new file mode 100644 index 00000000..5baf8caa --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/00-assert.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 900 +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: integration-tests-sa diff --git a/tests/templates/kuttl/spark-connect-kerberos/00-patch-ns.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/00-patch-ns.yaml.j2 new file mode 100644 index 00000000..67185acf --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/00-patch-ns.yaml.j2 @@ -0,0 +1,9 @@ +{% if test_scenario['values']['openshift'] == 'true' %} +# see https://github.com/stackabletech/issues/issues/566 +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: kubectl patch namespace $NAMESPACE -p '{"metadata":{"labels":{"pod-security.kubernetes.io/enforce":"privileged"}}}' + timeout: 120 +{% endif %} diff --git a/tests/templates/kuttl/spark-connect-kerberos/00-serviceaccount.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/00-serviceaccount.yaml.j2 new file mode 100644 index 00000000..9cbf0351 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/00-serviceaccount.yaml.j2 @@ -0,0 +1,29 @@ +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: use-integration-tests-scc +rules: +{% if test_scenario['values']['openshift'] == "true" %} + - apiGroups: ["security.openshift.io"] + resources: ["securitycontextconstraints"] + resourceNames: ["privileged"] + verbs: ["use"] +{% endif %} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: integration-tests-sa +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: use-integration-tests-scc +subjects: + - kind: ServiceAccount + name: 
integration-tests-sa +roleRef: + kind: Role + name: use-integration-tests-scc + apiGroup: rbac.authorization.k8s.io diff --git a/tests/templates/kuttl/spark-connect-kerberos/01-assert.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/01-assert.yaml.j2 new file mode 100644 index 00000000..50b1d4c3 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/01-assert.yaml.j2 @@ -0,0 +1,10 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +{% if lookup('env', 'VECTOR_AGGREGATOR') %} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-aggregator-discovery +{% endif %} diff --git a/tests/templates/kuttl/spark-connect-kerberos/01-install-vector-aggregator-discovery-configmap.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/01-install-vector-aggregator-discovery-configmap.yaml.j2 new file mode 100644 index 00000000..2d6a0df5 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/01-install-vector-aggregator-discovery-configmap.yaml.j2 @@ -0,0 +1,9 @@ +{% if lookup('env', 'VECTOR_AGGREGATOR') %} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vector-aggregator-discovery +data: + ADDRESS: {{ lookup('env', 'VECTOR_AGGREGATOR') }} +{% endif %} diff --git a/tests/templates/kuttl/spark-connect-kerberos/02-assert.yaml b/tests/templates/kuttl/spark-connect-kerberos/02-assert.yaml new file mode 100644 index 00000000..79f20ef0 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/02-assert.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 600 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: krb5-kdc +status: + readyReplicas: 1 + replicas: 1 diff --git a/tests/templates/kuttl/spark-connect-kerberos/02-install-krb5-kdc.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/02-install-krb5-kdc.yaml.j2 new file mode 100644 index 00000000..c92192e3 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/02-install-krb5-kdc.yaml.j2 @@ -0,0 +1,142 @@ +--- 
+apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: krb5-kdc +spec: + selector: + matchLabels: + app: krb5-kdc + template: + metadata: + labels: + app: krb5-kdc + spec: + serviceAccountName: integration-tests-sa + initContainers: + - name: init + image: oci.stackable.tech/sdp/krb5:{{ test_scenario['values']['krb5'] }}-stackable0.0.0-dev + args: + - sh + - -euo + - pipefail + - -c + - | + test -e /var/kerberos/krb5kdc/principal || kdb5_util create -s -P asdf + kadmin.local get_principal -terse root/admin || kadmin.local add_principal -pw asdf root/admin + # stackable-secret-operator principal must match the keytab specified in the SecretClass + kadmin.local get_principal -terse stackable-secret-operator || kadmin.local add_principal -e aes256-cts-hmac-sha384-192:normal -pw asdf stackable-secret-operator + env: + - name: KRB5_CONFIG + value: /stackable/config/krb5.conf + volumeMounts: + - mountPath: /stackable/config + name: config + - mountPath: /var/kerberos/krb5kdc + name: data + containers: + - name: kdc + image: oci.stackable.tech/sdp/krb5:{{ test_scenario['values']['krb5'] }}-stackable0.0.0-dev + args: + - krb5kdc + - -n + env: + - name: KRB5_CONFIG + value: /stackable/config/krb5.conf + volumeMounts: + - mountPath: /stackable/config + name: config + - mountPath: /var/kerberos/krb5kdc + name: data +# Root permissions required on Openshift to access internal ports +{% if test_scenario['values']['openshift'] == "true" %} + securityContext: + runAsUser: 0 +{% endif %} + - name: kadmind + image: oci.stackable.tech/sdp/krb5:{{ test_scenario['values']['krb5'] }}-stackable0.0.0-dev + args: + - kadmind + - -nofork + env: + - name: KRB5_CONFIG + value: /stackable/config/krb5.conf + volumeMounts: + - mountPath: /stackable/config + name: config + - mountPath: /var/kerberos/krb5kdc + name: data +# Root permissions required on Openshift to access internal ports +{% if test_scenario['values']['openshift'] == "true" %} + securityContext: + runAsUser: 0 +{% endif %} + - 
name: client + image: oci.stackable.tech/sdp/krb5:{{ test_scenario['values']['krb5'] }}-stackable0.0.0-dev + tty: true + stdin: true + env: + - name: KRB5_CONFIG + value: /stackable/config/krb5.conf + volumeMounts: + - mountPath: /stackable/config + name: config + volumes: + - name: config + configMap: + name: krb5-kdc + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: krb5-kdc +spec: + selector: + app: krb5-kdc + ports: + - name: kadmin + port: 749 + - name: kdc + port: 88 + - name: kdc-udp + port: 88 + protocol: UDP +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: krb5-kdc +data: + krb5.conf: | + [logging] + default = STDERR + kdc = STDERR + admin_server = STDERR + [libdefaults] + dns_lookup_realm = false + ticket_lifetime = 24h + renew_lifetime = 7d + forwardable = true + rdns = false + default_realm = {{ test_scenario['values']['kerberos-realm'] }} + spake_preauth_groups = edwards25519 + [realms] + {{ test_scenario['values']['kerberos-realm'] }} = { + acl_file = /stackable/config/kadm5.acl + disable_encrypted_timestamp = false + } + [domain_realm] + .cluster.local = {{ test_scenario['values']['kerberos-realm'] }} + cluster.local = {{ test_scenario['values']['kerberos-realm'] }} + kadm5.acl: | + root/admin *e + stackable-secret-operator *e diff --git a/tests/templates/kuttl/spark-connect-kerberos/03-create-kerberos-secretclass.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/03-create-kerberos-secretclass.yaml.j2 new file mode 100644 index 00000000..acb67e23 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/03-create-kerberos-secretclass.yaml.j2 @@ -0,0 +1,42 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + # We need to replace $NAMESPACE (by KUTTL) + - script: | + kubectl apply -n "$NAMESPACE" -f - <- + helm install minio + --namespace $NAMESPACE + --version 14.6.16 + -f 
helm-bitnami-minio-values.yaml + --repo https://charts.bitnami.com/bitnami minio + timeout: 240 diff --git a/tests/templates/kuttl/spark-connect-kerberos/06-assert.yaml b/tests/templates/kuttl/spark-connect-kerberos/06-assert.yaml new file mode 100644 index 00000000..9772135e --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/06-assert.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 600 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgresql-hive +status: + readyReplicas: 1 + replicas: 1 diff --git a/tests/templates/kuttl/spark-connect-kerberos/06-install-hive-postgres.yaml b/tests/templates/kuttl/spark-connect-kerberos/06-install-hive-postgres.yaml new file mode 100644 index 00000000..3c701e92 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/06-install-hive-postgres.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +timeout: 300 +commands: + - script: >- + helm upgrade postgresql-hive + --install + --version=12.5.6 + --namespace $NAMESPACE + -f 06_helm-bitnami-postgresql-values.yaml + --repo https://charts.bitnami.com/bitnami postgresql diff --git a/tests/templates/kuttl/spark-connect-kerberos/06_helm-bitnami-postgresql-values.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/06_helm-bitnami-postgresql-values.yaml.j2 new file mode 100644 index 00000000..40dfcd33 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/06_helm-bitnami-postgresql-values.yaml.j2 @@ -0,0 +1,42 @@ +--- +global: + security: + allowInsecureImages: true # needed starting with Chart version 16.3.0 if modifying images + +image: + repository: bitnamilegacy/postgresql + +volumePermissions: + enabled: false + image: + repository: bitnamilegacy/os-shell + securityContext: + runAsUser: auto + +metrics: + image: + repository: bitnamilegacy/postgres-exporter + +primary: + extendedConfiguration: | + password_encryption=md5 + podSecurityContext: +{% if 
test_scenario['values']['openshift'] == 'true' %} + enabled: false +{% else %} + enabled: true +{% endif %} + containerSecurityContext: + enabled: false + resources: + requests: + memory: "512Mi" + cpu: "512m" + limits: + memory: "512Mi" + cpu: "1" + +auth: + username: hive + password: hive + database: hive diff --git a/tests/templates/kuttl/spark-connect-kerberos/07-assert.yaml b/tests/templates/kuttl/spark-connect-kerberos/07-assert.yaml new file mode 100644 index 00000000..50c27fd9 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/07-assert.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 900 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: hive-metastore-default +status: + readyReplicas: 1 + replicas: 1 diff --git a/tests/templates/kuttl/spark-connect-kerberos/07-install-hive.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/07-install-hive.yaml.j2 new file mode 100644 index 00000000..c4bba773 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/07-install-hive.yaml.j2 @@ -0,0 +1,6 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + # We need to replace $NAMESPACE (by KUTTL) + - script: envsubst '$NAMESPACE' < 07_hive.yaml | kubectl apply -n $NAMESPACE -f - diff --git a/tests/templates/kuttl/spark-connect-kerberos/07_hive.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/07_hive.yaml.j2 new file mode 100644 index 00000000..b15cd765 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/07_hive.yaml.j2 @@ -0,0 +1,46 @@ +--- +apiVersion: hive.stackable.tech/v1alpha1 +kind: HiveCluster +metadata: + name: hive +spec: + image: +{% if test_scenario['values']['hive-iceberg'].find(",") > 0 %} + custom: "{{ test_scenario['values']['hive-iceberg'].split(',')[1] }}" + productVersion: "{{ test_scenario['values']['hive-iceberg'].split(',')[0] }}" +{% else %} + productVersion: "{{ test_scenario['values']['hive-iceberg'] }}" +{% endif %} + pullPolicy: 
IfNotPresent + clusterConfig: + metadataDatabase: + postgresql: + host: postgresql-hive + database: hive + credentialsSecretName: postgres-credentials + s3: + reference: minio + # Kerberos-enable the metastore. Clients (the Spark Connect server) must + # authenticate via SASL/GSSAPI. No HDFS is required: the warehouse is on S3. + authentication: + kerberos: + secretClass: kerberos-$NAMESPACE +{% if lookup('env', 'VECTOR_AGGREGATOR') %} + vectorAggregatorConfigMapName: vector-aggregator-discovery +{% endif %} + metastore: + config: + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + roleGroups: + default: + replicas: 1 +--- +apiVersion: v1 +kind: Secret +metadata: + name: postgres-credentials +type: Opaque +stringData: + username: hive + password: hive diff --git a/tests/templates/kuttl/spark-connect-kerberos/10-assert.yaml b/tests/templates/kuttl/spark-connect-kerberos/10-assert.yaml new file mode 100644 index 00000000..41ae37b6 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/10-assert.yaml @@ -0,0 +1,6 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 300 +commands: + - script: kubectl -n $NAMESPACE wait --for=condition=Available=true sparkconnectservers.spark.stackable.tech/spark-connect --timeout=301s diff --git a/tests/templates/kuttl/spark-connect-kerberos/10-install-spark-connect.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/10-install-spark-connect.yaml.j2 new file mode 100644 index 00000000..e1a24351 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/10-install-spark-connect.yaml.j2 @@ -0,0 +1,7 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + # We need to substitute $NAMESPACE (set by KUTTL) because it is part of the Kerberos + # principals and the kerberos SecretClass name. 
+ - script: envsubst '$NAMESPACE' < 10_spark-connect.yaml | kubectl apply -n $NAMESPACE -f - diff --git a/tests/templates/kuttl/spark-connect-kerberos/10_spark-connect.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/10_spark-connect.yaml.j2 new file mode 100644 index 00000000..f9e285b0 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/10_spark-connect.yaml.j2 @@ -0,0 +1,172 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: spark-connect-log-config +data: + log4j2.properties: |- + appenders = CONSOLE + + appender.CONSOLE.type = Console + appender.CONSOLE.name = CONSOLE + appender.CONSOLE.target = SYSTEM_ERR + appender.CONSOLE.layout.type = PatternLayout + appender.CONSOLE.layout.pattern = %d{ISO8601} %p [%t] %c - %m%n + appender.CONSOLE.filter.threshold.type = ThresholdFilter + appender.CONSOLE.filter.threshold.level = DEBUG + + rootLogger.level=INFO + rootLogger.appenderRefs = CONSOLE + rootLogger.appenderRef.CONSOLE.ref = CONSOLE +--- +apiVersion: spark.stackable.tech/v1alpha1 +kind: SparkConnectServer +metadata: + name: spark-connect +spec: + image: +{% if test_scenario['values']['spark-connect'].find(",") > 0 %} + custom: "{{ test_scenario['values']['spark-connect'].split(',')[1] }}" + productVersion: "{{ test_scenario['values']['spark-connect'].split(',')[0] }}" +{% else %} + productVersion: "{{ test_scenario['values']['spark-connect'] }}" +{% endif %} + pullPolicy: IfNotPresent +{% if lookup('env', 'VECTOR_AGGREGATOR') %} + vectorAggregatorConfigMapName: vector-aggregator-discovery +{% endif %} + args: + # Provision the Iceberg runtime. Scala 2.13 for Spark 4, 2.12 otherwise. 
+{% if test_scenario['values']['spark-connect'].startswith("4") %} + - "--packages org.apache.iceberg:iceberg-spark-runtime-{{ ".".join(test_scenario['values']['spark-connect'].split('.')[:2]) }}_2.13:{{ test_scenario['values']['iceberg-latest'] }}" +{% else %} + - "--packages org.apache.iceberg:iceberg-spark-runtime-{{ ".".join(test_scenario['values']['spark-connect'].split('.')[:2]) }}_2.12:{{ test_scenario['values']['iceberg-latest'] }}" +{% endif %} + connectors: + s3connection: + reference: minio + server: + roleConfig: + listenerClass: external-unstable + # The JVM does NOT read the KRB5_CONFIG env var (that is an MIT C-library + # variable). The Java Kerberos layer reads `-Djava.security.krb5.conf`, then + # /etc/krb5.conf. The secret-operator kerberos volume provides krb5.conf at + # /stackable/kerberos/krb5.conf, so point the JVM at it explicitly. + jvmArgumentOverrides: + add: + - -Djava.security.krb5.conf=/stackable/kerberos/krb5.conf + # By default JGSS only uses credentials from the current Subject + # (useSubjectCredsOnly=true). The Spark Connect execute thread does not + # carry the login user's ticket in its Subject, so GSS fails with "Failed + # to find any Kerberos tgt". Setting this to false makes JGSS fall back to + # the ambient credential cache (KRB5CCNAME) populated by the kinit init + # container. + - -Djavax.security.auth.useSubjectCredsOnly=false + # The Spark Connect server runs in `--deploy-mode client` on Kubernetes. + # Spark only performs the automatic keytab login (UserGroupInformation + # loginUserFromKeytab) for YARN/local/Mesos or the k8s *cluster*-mode driver + # - NOT for k8s client mode. So `spark.kerberos.{keytab,principal}` never + # establish a TGT here and the Hive metastore SASL/GSS handshake fails with + # "Failed to find any Kerberos tgt". + # + # We therefore obtain a TGT ourselves: an init container runs `kinit` into a + # shared credential cache, and the server JVM uses it via `KRB5CCNAME`. 
+ # Hadoop's UserGroupInformation reads `KRB5CCNAME` (useTicketCache) when + # `hadoop.security.authentication=kerberos`, so the metastore client uses + # that ticket. The kerberos keytab/krb5.conf are only needed on the server + # (the Spark driver); executors only write table data to S3 (not kerberized). + envOverrides: + KRB5CCNAME: /stackable/krb5/ccache + podOverrides: + spec: + initContainers: + - name: kinit +{% if test_scenario['values']['spark-connect'].find(",") > 0 %} + image: "{{ test_scenario['values']['spark-connect'].split(',')[1] }}" +{% else %} + image: oci.stackable.tech/sdp/spark-k8s:{{ test_scenario['values']['spark-connect'] }}-stackable0.0.0-dev +{% endif %} + command: ["/bin/bash", "-euo", "pipefail", "-c"] + args: + - kinit -kt /stackable/kerberos/keytab spark-connect/spark-connect.$NAMESPACE.svc.cluster.local@{{ test_scenario['values']['kerberos-realm'] }} + env: + - name: KRB5_CONFIG + value: /stackable/kerberos/krb5.conf + - name: KRB5CCNAME + value: /stackable/krb5/ccache + volumeMounts: + - name: kerberos + mountPath: /stackable/kerberos + - name: krb5-ccache + mountPath: /stackable/krb5 + containers: + - name: spark + volumeMounts: + - name: kerberos + mountPath: /stackable/kerberos + - name: krb5-ccache + mountPath: /stackable/krb5 + volumes: + - name: krb5-ccache + emptyDir: {} + - name: kerberos + ephemeral: + volumeClaimTemplate: + metadata: + annotations: + secrets.stackable.tech/class: kerberos-$NAMESPACE + secrets.stackable.tech/scope: service=spark-connect + secrets.stackable.tech/kerberos.service.names: spark-connect + spec: + storageClassName: secrets.stackable.tech + accessModes: + - ReadWriteOnce + resources: + requests: + storage: "1" + config: + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + containers: + spark: + custom: + configMap: spark-connect-log-config + configOverrides: + spark-defaults.conf: + spark.jars.ivy: /tmp/ivy2 + # Use Spark's built-in Hive catalog (bundled in the 
image). This is a + # static SQL config and must be set at server startup. + spark.sql.catalogImplementation: hive + spark.hadoop.hive.metastore.uris: thrift://hive-metastore:9083 + # Put Hadoop UserGroupInformation into Kerberos mode so it logs in from + # the ticket cache (KRB5CCNAME) populated by the kinit init container. + spark.hadoop.hadoop.security.authentication: kerberos + # Authenticate to the kerberized Hive metastore via SASL/GSSAPI. The HMS + # service principal must be the literal value below (not _HOST, which + # would resolve to the connection host "hive-metastore"). + spark.hadoop.hive.metastore.sasl.enabled: "true" + spark.hadoop.hive.metastore.kerberos.principal: hive/hive.$NAMESPACE.svc.cluster.local@{{ test_scenario['values']['kerberos-realm'] }} + # Hive/Iceberg catalog backed by the same kerberized metastore, storing + # table data and metadata in S3 (MinIO). Iceberg uses its own commit + # protocol (not the rename-based FileOutputCommitter), so writes to S3 + # work without the staging/_temporary issues of plain parquet tables. + # The catalog inherits the kerberos metastore config from spark.hadoop.* + # above. 
See https://github.com/stackabletech/spark-k8s-operator/issues/702 + spark.sql.extensions: org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + spark.sql.catalog.iceberg: org.apache.iceberg.spark.SparkCatalog + spark.sql.catalog.iceberg.type: hive + spark.sql.catalog.iceberg.uri: thrift://hive-metastore:9083 + spark.sql.catalog.iceberg.warehouse: s3a://lakehouse/warehouse + executor: + configOverrides: + spark-defaults.conf: + spark.executor.instances: "1" + spark.executor.memoryOverhead: "1m" + spark.sql.extensions: org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + config: + logging: + enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }} + containers: + spark: + custom: + configMap: spark-connect-log-config diff --git a/tests/templates/kuttl/spark-connect-kerberos/11-assert.yaml b/tests/templates/kuttl/spark-connect-kerberos/11-assert.yaml new file mode 100644 index 00000000..2a0f4043 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/11-assert.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 600 +commands: + # Test that spark connect executors are running. + # Sleep to prevent the following spark connect app from failing + # while the spark-connect server is busy setting up the executors. 
+ - script: | + # wait for the spark-connect CR to become available + kubectl wait --for=condition=Available sparkconnectservers/spark-connect --namespace "$NAMESPACE" --timeout=3m + + # FIXME: As the status currently does not respect the executors state, we wait for them to be ready ourselves + # (see TODO comment in code): + kubectl wait --for=condition=Ready pod -l spark-app-name=spark-connect-server -n "$NAMESPACE" --timeout=10m + + # wait a little longer to increase the chance of apps being able to connect + sleep 10 diff --git a/tests/templates/kuttl/spark-connect-kerberos/12-assert.yaml b/tests/templates/kuttl/spark-connect-kerberos/12-assert.yaml new file mode 100644 index 00000000..d25e9b1b --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/12-assert.yaml @@ -0,0 +1,197 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 60 +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: spark-connect-server + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/managed-by: spark.stackable.tech_connect + app.kubernetes.io/name: spark-connect + app.kubernetes.io/role-group: default + stackable.tech/vendor: Stackable + ownerReferences: + - apiVersion: spark.stackable.tech/v1alpha1 + controller: true + kind: SparkConnectServer + name: spark-connect +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/name: spark-connect + app.kubernetes.io/role-group: default +status: + readyReplicas: 1 + replicas: 1 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: spark-connect-server + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/managed-by: spark.stackable.tech_connect + app.kubernetes.io/name: spark-connect + app.kubernetes.io/role-group: default + stackable.tech/vendor: Stackable + ownerReferences: + - apiVersion: 
spark.stackable.tech/v1alpha1 + controller: true + kind: SparkConnectServer + name: spark-connect +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: spark-connect-executor + labels: + app.kubernetes.io/component: executor + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/managed-by: spark.stackable.tech_connect + app.kubernetes.io/name: spark-connect + app.kubernetes.io/role-group: default + stackable.tech/vendor: Stackable + ownerReferences: + - apiVersion: spark.stackable.tech/v1alpha1 + controller: true + kind: SparkConnectServer + name: spark-connect +--- +apiVersion: v1 +kind: Service +metadata: + name: spark-connect-server-headless + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/managed-by: spark.stackable.tech_connect + app.kubernetes.io/name: spark-connect + app.kubernetes.io/role-group: default + stackable.tech/vendor: Stackable + ownerReferences: + - apiVersion: spark.stackable.tech/v1alpha1 + controller: true + kind: SparkConnectServer + name: spark-connect +spec: + clusterIP: None + ports: + - name: grpc + port: 15002 + protocol: TCP + targetPort: 15002 + - name: http + port: 4040 + protocol: TCP + targetPort: 4040 + publishNotReadyAddresses: true + selector: + app.kubernetes.io/component: server + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/name: spark-connect + type: ClusterIP +--- +apiVersion: v1 +kind: Service +metadata: + name: spark-connect-server-metrics + annotations: + prometheus.io/path: /metrics/prometheus + prometheus.io/port: "4040" + prometheus.io/scheme: http + prometheus.io/scrape: "true" + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/managed-by: spark.stackable.tech_connect + app.kubernetes.io/name: spark-connect + app.kubernetes.io/role-group: default + prometheus.io/scrape: "true" + stackable.tech/vendor: Stackable + ownerReferences: + - apiVersion: 
spark.stackable.tech/v1alpha1 + controller: true + kind: SparkConnectServer + name: spark-connect +spec: + clusterIP: None + ports: + - name: metrics + port: 4040 + protocol: TCP + targetPort: 4040 + publishNotReadyAddresses: true + selector: + app.kubernetes.io/component: server + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/name: spark-connect + type: ClusterIP +--- +apiVersion: listeners.stackable.tech/v1alpha1 +kind: Listener +metadata: + name: spark-connect-server + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/managed-by: spark.stackable.tech_connect + app.kubernetes.io/name: spark-connect + app.kubernetes.io/role-group: default + stackable.tech/vendor: Stackable + ownerReferences: + - apiVersion: spark.stackable.tech/v1alpha1 + controller: true + kind: SparkConnectServer + name: spark-connect +spec: + className: external-unstable + ports: + - name: grpc + port: 15002 + protocol: TCP + - name: http + port: 4040 + protocol: TCP +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: spark-connect-serviceaccount + labels: + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/managed-by: spark.stackable.tech_connect + app.kubernetes.io/name: spark-connect + ownerReferences: + - apiVersion: spark.stackable.tech/v1alpha1 + controller: true + kind: SparkConnectServer + name: spark-connect +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: spark-connect-rolebinding + labels: + app.kubernetes.io/instance: spark-connect + app.kubernetes.io/managed-by: spark.stackable.tech_connect + app.kubernetes.io/name: spark-connect + ownerReferences: + - apiVersion: spark.stackable.tech/v1alpha1 + controller: true + kind: SparkConnectServer + name: spark-connect +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: spark-connect-clusterrole +subjects: + - kind: ServiceAccount + name: spark-connect-serviceaccount diff --git 
a/tests/templates/kuttl/spark-connect-kerberos/20-assert.yaml b/tests/templates/kuttl/spark-connect-kerberos/20-assert.yaml new file mode 100644 index 00000000..c37bc28e --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/20-assert.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +timeout: 600 +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: simple-connect-app +status: + succeeded: 1 diff --git a/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 new file mode 100644 index 00000000..2134d95b --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 @@ -0,0 +1,120 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: spark-connect-client +data: + example.py: |- + # + # PySpark example that connects to a Spark Connect server and exercises a + # Kerberos-enabled Hive metastore in two ways: + # 1. Spark's built-in Hive catalog: DDL (write) + catalog queries (read). + # 2. A Hive/Iceberg catalog ("iceberg") that stores table data and metadata + # in S3 (MinIO): write rows into an Iceberg table and read them back. + # + # The client itself does NOT use Kerberos: it speaks plain gRPC over the + # `sc://` protocol. All Kerberos authentication happens between the Spark + # Connect server (the driver) and the Hive metastore. The catalogs and the + # kerberos config are set server-side, so the client only issues SQL. + # + import sys + from pyspark.sql import SparkSession + + if __name__ == "__main__": + remote: str = sys.argv[1] + + print(f"Connecting to Spark Connect server at {remote}") + spark = ( + SparkSession.builder.appName("KerberosHiveExample") + .remote(remote) + .getOrCreate() + ) + + # 1. Built-in Hive catalog: DDL is persisted in the kerberized Hive + # metastore. 
Reaching the metastore at all requires a successful + # SASL/GSSAPI (Kerberos) handshake, which is the point of this test. + print("Creating database and table in the kerberized Hive metastore") + spark.sql("CREATE DATABASE IF NOT EXISTS kerb LOCATION 's3a://lakehouse/kerb'") + spark.sql("DROP TABLE IF EXISTS kerb.greetings") + spark.sql("CREATE TABLE kerb.greetings (id INT, data STRING) USING parquet") + + print("Reading metadata back from the metastore") + tables = [row.tableName for row in spark.sql("SHOW TABLES IN kerb").collect()] + print("Tables in 'kerb':", tables) + assert "greetings" in tables, f"table 'greetings' not registered in the HMS: {tables}" + + columns = [row.col_name for row in spark.sql("DESCRIBE TABLE kerb.greetings").collect()] + print("Columns of kerb.greetings:", columns) + assert "id" in columns and "data" in columns, f"unexpected columns: {columns}" + + # 2. Hive/Iceberg catalog: write an Iceberg table managed in S3. Creating + # and writing the table goes through the Iceberg HiveCatalog, which opens + # its own connection to the kerberized metastore - this is exactly the path + # that failed in issue #702 (GSS initiate failed). Iceberg's commit protocol + # writes the data + metadata to S3 without a rename-based committer. + print("Writing an Iceberg table stored in S3 via the kerberized metastore") + spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.lakehouse") + spark.sql("DROP TABLE IF EXISTS iceberg.lakehouse.greetings") + spark.sql("CREATE TABLE iceberg.lakehouse.greetings (id INT, data STRING) USING iceberg") + spark.sql("INSERT INTO iceberg.lakehouse.greetings VALUES (1, 'one'), (2, 'two'), (3, 'three')") + spark.sql("SELECT * FROM iceberg.lakehouse.greetings ORDER BY id").show() + + # Verify the Iceberg table is registered in the kerberized metastore via a + # metastore query (driver-side). 
We deliberately avoid result.collect() / + # count() over the Iceberg table: a *distributed* scan ships the Iceberg + # RDD closure to the executors, where it currently fails to deserialize on + # Spark Connect ("cannot assign SerializedLambda ... Function3"). That is a + # Spark Connect classloader limitation independent of where the Iceberg jar + # is placed, and is separate from the metastore/Kerberos integration this + # test validates. See https://github.com/stackabletech/spark-k8s-operator/issues/702 + ice_tables = [row.tableName for row in spark.sql("SHOW TABLES IN iceberg.lakehouse").collect()] + print("Tables in 'iceberg.lakehouse':", ice_tables) + assert "greetings" in ice_tables, f"Iceberg table not registered in the HMS: {ice_tables}" + + ice_columns = [row.col_name for row in spark.sql("DESCRIBE TABLE iceberg.lakehouse.greetings").collect()] + print("Columns of iceberg.lakehouse.greetings:", ice_columns) + assert "id" in ice_columns and "data" in ice_columns, f"unexpected columns: {ice_columns}" + + print("[SUCCESS] Read/write against the kerberized Hive metastore and Iceberg catalog succeeded") + + spark.stop() +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: simple-connect-app + labels: + stackable.tech/vendor: Stackable +spec: + template: + spec: + restartPolicy: OnFailure + activeDeadlineSeconds: 600 + containers: + - name: simple-connect-app +{% set spark_connect_client_version = test_scenario['values']['spark-connect'].split(',')[0] %} + # Extract the Spark Connect version and use it for the client. + # Using a separate dimension for the client doesn't work because beku would generate tests with + # version mismatches between client and server. 
+ image: oci.stackable.tech/stackable/spark-connect-client:{{ spark_connect_client_version }}-stackable0.0.0-dev + imagePullPolicy: IfNotPresent + command: + [ + "/usr/bin/python", + "/app/example.py", + "sc://spark-connect-server", + ] + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 200m + memory: 128Mi + volumeMounts: + - name: spark-connect-client + mountPath: /app + volumes: + - name: spark-connect-client + configMap: + name: spark-connect-client diff --git a/tests/templates/kuttl/spark-connect-kerberos/certs/generate.sh b/tests/templates/kuttl/spark-connect-kerberos/certs/generate.sh new file mode 100755 index 00000000..cdd4962b --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/certs/generate.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +set -euo pipefail + +# Function to display help message +show_help() { + cat << EOF +Usage: $(basename "$0") [OPTIONS] [COMMON_NAME] [LIFETIME_DAYS] + +Generate a self-signed root CA and client certificate for TLS connections. + +Arguments: + COMMON_NAME Common name for the certificate (default: minio) + LIFETIME_DAYS Validity period in days for CA and client cert (default: 36500) + +Options: + -h, --help Show this help message and exit + +Examples: + $(basename "$0") # Use defaults (minio, 36500 days) + $(basename "$0") myserver # Custom common name + $(basename "$0") myserver 365 # Custom common name and 1 year validity + +The script generates the following files: + - ca.crt (Root CA) + - tls.crt (Client certificate) + - tls.key (Client private key) + - minio-tls-ca-secret.yaml (Kubernetes secret manifest) +EOF + exit 0 +} + +# Parse command line arguments +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + show_help +fi + +# Set defaults +FQDN="${1:-minio}" +LIFETIME="${2:-36500}" + +# Validate that LIFETIME is a number +if ! 
[[ "$LIFETIME" =~ ^[0-9]+$ ]]; then + echo "Error: LIFETIME_DAYS must be a positive integer" >&2 + exit 1 +fi + +echo "Generating certificates with:" +echo " Common Name: ${FQDN}" +echo " Lifetime: ${LIFETIME} days" +echo "" + +echo "Creating Root Certificate Authority" +openssl genrsa \ + -out root-ca.key.pem \ + 2048 + +echo "Self-signing the Root Certificate Authority" +openssl req \ + -x509 \ + -new \ + -nodes \ + -key root-ca.key.pem \ + -days "${LIFETIME}" \ + -out root-ca.crt.pem \ + -subj "/C=DE/ST=Schleswig-Holstein/L=Wedel/O=Stackable Signing Authority Inc/CN=stackable.de" + +echo "Creating client private key" +openssl genrsa \ + -out client.key.pem \ + 2048 + +echo "Creating the CSR" +openssl req -new \ + -key client.key.pem \ + -out client.csr.pem \ + -subj "/C=DE/ST=Schleswig-Holstein/L=Wedel/O=Stackable/CN=${FQDN}" \ + -addext "subjectAltName = DNS:${FQDN}, DNS:localhost" + +echo "Signing the client cert with the root ca" +openssl x509 \ + -req -in client.csr.pem \ + -CA root-ca.crt.pem \ + -CAkey root-ca.key.pem \ + -CAcreateserial \ + -out client.crt.pem \ + -days "${LIFETIME}" \ + -copy_extensions copy + +echo "Copying the files to match the api of the secret-operator" +mv root-ca.crt.pem ca.crt +mv client.key.pem tls.key +mv client.crt.pem tls.crt + +echo "" +echo "Generating Kubernetes secret manifest..." + +# Calculate dates +GENERATION_DATE=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +EXPIRATION_DATE=$(date -u -d "+${LIFETIME} days" +"%Y-%m-%dT%H:%M:%SZ") + +# Base64 encode the certificate files +CA_CRT_B64=$(base64 -w 0 < ca.crt) +TLS_CRT_B64=$(base64 -w 0 < tls.crt) +TLS_KEY_B64=$(base64 -w 0 < tls.key) + +# Generate the Kubernetes secret manifest +cat > minio-tls-ca-secret.yaml << EOF +# Generated with certs/generate.sh in this test folder. 
+--- +apiVersion: v1 +kind: Secret +metadata: + name: minio-tls-ca + labels: + secrets.stackable.tech/class: minio-tls-ca + annotations: + cert.common-name: "${FQDN}" + cert.generated-at: "${GENERATION_DATE}" + cert.expires-at: "${EXPIRATION_DATE}" +data: + ca.crt: ${CA_CRT_B64} + tls.crt: ${TLS_CRT_B64} + tls.key: ${TLS_KEY_B64} +EOF + +echo "" +echo "Certificate generation complete!" +echo "Generated files:" +echo " - ca.crt, tls.crt, tls.key (certificate files)" +echo " - minio-tls-ca-secret.yaml (Kubernetes secret manifest)" +echo "" diff --git a/tests/templates/kuttl/spark-connect-kerberos/helm-bitnami-minio-values.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/helm-bitnami-minio-values.yaml.j2 new file mode 100644 index 00000000..9257f8d9 --- /dev/null +++ b/tests/templates/kuttl/spark-connect-kerberos/helm-bitnami-minio-values.yaml.j2 @@ -0,0 +1,74 @@ +--- +global: + security: + allowInsecureImages: true # needed starting with Chart version 14.9.0 if modifying images + +image: + repository: bitnamilegacy/minio +clientImage: + repository: bitnamilegacy/minio-client +defaultInitContainers: + volumePermissions: # volumePermissions moved under defaultInitContainers starting with Chart version 17.0.0 + enabled: false + image: + repository: bitnamilegacy/os-shell +console: + image: + repository: bitnamilegacy/minio-object-browser + +mode: standalone +disableWebUI: false +extraEnvVars: + - name: MINIO_LOG_LEVEL + value: DEBUG + +provisioning: + enabled: true + buckets: + - name: ingest-bucket + - name: stats-bucket + - name: lakehouse + usersExistingSecrets: + - minio-users + resources: + requests: + memory: 1Gi + cpu: "512m" + limits: + memory: "1Gi" + cpu: "1" + podSecurityContext: + enabled: false + containerSecurityContext: + enabled: false + +# volumePermissions can be removed starting with Chart version 17.0.0, moved under defaultInitContainers +volumePermissions: + enabled: false + image: + repository: bitnamilegacy/os-shell + +podSecurityContext: + 
enabled: false + +containerSecurityContext: + enabled: false + +persistence: + enabled: false + +resources: + requests: + memory: 1Gi + cpu: "512m" + limits: + memory: "1Gi" + cpu: "1" + +service: + type: NodePort +{% if test_scenario['values']['s3-use-tls'] == 'true' %} +tls: + enabled: true + existingSecret: minio-tls-ca +{% endif %} diff --git a/tests/test-definition.yaml b/tests/test-definition.yaml index 5cf528eb..8fb06924 100644 --- a/tests/test-definition.yaml +++ b/tests/test-definition.yaml @@ -40,6 +40,12 @@ dimensions: - 3.5.8 - 4.1.1 # - 3.5.6,oci.stackable.tech/sandbox/spark-k8s:3.5.6-stackable0.0.0-dev + - name: krb5 + values: + - 1.21.1 + - name: kerberos-realm + values: + - "CLUSTER.LOCAL" - name: hbase values: - 2.6.4 @@ -131,6 +137,15 @@ tests: - hive-iceberg - openshift - s3-use-tls + - name: spark-connect-kerberos + dimensions: + - spark-connect + - iceberg-latest + - hive-iceberg + - krb5 + - kerberos-realm + - openshift + - s3-use-tls suites: - name: nightly From ad51d1b2a973a5af66a3ad6210928b0cfeef279c Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:10:29 +0200 Subject: [PATCH 2/4] Add comment about spark.addArtifact() --- .../spark-connect-kerberos/10_spark-connect.yaml.j2 | 2 +- .../20-run-connect-client.yaml.j2 | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/templates/kuttl/spark-connect-kerberos/10_spark-connect.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/10_spark-connect.yaml.j2 index f9e285b0..f4c44fb5 100644 --- a/tests/templates/kuttl/spark-connect-kerberos/10_spark-connect.yaml.j2 +++ b/tests/templates/kuttl/spark-connect-kerberos/10_spark-connect.yaml.j2 @@ -151,7 +151,7 @@ spec: # protocol (not the rename-based FileOutputCommitter), so writes to S3 # work without the staging/_temporary issues of plain parquet tables. # The catalog inherits the kerberos metastore config from spark.hadoop.* - # above. 
See https://github.com/stackabletech/spark-k8s-operator/issues/702 + # above. See https://issues.apache.org/jira/browse/SPARK-46032 spark.sql.extensions: org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions spark.sql.catalog.iceberg: org.apache.iceberg.spark.SparkCatalog spark.sql.catalog.iceberg.type: hive diff --git a/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 index 2134d95b..801d0c18 100644 --- a/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 +++ b/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 @@ -63,10 +63,12 @@ data: # metastore query (driver-side). We deliberately avoid result.collect() / # count() over the Iceberg table: a *distributed* scan ships the Iceberg # RDD closure to the executors, where it currently fails to deserialize on - # Spark Connect ("cannot assign SerializedLambda ... Function3"). That is a - # Spark Connect classloader limitation independent of where the Iceberg jar - # is placed, and is separate from the metastore/Kerberos integration this - # test validates. See https://github.com/stackabletech/spark-k8s-operator/issues/702 + # Spark Connect ("cannot assign SerializedLambda ... Function3"). This is an + # upstream Spark Connect classloader limitation (see SPARK-46032 / + # SPARK-51537) independent of how the Iceberg jar is provisioned - it + # reproduces with --packages, with the jar on the system classpath, with + # spark.addArtifact(), and with --packages + addArtifact together - and is + # separate from the metastore/Kerberos integration this test validates. 
ice_tables = [row.tableName for row in spark.sql("SHOW TABLES IN iceberg.lakehouse").collect()] print("Tables in 'iceberg.lakehouse':", ice_tables) assert "greetings" in ice_tables, f"Iceberg table not registered in the HMS: {ice_tables}" From cf2432e2c9ad2cc1a999fa777ac6192ee46647b9 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:59:42 +0200 Subject: [PATCH 3/4] Update docs --- .../pages/usage-guide/spark-connect.adoc | 93 ++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/docs/modules/spark-k8s/pages/usage-guide/spark-connect.adoc b/docs/modules/spark-k8s/pages/usage-guide/spark-connect.adoc index eebc51e7..9271109e 100644 --- a/docs/modules/spark-k8s/pages/usage-guide/spark-connect.adoc +++ b/docs/modules/spark-k8s/pages/usage-guide/spark-connect.adoc @@ -97,6 +97,92 @@ spec: ... ``` +== Kerberos + +NOTE: Kerberos support for Spark Connect is not a first-class feature of the operator. The setup described here is a manual configuration that uses `podOverrides`, `configOverrides`, `envOverrides` and an init container. + +A Spark Connect server can authenticate to a Kerberos-secured service, such as an Apache Hive metastore or Apache Hadoop HDFS. +There is, however, an important caveat: the Spark Connect server runs in Spark's `client` deploy mode. +Spark only performs the automatic keytab login (`UserGroupInformation.loginUserFromKeytab`) for YARN, local, Mesos and the Kubernetes _cluster_-mode driver -- *not* for the Kubernetes client mode that the Connect server uses. +As a consequence, setting `spark.kerberos.keytab` and `spark.kerberos.principal` alone does not obtain a Kerberos ticket (TGT), and the SASL/GSSAPI handshake to the metastore fails with `GSS initiate failed` / `Failed to find any Kerberos tgt`. + +The workaround is to obtain the ticket yourself with an *init container* that runs `kinit` before the Spark Connect server JVM starts. 
+The init container reads the keytab and writes a Kerberos credential cache to an `emptyDir` volume that is shared with the `spark` container; the server JVM then picks the ticket up from that cache. +An init container is required because nothing in the client-mode server process performs the login automatically, so the ticket must exist in the credential cache before the JVM opens the metastore connection. + +The relevant parts of the `SparkConnectServer` are shown below. + +[source,yaml] +---- +spec: + server: + envOverrides: + # The shared credential cache populated by the init container. Hadoop's + # UserGroupInformation reads KRB5CCNAME when Kerberos auth is enabled. + KRB5CCNAME: /stackable/krb5/ccache + jvmArgumentOverrides: + add: + # The JVM does NOT read the KRB5_CONFIG environment variable (that is an + # MIT C-library variable). It reads this system property (or /etc/krb5.conf). + - -Djava.security.krb5.conf=/stackable/kerberos/krb5.conf + # The Spark Connect execute thread does not carry the ticket in its + # Subject, so JGSS must fall back to the ambient credential cache. + - -Djavax.security.auth.useSubjectCredsOnly=false + configOverrides: + spark-defaults.conf: + spark.hadoop.hadoop.security.authentication: "kerberos" + spark.hadoop.hive.metastore.uris: "thrift://hive-metastore:9083" + spark.hadoop.hive.metastore.sasl.enabled: "true" + # Use the literal metastore principal, not _HOST (which would resolve to + # the connection host instead of the metastore's service principal). 
+ spark.hadoop.hive.metastore.kerberos.principal: "hive/hive.example.svc.cluster.local@EXAMPLE.COM" + podOverrides: + spec: + initContainers: + - name: kinit + image: oci.stackable.tech/sdp/spark-k8s:4.1.1-stackable0.0.0-dev + command: ["/bin/bash", "-euo", "pipefail", "-c"] + args: + - kinit -kt /stackable/kerberos/keytab spark-connect/spark-connect.example.svc.cluster.local@EXAMPLE.COM + env: + - name: KRB5_CONFIG + value: /stackable/kerberos/krb5.conf + - name: KRB5CCNAME + value: /stackable/krb5/ccache + volumeMounts: + - name: kerberos + mountPath: /stackable/kerberos + - name: krb5-ccache + mountPath: /stackable/krb5 + containers: + - name: spark + volumeMounts: + - name: kerberos + mountPath: /stackable/kerberos + - name: krb5-ccache + mountPath: /stackable/krb5 + volumes: + - name: krb5-ccache + emptyDir: {} + - name: kerberos + ephemeral: + volumeClaimTemplate: + metadata: + annotations: + secrets.stackable.tech/class: kerberos + secrets.stackable.tech/scope: service=spark-connect + secrets.stackable.tech/kerberos.service.names: spark-connect + spec: + storageClassName: secrets.stackable.tech + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: "1" +---- + +The keytab and `krb5.conf` are only required on the server, which is the Spark driver and the only component that talks to the metastore. +Executors that merely read and write data files on (non-kerberized) S3 do not need Kerberos credentials. + == Spark History Server Unfortunately integration with the Spark History Server is not supported yet. @@ -112,4 +198,9 @@ The following features are not supported by the Stackable Spark operator yet == Known Issues -* Dynamically provisioning the iceberg runtime leads to "iceberg.SparkWrite$WriterFactory" ClassNotfoundException when attempting to use it from clients. +* Distributed operations on Apache Iceberg tables fail on the executors. 
+ PySpark calls that trigger a distributed job over an Iceberg table -- for example `DataFrame.collect()` or `DataFrame.count()` -- fail with `java.lang.ClassCastException: cannot assign instance of java.lang.invoke.SerializedLambda to field org.apache.spark.rdd.MapPartitionsRDD.f of type scala.Function3`. + Driver-only operations still work: DDL (`CREATE`/`DROP`), `INSERT` of small data, `DataFrame.show()` of a small result, `SHOW TABLES` and `DESCRIBE`. + The cause is upstream in Spark Connect: executor tasks run under a per-session class loader (which fetches session classes from the driver's artifact server) that cannot deserialize the Iceberg scan closures. + It is independent of how the Iceberg runtime is provisioned -- it reproduces with `--packages`, with the jar placed on the system class path (`/stackable/spark/jars`), with `spark.addArtifact()`, and with combinations of these. + See https://issues.apache.org/jira/browse/SPARK-46032[SPARK-46032] (open) and https://issues.apache.org/jira/browse/SPARK-51537[SPARK-51537] for details. From 5039752fe0da6c0c31502b82bc07a2a706cf320f Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:09:56 +0200 Subject: [PATCH 4/4] Update test and docs to mention that only Spark 3 is affected --- .../pages/usage-guide/spark-connect.adoc | 4 +- .../20-run-connect-client.yaml.j2 | 41 +++++++++++-------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/docs/modules/spark-k8s/pages/usage-guide/spark-connect.adoc b/docs/modules/spark-k8s/pages/usage-guide/spark-connect.adoc index 9271109e..8019a0f7 100644 --- a/docs/modules/spark-k8s/pages/usage-guide/spark-connect.adoc +++ b/docs/modules/spark-k8s/pages/usage-guide/spark-connect.adoc @@ -198,9 +198,9 @@ The following features are not supported by the Stackable Spark operator yet == Known Issues -* Distributed operations on Apache Iceberg tables fail on the executors. 
+* Distributed operations on Apache Iceberg tables fail on the executors *with Spark 3.5.x*. PySpark calls that trigger a distributed job over an Iceberg table -- for example `DataFrame.collect()` or `DataFrame.count()` -- fail with `java.lang.ClassCastException: cannot assign instance of java.lang.invoke.SerializedLambda to field org.apache.spark.rdd.MapPartitionsRDD.f of type scala.Function3`. Driver-only operations still work: DDL (`CREATE`/`DROP`), `INSERT` of small data, `DataFrame.show()` of a small result, `SHOW TABLES` and `DESCRIBE`. The cause is upstream in Spark Connect: executor tasks run under a per-session class loader (which fetches session classes from the driver's artifact server) that cannot deserialize the Iceberg scan closures. It is independent of how the Iceberg runtime is provisioned -- it reproduces with `--packages`, with the jar placed on the system class path (`/stackable/spark/jars`), with `spark.addArtifact()`, and with combinations of these. - See https://issues.apache.org/jira/browse/SPARK-46032[SPARK-46032] (open) and https://issues.apache.org/jira/browse/SPARK-51537[SPARK-51537] for details. + *This is fixed in Spark 4* (https://issues.apache.org/jira/browse/SPARK-51537[SPARK-51537] / https://github.com/apache/spark/pull/50475[apache/spark#50475]), where distributed reads and writes of Iceberg tables work; it is not backported to the 3.5.x line (see the still-open https://issues.apache.org/jira/browse/SPARK-46032[SPARK-46032]). 
diff --git a/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 b/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 index 801d0c18..7255afa3 100644 --- a/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 +++ b/tests/templates/kuttl/spark-connect-kerberos/20-run-connect-client.yaml.j2 @@ -57,25 +57,30 @@ data: spark.sql("DROP TABLE IF EXISTS iceberg.lakehouse.greetings") spark.sql("CREATE TABLE iceberg.lakehouse.greetings (id INT, data STRING) USING iceberg") spark.sql("INSERT INTO iceberg.lakehouse.greetings VALUES (1, 'one'), (2, 'two'), (3, 'three')") - spark.sql("SELECT * FROM iceberg.lakehouse.greetings ORDER BY id").show() - # Verify the Iceberg table is registered in the kerberized metastore via a - # metastore query (driver-side). We deliberately avoid result.collect() / - # count() over the Iceberg table: a *distributed* scan ships the Iceberg - # RDD closure to the executors, where it currently fails to deserialize on - # Spark Connect ("cannot assign SerializedLambda ... Function3"). This is an - # upstream Spark Connect classloader limitation (see SPARK-46032 / - # SPARK-51537) independent of how the Iceberg jar is provisioned - it - # reproduces with --packages, with the jar on the system classpath, with - # spark.addArtifact(), and with --packages + addArtifact together - and is - # separate from the metastore/Kerberos integration this test validates. 
- ice_tables = [row.tableName for row in spark.sql("SHOW TABLES IN iceberg.lakehouse").collect()] - print("Tables in 'iceberg.lakehouse':", ice_tables) - assert "greetings" in ice_tables, f"Iceberg table not registered in the HMS: {ice_tables}" - - ice_columns = [row.col_name for row in spark.sql("DESCRIBE TABLE iceberg.lakehouse.greetings").collect()] - print("Columns of iceberg.lakehouse.greetings:", ice_columns) - assert "id" in ice_columns and "data" in ice_columns, f"unexpected columns: {ice_columns}" + # Reading the Iceberg table back differs by Spark version: + # + # Spark 4 includes the Spark Connect executor class loader fix + # (SPARK-51537 / apache/spark#50475), so a *distributed* read works: + # collect() ships the Iceberg scan closure to the executors and succeeds. + # + # On Spark 3.5.x that distributed scan still fails on the executors with + # "cannot assign SerializedLambda ... Function3" (SPARK-46032, not fixed in + # 3.5.x), so we verify the table via metastore queries (driver-side) instead. 
+ print(f"Connected to Spark {spark.version}") + if spark.version.startswith("4"): + result = spark.sql("SELECT * FROM iceberg.lakehouse.greetings ORDER BY id") + result.show() + rows = result.collect() + assert len(rows) == 3, f"expected 3 rows in the Iceberg table, got {len(rows)}" + else: + spark.sql("SELECT * FROM iceberg.lakehouse.greetings ORDER BY id").show() + ice_tables = [row.tableName for row in spark.sql("SHOW TABLES IN iceberg.lakehouse").collect()] + print("Tables in 'iceberg.lakehouse':", ice_tables) + assert "greetings" in ice_tables, f"Iceberg table not registered in the HMS: {ice_tables}" + ice_columns = [row.col_name for row in spark.sql("DESCRIBE TABLE iceberg.lakehouse.greetings").collect()] + print("Columns of iceberg.lakehouse.greetings:", ice_columns) + assert "id" in ice_columns and "data" in ice_columns, f"unexpected columns: {ice_columns}" print("[SUCCESS] Read/write against the kerberized Hive metastore and Iceberg catalog succeeded")