From e641710a8987e7c37efa2a5bda4e88d860d738af Mon Sep 17 00:00:00 2001
From: syf2211 <syf2211@users.noreply.github.com>
Date: Thu, 25 Jun 2026 16:40:05 +0000
Subject: [PATCH] fix(vmcp): detect pod volume drift via hash annotation
 (#5619)

Mirror the imagePullSecrets hash-annotation pattern so changes to
operator-managed volumes and volumeMounts (auth-server signing keys,
CA-bundle refs, telemetry CA bundles) trigger a Deployment rollout.

Adds buildPodVolumesForVmcp shared by the deploy builder and drift
check, podVolumesHash with canonical source fingerprinting that ignores
K8s-defaulted fields like defaultMode, and regression tests.
---
 .../virtualmcpserver_controller.go            |  40 ++++
 .../virtualmcpserver_controller_test.go       |   5 +-
 .../virtualmcpserver_deployment.go            | 187 +++++++++++++++---
 .../virtualmcpserver_deployment_test.go       | 129 +++++++++++-
 4 files changed, 333 insertions(+), 28 deletions(-)

diff --git a/cmd/thv-operator/controllers/virtualmcpserver_controller.go b/cmd/thv-operator/controllers/virtualmcpserver_controller.go
index ea94504456..9ea1ae6880 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_controller.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_controller.go
@@ -1621,6 +1621,10 @@ func (r *VirtualMCPServerReconciler) deploymentNeedsUpdate(
 		return true
 	}
 
+	if r.podVolumesNeedsUpdate(ctx, deployment, vmcp, telemetryCfg, typedWorkloads) {
+		return true
+	}
+
 	// Check if spec.replicas has changed. Only compare when spec.replicas is non-nil;
 	// nil means hands-off mode (HPA or external controller manages replicas) and the live count is authoritative.
 	if vmcp.Spec.Replicas != nil {
@@ -1802,6 +1806,42 @@ func (r *VirtualMCPServerReconciler) imagePullSecretsNeedsUpdate(
 	return deployment.Annotations[imagePullRefsHashAnnotation] != expectedHash
 }
 
+// podVolumesNeedsUpdate detects drift on the desired pod volumes and volumeMounts
+// by comparing a hash of the desired set against podVolumesHashAnnotation. We
+// cannot compare deployment.Spec.Template.Spec.Volumes directly because the
+// live list includes K8s-defaulted fields (defaultMode, etc.) that would cause
+// spurious drift or mask real changes when only the backing secret/configmap
+// ref changes (see #5619).
+func (r *VirtualMCPServerReconciler) podVolumesNeedsUpdate(
+	ctx context.Context,
+	deployment *appsv1.Deployment,
+	vmcp *mcpv1beta1.VirtualMCPServer,
+	telemetryCfg *mcpv1beta1.MCPTelemetryConfig,
+	typedWorkloads []workloads.TypedWorkload,
+) bool {
+	if deployment == nil || vmcp == nil {
+		return true
+	}
+
+	volumeMounts, volumes, err := r.buildPodVolumesForVmcp(ctx, vmcp, telemetryCfg, typedWorkloads)
+	if err != nil {
+		log.FromContext(ctx).Error(err, "Failed to build pod volumes for drift check, assuming update needed")
+		return true
+	}
+
+	expectedHash, err := podVolumesHash(volumeMounts, volumes)
+	if err != nil {
+		log.FromContext(ctx).Error(err, "Failed to hash pod volumes, assuming update needed")
+		return true
+	}
+
+	_, present := deployment.Annotations[podVolumesHashAnnotation]
+	if expectedHash == "" {
+		return present
+	}
+	return deployment.Annotations[podVolumesHashAnnotation] != expectedHash
+}
+
 // serviceNeedsUpdate checks if the service needs to be updated
 func (*VirtualMCPServerReconciler) serviceNeedsUpdate(
 	service *corev1.Service,
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_controller_test.go b/cmd/thv-operator/controllers/virtualmcpserver_controller_test.go
index ccf61555d4..68ce18aa0c 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_controller_test.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_controller_test.go
@@ -1905,6 +1905,9 @@ func TestVirtualMCPServerDeploymentNeedsUpdate(t *testing.T) {
 	expectedLabels, expectedAnnotations := reconciler.buildPodTemplateMetadata(
 		labelsForVirtualMCPServer(vmcp.Name), vmcp, vmcpConfigChecksum,
 	)
+	_, expectedDeploymentAnnotations := reconciler.buildDeploymentMetadataForVmcp(
+		context.Background(), labelsForVirtualMCPServer(vmcp.Name), vmcp, nil, []workloads.TypedWorkload{},
+	)
 
 	tests := []struct {
 		name           string
@@ -2014,7 +2017,7 @@ func TestVirtualMCPServerDeploymentNeedsUpdate(t *testing.T) {
 			deployment: &appsv1.Deployment{
 				ObjectMeta: metav1.ObjectMeta{
 					Labels:      labelsForVirtualMCPServer(vmcp.Name),
-					Annotations: make(map[string]string),
+					Annotations: expectedDeploymentAnnotations,
 				},
 				Spec: appsv1.DeploymentSpec{
 					Template: corev1.PodTemplateSpec{
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_deployment.go b/cmd/thv-operator/controllers/virtualmcpserver_deployment.go
index 4b6120004c..0261a21e94 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_deployment.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_deployment.go
@@ -50,6 +50,14 @@ const (
 	// detect every input that influences the deployed PodSpec.ImagePullSecrets.
 	imagePullRefsHashAnnotation = "toolhive.stacklok.io/imagepullsecrets-hash"
 
+	// podVolumesHashAnnotation tracks the SHA256 hash of the desired pod
+	// volumes and volumeMounts list. Mirrors the imagePullRefsHashAnnotation
+	// pattern to detect drift on volume-only inputs (auth-server signing keys,
+	// CA-bundle refs, telemetry CA bundles) without comparing live
+	// PodSpec.Volumes directly, which trips on K8s-defaulted fields like
+	// defaultMode.
+	podVolumesHashAnnotation = "toolhive.stacklok.io/podvolumes-hash"
+
 	// Log level configuration
 	logLevelDebug = "debug" // Debug log level value
 
@@ -147,7 +155,7 @@ func (r *VirtualMCPServerReconciler) deploymentForVirtualMCPServer(
 
 	// Build deployment components using helper functions
 	args := r.buildContainerArgsForVmcp(vmcp)
-	volumeMounts, volumes, err := r.buildVolumesForVmcp(ctx, vmcp)
+	volumeMounts, volumes, err := r.buildPodVolumesForVmcp(ctx, vmcp, telemetryCfg, typedWorkloads)
 	if err != nil {
 		log.FromContext(ctx).Error(err, "Failed to build volumes for VirtualMCPServer")
 		return nil
@@ -158,31 +166,9 @@ func (r *VirtualMCPServerReconciler) deploymentForVirtualMCPServer(
 		return nil
 	}
 
-	// Add CA bundle volumes for MCPServerEntry backends with caBundleRef
-	caVolumes, caMounts, err := r.buildCABundleVolumesForEntries(ctx, vmcp.Namespace, typedWorkloads)
-	if err != nil {
-		log.FromContext(ctx).Error(err, "Failed to build CA bundle volumes for MCPServerEntries")
-		return nil
-	}
-	volumes = append(volumes, caVolumes...)
-	volumeMounts = append(volumeMounts, caMounts...)
-
-	// Add telemetry CA bundle volumes from the pre-fetched MCPTelemetryConfig
-	if telemetryCfg != nil {
-		telVolumes, telMounts := ctrlutil.AddTelemetryCABundleVolumes(telemetryCfg)
-		volumes = append(volumes, telVolumes...)
-		volumeMounts = append(volumeMounts, telMounts...)
-	}
-
-	// Add embedded auth server volumes if configured (inline config). The matching
-	// env vars are injected by buildEnvVarsForVmcp above so the drift check stays
-	// symmetric with what is built here (see #5616).
-	if vmcp.Spec.AuthServerConfig != nil {
-		authServerVolumes, authServerMounts := ctrlutil.GenerateAuthServerVolumes(vmcp.Spec.AuthServerConfig)
-		volumes = append(volumes, authServerVolumes...)
-		volumeMounts = append(volumeMounts, authServerMounts...)
-	}
-	deploymentLabels, deploymentAnnotations := r.buildDeploymentMetadataForVmcp(ls, vmcp)
+	deploymentLabels, deploymentAnnotations := r.buildDeploymentMetadataForVmcp(
+		ctx, ls, vmcp, telemetryCfg, typedWorkloads,
+	)
 	deploymentTemplateLabels, deploymentTemplateAnnotations := r.buildPodTemplateMetadata(ls, vmcp, vmcpConfigChecksum)
 	podSecurityContext, containerSecurityContext := r.buildSecurityContextsForVmcp(ctx, vmcp)
 	serviceAccountName := r.serviceAccountNameForVmcp(vmcp)
@@ -329,6 +315,44 @@ func (r *VirtualMCPServerReconciler) buildVolumesForVmcp(
 	return volumeMounts, volumes, nil
 }
 
+// buildPodVolumesForVmcp assembles the full desired pod volumes and volumeMounts
+// for a VirtualMCPServer Deployment. Shared by deploymentForVirtualMCPServer and
+// the podVolumesNeedsUpdate drift check so both paths hash the same inputs.
+func (r *VirtualMCPServerReconciler) buildPodVolumesForVmcp(
+	ctx context.Context,
+	vmcp *mcpv1beta1.VirtualMCPServer,
+	telemetryCfg *mcpv1beta1.MCPTelemetryConfig,
+	typedWorkloads []workloads.TypedWorkload,
+) ([]corev1.VolumeMount, []corev1.Volume, error) {
+	volumeMounts, volumes, err := r.buildVolumesForVmcp(ctx, vmcp)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	caVolumes, caMounts, err := r.buildCABundleVolumesForEntries(ctx, vmcp.Namespace, typedWorkloads)
+	if err != nil {
+		return nil, nil, err
+	}
+	volumes = append(volumes, caVolumes...)
+	volumeMounts = append(volumeMounts, caMounts...)
+
+	if telemetryCfg != nil {
+		telVolumes, telMounts := ctrlutil.AddTelemetryCABundleVolumes(telemetryCfg)
+		volumes = append(volumes, telVolumes...)
+		volumeMounts = append(volumeMounts, telMounts...)
+	}
+
+	// The matching env vars are injected by buildEnvVarsForVmcp so the drift
+	// check stays symmetric with what is built here (see #5616).
+	if vmcp.Spec.AuthServerConfig != nil {
+		authServerVolumes, authServerMounts := ctrlutil.GenerateAuthServerVolumes(vmcp.Spec.AuthServerConfig)
+		volumes = append(volumes, authServerVolumes...)
+		volumeMounts = append(volumeMounts, authServerMounts...)
+	}
+
+	return volumeMounts, volumes, nil
+}
+
 // buildEnvVarsForVmcp builds environment variables for the vmcp container.
 // telemetryCfg is the already-fetched MCPTelemetryConfig (nil when not referenced).
 func (r *VirtualMCPServerReconciler) buildEnvVarsForVmcp(
@@ -891,8 +915,11 @@ func (r *VirtualMCPServerReconciler) getExternalAuthConfigSecretEnvVars(
 
 // buildDeploymentMetadataForVmcp builds deployment-level labels and annotations
 func (r *VirtualMCPServerReconciler) buildDeploymentMetadataForVmcp(
+	ctx context.Context,
 	baseLabels map[string]string,
 	vmcp *mcpv1beta1.VirtualMCPServer,
+	telemetryCfg *mcpv1beta1.MCPTelemetryConfig,
+	typedWorkloads []workloads.TypedWorkload,
 ) (map[string]string, map[string]string) {
 	deploymentLabels := baseLabels
 	deploymentAnnotations := make(map[string]string)
@@ -918,6 +945,16 @@ func (r *VirtualMCPServerReconciler) buildDeploymentMetadataForVmcp(
 		deploymentAnnotations[imagePullRefsHashAnnotation] = hash
 	}
 
+	// Store hash of the desired pod volumes and volumeMounts so
+	// deploymentNeedsUpdate can detect volume-only drift (auth-server secret
+	// refs, CA-bundle refs, telemetry CA bundles) without comparing live
+	// PodSpec.Volumes, which trips on K8s-defaulted fields (see #5619).
+	if volumeMounts, volumes, err := r.buildPodVolumesForVmcp(ctx, vmcp, telemetryCfg, typedWorkloads); err == nil {
+		if hash, err := podVolumesHash(volumeMounts, volumes); err == nil && hash != "" {
+			deploymentAnnotations[podVolumesHashAnnotation] = hash
+		}
+	}
+
 	// TODO: Add support for ResourceOverrides if needed in the future
 
 	return deploymentLabels, deploymentAnnotations
@@ -944,6 +981,104 @@ func imagePullSecretsHash(secrets []corev1.LocalObjectReference) (string, error)
 	return hex.EncodeToString(h[:]), nil
 }
 
+// podVolumeHashEntry captures drift-relevant volume fields without K8s-defaulted noise.
+type podVolumeHashEntry struct {
+	Name   string `json:"name"`
+	Source string `json:"source"`
+}
+
+// podVolumeMountHashEntry captures drift-relevant volumeMount fields.
+type podVolumeMountHashEntry struct {
+	Name      string `json:"name"`
+	MountPath string `json:"mountPath"`
+	SubPath   string `json:"subPath,omitempty"`
+	ReadOnly  bool   `json:"readOnly,omitempty"`
+}
+
+// podVolumesHashPayload is the canonical form hashed by podVolumesHash.
+type podVolumesHashPayload struct {
+	Volumes      []podVolumeHashEntry      `json:"volumes"`
+	VolumeMounts []podVolumeMountHashEntry `json:"volumeMounts"`
+}
+
+// volumeSourceFingerprint returns a stable identifier for the backing resource
+// referenced by a volume, ignoring K8s-defaulted fields like defaultMode.
+func volumeSourceFingerprint(vol corev1.Volume) string {
+	switch {
+	case vol.ConfigMap != nil:
+		src := "configmap:" + vol.ConfigMap.Name
+		if len(vol.ConfigMap.Items) > 0 {
+			items := make([]string, len(vol.ConfigMap.Items))
+			for i, item := range vol.ConfigMap.Items {
+				items[i] = item.Key + "=" + item.Path
+			}
+			sort.Strings(items)
+			src += ":" + strings.Join(items, ",")
+		}
+		return src
+	case vol.Secret != nil:
+		src := "secret:" + vol.Secret.SecretName
+		if len(vol.Secret.Items) > 0 {
+			items := make([]string, len(vol.Secret.Items))
+			for i, item := range vol.Secret.Items {
+				items[i] = item.Key + "=" + item.Path
+			}
+			sort.Strings(items)
+			src += ":" + strings.Join(items, ",")
+		}
+		return src
+	case vol.EmptyDir != nil:
+		return "emptydir"
+	default:
+		return "unknown:" + vol.Name
+	}
+}
+
+// podVolumesHash returns a deterministic SHA256 hash of the given volumes and
+// volumeMounts. Returns an empty string with no error when both lists are empty.
+func podVolumesHash(mounts []corev1.VolumeMount, volumes []corev1.Volume) (string, error) {
+	if len(mounts) == 0 && len(volumes) == 0 {
+		return "", nil
+	}
+
+	payload := podVolumesHashPayload{
+		Volumes:      make([]podVolumeHashEntry, len(volumes)),
+		VolumeMounts: make([]podVolumeMountHashEntry, len(mounts)),
+	}
+
+	for i, vol := range volumes {
+		payload.Volumes[i] = podVolumeHashEntry{
+			Name:   vol.Name,
+			Source: volumeSourceFingerprint(vol),
+		}
+	}
+	sort.Slice(payload.Volumes, func(i, j int) bool {
+		return payload.Volumes[i].Name < payload.Volumes[j].Name
+	})
+
+	for i, mount := range mounts {
+		payload.VolumeMounts[i] = podVolumeMountHashEntry{
+			Name:      mount.Name,
+			MountPath: mount.MountPath,
+			SubPath:   mount.SubPath,
+			ReadOnly:  mount.ReadOnly,
+		}
+	}
+	sort.Slice(payload.VolumeMounts, func(i, j int) bool {
+		if payload.VolumeMounts[i].Name != payload.VolumeMounts[j].Name {
+			return payload.VolumeMounts[i].Name < payload.VolumeMounts[j].Name
+		}
+		return payload.VolumeMounts[i].MountPath < payload.VolumeMounts[j].MountPath
+	})
+
+	canonical, err := json.Marshal(payload)
+	if err != nil {
+		return "", fmt.Errorf("failed to marshal pod volumes for hashing: %w", err)
+	}
+	h := sha256.Sum256(canonical)
+	return hex.EncodeToString(h[:]), nil
+}
+
 // buildPodTemplateMetadata builds pod template labels and annotations for vmcp
 func (*VirtualMCPServerReconciler) buildPodTemplateMetadata(
 	baseLabels map[string]string,
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_deployment_test.go b/cmd/thv-operator/controllers/virtualmcpserver_deployment_test.go
index ef3ee1482d..a315832ca6 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_deployment_test.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_deployment_test.go
@@ -333,10 +333,14 @@ func TestBuildDeploymentMetadataForVmcp(t *testing.T) {
 	vmcp := v1beta1test.NewVirtualMCPServer("test-vmcp", "default")
 
 	r := &VirtualMCPServerReconciler{}
-	labels, annotations := r.buildDeploymentMetadataForVmcp(baseLabels, vmcp)
+	labels, annotations := r.buildDeploymentMetadataForVmcp(
+		t.Context(), baseLabels, vmcp, nil, []workloads.TypedWorkload{},
+	)
 
 	assert.Equal(t, baseLabels, labels)
 	assert.NotNil(t, annotations)
+	assert.NotEmpty(t, annotations[podVolumesHashAnnotation],
+		"podVolumesHashAnnotation must be set for the default vmcp-config volume")
 }
 
 // TestBuildPodTemplateMetadata tests pod template metadata generation
@@ -1134,6 +1138,129 @@ func TestImagePullSecretsHash(t *testing.T) {
 	})
 }
 
+// TestPodVolumesHash verifies the hash helper normalizes order, treats empty
+// lists as the sentinel "" hash, and distinguishes different backing refs.
+func TestPodVolumesHash(t *testing.T) {
+	t.Parallel()
+
+	t.Run("empty lists return empty hash", func(t *testing.T) {
+		t.Parallel()
+		hash, err := podVolumesHash(nil, nil)
+		require.NoError(t, err)
+		assert.Empty(t, hash)
+	})
+
+	t.Run("volume mount order-insensitive", func(t *testing.T) {
+		t.Parallel()
+		volumes := []corev1.Volume{{
+			Name: "vmcp-config",
+			VolumeSource: corev1.VolumeSource{
+				ConfigMap: &corev1.ConfigMapVolumeSource{
+					LocalObjectReference: corev1.LocalObjectReference{Name: "cfg"},
+				},
+			},
+		}}
+		mountsA := []corev1.VolumeMount{
+			{Name: "a", MountPath: "/a"},
+			{Name: "b", MountPath: "/b"},
+		}
+		mountsB := []corev1.VolumeMount{
+			{Name: "b", MountPath: "/b"},
+			{Name: "a", MountPath: "/a"},
+		}
+		a, err := podVolumesHash(mountsA, volumes)
+		require.NoError(t, err)
+		b, err := podVolumesHash(mountsB, volumes)
+		require.NoError(t, err)
+		assert.Equal(t, a, b, "reordering must not change the hash")
+	})
+
+	t.Run("different secret refs produce different hashes", func(t *testing.T) {
+		t.Parallel()
+		baseMounts := []corev1.VolumeMount{{Name: "auth-key-0", MountPath: "/keys/key-0.pem", SubPath: "key-0.pem", ReadOnly: true}}
+		volA := []corev1.Volume{{
+			Name: "auth-key-0",
+			VolumeSource: corev1.VolumeSource{
+				Secret: &corev1.SecretVolumeSource{
+					SecretName: "keys-v1",
+					Items:      []corev1.KeyToPath{{Key: "pem", Path: "key-0.pem"}},
+				},
+			},
+		}}
+		volB := []corev1.Volume{{
+			Name: "auth-key-0",
+			VolumeSource: corev1.VolumeSource{
+				Secret: &corev1.SecretVolumeSource{
+					SecretName: "keys-v2",
+					Items:      []corev1.KeyToPath{{Key: "pem", Path: "key-0.pem"}},
+				},
+			},
+		}}
+		a, err := podVolumesHash(baseMounts, volA)
+		require.NoError(t, err)
+		b, err := podVolumesHash(baseMounts, volB)
+		require.NoError(t, err)
+		assert.NotEqual(t, a, b)
+	})
+}
+
+// TestDeploymentForVirtualMCPServer_PodVolumes_UpdatePath verifies that edits
+// to volume-only inputs (auth-server signing key secret refs) are detected by
+// deploymentNeedsUpdate and propagate to the live Deployment. Regression test
+// for #5619 where volume-only changes were silently missed.
+func TestDeploymentForVirtualMCPServer_PodVolumes_UpdatePath(t *testing.T) {
+	t.Parallel()
+
+	scheme := testutil.NewScheme(t)
+
+	r := &VirtualMCPServerReconciler{
+		Scheme:           scheme,
+		PlatformDetector: ctrlutil.NewSharedPlatformDetector(),
+	}
+
+	authConfig := func(secretName string) *mcpv1beta1.EmbeddedAuthServerConfig {
+		return &mcpv1beta1.EmbeddedAuthServerConfig{
+			SigningKeySecretRefs: []mcpv1beta1.SecretKeyRef{
+				{Name: secretName, Key: "pem"},
+			},
+		}
+	}
+
+	vmcp := v1beta1test.NewVirtualMCPServer("test-vmcp", "default",
+		v1beta1test.WithVMCPGroupRef("test-group"),
+		v1beta1test.WithVMCPAuthServerConfig(authConfig("keys-v1")),
+	)
+
+	initialDep := r.deploymentForVirtualMCPServer(t.Context(), vmcp, "test-checksum", nil, []workloads.TypedWorkload{})
+	require.NotNil(t, initialDep)
+	require.NotEmpty(t, initialDep.Annotations[podVolumesHashAnnotation])
+
+	vmcp.Spec.AuthServerConfig = authConfig("keys-v2")
+
+	needsUpdate := r.podVolumesNeedsUpdate(t.Context(), initialDep, vmcp, nil, []workloads.TypedWorkload{})
+	assert.True(t, needsUpdate, "signing key secret ref change must be detected as drift")
+
+	parentNeedsUpdate := r.deploymentNeedsUpdate(
+		t.Context(), initialDep, vmcp, "test-checksum", nil, []workloads.TypedWorkload{},
+	)
+	assert.True(t, parentNeedsUpdate, "deploymentNeedsUpdate must propagate pod volume drift")
+
+	updatedDep := r.deploymentForVirtualMCPServer(t.Context(), vmcp, "test-checksum", nil, []workloads.TypedWorkload{})
+	require.NotNil(t, updatedDep)
+
+	var secretName string
+	for _, vol := range updatedDep.Spec.Template.Spec.Volumes {
+		if vol.Secret != nil && strings.HasPrefix(vol.Name, "authserver-signing-key-") {
+			secretName = vol.Secret.SecretName
+			break
+		}
+	}
+	assert.Equal(t, "keys-v2", secretName, "rebuilt Deployment must mount the new secret")
+
+	settled := r.podVolumesNeedsUpdate(t.Context(), updatedDep, vmcp, nil, []workloads.TypedWorkload{})
+	assert.False(t, settled, "drift check must settle once Deployment is rebuilt")
+}
+
 // TestBuildHeaderForwardEnvVarsForEntries verifies the operator emits one env
 // var per (entry, header) declared on an MCPServerEntry.spec.headerForward,
 // using literal values for plaintext and valueFrom.secretKeyRef for