From e641710a8987e7c37efa2a5bda4e88d860d738af Mon Sep 17 00:00:00 2001 From: syf2211 Date: Thu, 25 Jun 2026 16:40:05 +0000 Subject: [PATCH] fix(vmcp): detect pod volume drift via hash annotation (#5619) Mirror the imagePullSecrets hash-annotation pattern so changes to operator-managed volumes and volumeMounts (auth-server signing keys, CA-bundle refs, telemetry CA bundles) trigger a Deployment rollout. Adds buildPodVolumesForVmcp shared by the deploy builder and drift check, podVolumesHash with canonical source fingerprinting that ignores K8s-defaulted fields like defaultMode, and regression tests. --- .../virtualmcpserver_controller.go | 40 ++++ .../virtualmcpserver_controller_test.go | 5 +- .../virtualmcpserver_deployment.go | 187 +++++++++++++++--- .../virtualmcpserver_deployment_test.go | 129 +++++++++++- 4 files changed, 333 insertions(+), 28 deletions(-) diff --git a/cmd/thv-operator/controllers/virtualmcpserver_controller.go b/cmd/thv-operator/controllers/virtualmcpserver_controller.go index ea94504456..9ea1ae6880 100644 --- a/cmd/thv-operator/controllers/virtualmcpserver_controller.go +++ b/cmd/thv-operator/controllers/virtualmcpserver_controller.go @@ -1621,6 +1621,10 @@ func (r *VirtualMCPServerReconciler) deploymentNeedsUpdate( return true } + if r.podVolumesNeedsUpdate(ctx, deployment, vmcp, telemetryCfg, typedWorkloads) { + return true + } + // Check if spec.replicas has changed. Only compare when spec.replicas is non-nil; // nil means hands-off mode (HPA or external controller manages replicas) and the live count is authoritative. if vmcp.Spec.Replicas != nil { @@ -1802,6 +1806,42 @@ func (r *VirtualMCPServerReconciler) imagePullSecretsNeedsUpdate( return deployment.Annotations[imagePullRefsHashAnnotation] != expectedHash } +// podVolumesNeedsUpdate detects drift on the desired pod volumes and volumeMounts +// by comparing a hash of the desired set against podVolumesHashAnnotation. We +// cannot compare deployment.Spec.Template.Spec.Volumes directly because the +// live list includes K8s-defaulted fields (defaultMode, etc.) that would cause +// spurious drift or mask real changes when only the backing secret/configmap +// ref changes (see #5619). +func (r *VirtualMCPServerReconciler) podVolumesNeedsUpdate( + ctx context.Context, + deployment *appsv1.Deployment, + vmcp *mcpv1beta1.VirtualMCPServer, + telemetryCfg *mcpv1beta1.MCPTelemetryConfig, + typedWorkloads []workloads.TypedWorkload, +) bool { + if deployment == nil || vmcp == nil { + return true + } + + volumeMounts, volumes, err := r.buildPodVolumesForVmcp(ctx, vmcp, telemetryCfg, typedWorkloads) + if err != nil { + log.FromContext(ctx).Error(err, "Failed to build pod volumes for drift check, assuming update needed") + return true + } + + expectedHash, err := podVolumesHash(volumeMounts, volumes) + if err != nil { + log.FromContext(ctx).Error(err, "Failed to hash pod volumes, assuming update needed") + return true + } + + _, present := deployment.Annotations[podVolumesHashAnnotation] + if expectedHash == "" { + return present + } + return deployment.Annotations[podVolumesHashAnnotation] != expectedHash +} + // serviceNeedsUpdate checks if the service needs to be updated func (*VirtualMCPServerReconciler) serviceNeedsUpdate( service *corev1.Service, diff --git a/cmd/thv-operator/controllers/virtualmcpserver_controller_test.go b/cmd/thv-operator/controllers/virtualmcpserver_controller_test.go index ccf61555d4..68ce18aa0c 100644 --- a/cmd/thv-operator/controllers/virtualmcpserver_controller_test.go +++ b/cmd/thv-operator/controllers/virtualmcpserver_controller_test.go @@ -1905,6 +1905,9 @@ func TestVirtualMCPServerDeploymentNeedsUpdate(t *testing.T) { expectedLabels, expectedAnnotations := reconciler.buildPodTemplateMetadata( labelsForVirtualMCPServer(vmcp.Name), vmcp, vmcpConfigChecksum, ) + _, expectedDeploymentAnnotations := reconciler.buildDeploymentMetadataForVmcp( + context.Background(), labelsForVirtualMCPServer(vmcp.Name), vmcp, nil, []workloads.TypedWorkload{}, + ) tests := []struct { name string @@ -2014,7 +2017,7 @@ func TestVirtualMCPServerDeploymentNeedsUpdate(t *testing.T) { deployment: &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Labels: labelsForVirtualMCPServer(vmcp.Name), - Annotations: make(map[string]string), + Annotations: expectedDeploymentAnnotations, }, Spec: appsv1.DeploymentSpec{ Template: corev1.PodTemplateSpec{ diff --git a/cmd/thv-operator/controllers/virtualmcpserver_deployment.go b/cmd/thv-operator/controllers/virtualmcpserver_deployment.go index 4b6120004c..0261a21e94 100644 --- a/cmd/thv-operator/controllers/virtualmcpserver_deployment.go +++ b/cmd/thv-operator/controllers/virtualmcpserver_deployment.go @@ -50,6 +50,14 @@ const ( // detect every input that influences the deployed PodSpec.ImagePullSecrets. imagePullRefsHashAnnotation = "toolhive.stacklok.io/imagepullsecrets-hash" + // podVolumesHashAnnotation tracks the SHA256 hash of the desired pod + // volumes and volumeMounts list. Mirrors the imagePullRefsHashAnnotation + // pattern to detect drift on volume-only inputs (auth-server signing keys, + // CA-bundle refs, telemetry CA bundles) without comparing live + // PodSpec.Volumes directly, which trips on K8s-defaulted fields like + // defaultMode. + podVolumesHashAnnotation = "toolhive.stacklok.io/podvolumes-hash" + // Log level configuration logLevelDebug = "debug" // Debug log level value @@ -147,7 +155,7 @@ func (r *VirtualMCPServerReconciler) deploymentForVirtualMCPServer( // Build deployment components using helper functions args := r.buildContainerArgsForVmcp(vmcp) - volumeMounts, volumes, err := r.buildVolumesForVmcp(ctx, vmcp) + volumeMounts, volumes, err := r.buildPodVolumesForVmcp(ctx, vmcp, telemetryCfg, typedWorkloads) if err != nil { log.FromContext(ctx).Error(err, "Failed to build volumes for VirtualMCPServer") return nil @@ -158,31 +166,9 @@ func (r *VirtualMCPServerReconciler) deploymentForVirtualMCPServer( return nil } - // Add CA bundle volumes for MCPServerEntry backends with caBundleRef - caVolumes, caMounts, err := r.buildCABundleVolumesForEntries(ctx, vmcp.Namespace, typedWorkloads) - if err != nil { - log.FromContext(ctx).Error(err, "Failed to build CA bundle volumes for MCPServerEntries") - return nil - } - volumes = append(volumes, caVolumes...) - volumeMounts = append(volumeMounts, caMounts...) - - // Add telemetry CA bundle volumes from the pre-fetched MCPTelemetryConfig - if telemetryCfg != nil { - telVolumes, telMounts := ctrlutil.AddTelemetryCABundleVolumes(telemetryCfg) - volumes = append(volumes, telVolumes...) - volumeMounts = append(volumeMounts, telMounts...) - } - - // Add embedded auth server volumes if configured (inline config). The matching - // env vars are injected by buildEnvVarsForVmcp above so the drift check stays - // symmetric with what is built here (see #5616). - if vmcp.Spec.AuthServerConfig != nil { - authServerVolumes, authServerMounts := ctrlutil.GenerateAuthServerVolumes(vmcp.Spec.AuthServerConfig) - volumes = append(volumes, authServerVolumes...) - volumeMounts = append(volumeMounts, authServerMounts...) - } - deploymentLabels, deploymentAnnotations := r.buildDeploymentMetadataForVmcp(ls, vmcp) + deploymentLabels, deploymentAnnotations := r.buildDeploymentMetadataForVmcp( + ctx, ls, vmcp, telemetryCfg, typedWorkloads, + ) deploymentTemplateLabels, deploymentTemplateAnnotations := r.buildPodTemplateMetadata(ls, vmcp, vmcpConfigChecksum) podSecurityContext, containerSecurityContext := r.buildSecurityContextsForVmcp(ctx, vmcp) serviceAccountName := r.serviceAccountNameForVmcp(vmcp) @@ -329,6 +315,44 @@ func (r *VirtualMCPServerReconciler) buildVolumesForVmcp( return volumeMounts, volumes, nil } +// buildPodVolumesForVmcp assembles the full desired pod volumes and volumeMounts +// for a VirtualMCPServer Deployment. Shared by deploymentForVirtualMCPServer and +// the podVolumesNeedsUpdate drift check so both paths hash the same inputs. +func (r *VirtualMCPServerReconciler) buildPodVolumesForVmcp( + ctx context.Context, + vmcp *mcpv1beta1.VirtualMCPServer, + telemetryCfg *mcpv1beta1.MCPTelemetryConfig, + typedWorkloads []workloads.TypedWorkload, +) ([]corev1.VolumeMount, []corev1.Volume, error) { + volumeMounts, volumes, err := r.buildVolumesForVmcp(ctx, vmcp) + if err != nil { + return nil, nil, err + } + + caVolumes, caMounts, err := r.buildCABundleVolumesForEntries(ctx, vmcp.Namespace, typedWorkloads) + if err != nil { + return nil, nil, err + } + volumes = append(volumes, caVolumes...) + volumeMounts = append(volumeMounts, caMounts...) + + if telemetryCfg != nil { + telVolumes, telMounts := ctrlutil.AddTelemetryCABundleVolumes(telemetryCfg) + volumes = append(volumes, telVolumes...) + volumeMounts = append(volumeMounts, telMounts...) + } + + // The matching env vars are injected by buildEnvVarsForVmcp so the drift + // check stays symmetric with what is built here (see #5616). + if vmcp.Spec.AuthServerConfig != nil { + authServerVolumes, authServerMounts := ctrlutil.GenerateAuthServerVolumes(vmcp.Spec.AuthServerConfig) + volumes = append(volumes, authServerVolumes...) + volumeMounts = append(volumeMounts, authServerMounts...) + } + + return volumeMounts, volumes, nil +} + // buildEnvVarsForVmcp builds environment variables for the vmcp container. // telemetryCfg is the already-fetched MCPTelemetryConfig (nil when not referenced). func (r *VirtualMCPServerReconciler) buildEnvVarsForVmcp( @@ -891,8 +915,11 @@ func (r *VirtualMCPServerReconciler) getExternalAuthConfigSecretEnvVars( // buildDeploymentMetadataForVmcp builds deployment-level labels and annotations func (r *VirtualMCPServerReconciler) buildDeploymentMetadataForVmcp( + ctx context.Context, baseLabels map[string]string, vmcp *mcpv1beta1.VirtualMCPServer, + telemetryCfg *mcpv1beta1.MCPTelemetryConfig, + typedWorkloads []workloads.TypedWorkload, ) (map[string]string, map[string]string) { deploymentLabels := baseLabels deploymentAnnotations := make(map[string]string) @@ -918,6 +945,16 @@ func (r *VirtualMCPServerReconciler) buildDeploymentMetadataForVmcp( deploymentAnnotations[imagePullRefsHashAnnotation] = hash } + // Store hash of the desired pod volumes and volumeMounts so + // deploymentNeedsUpdate can detect volume-only drift (auth-server secret + // refs, CA-bundle refs, telemetry CA bundles) without comparing live + // PodSpec.Volumes, which trips on K8s-defaulted fields (see #5619). + if volumeMounts, volumes, err := r.buildPodVolumesForVmcp(ctx, vmcp, telemetryCfg, typedWorkloads); err == nil { + if hash, err := podVolumesHash(volumeMounts, volumes); err == nil && hash != "" { + deploymentAnnotations[podVolumesHashAnnotation] = hash + } + } + // TODO: Add support for ResourceOverrides if needed in the future return deploymentLabels, deploymentAnnotations @@ -944,6 +981,104 @@ func imagePullSecretsHash(secrets []corev1.LocalObjectReference) (string, error) return hex.EncodeToString(h[:]), nil } +// podVolumeHashEntry captures drift-relevant volume fields without K8s-defaulted noise. +type podVolumeHashEntry struct { + Name string `json:"name"` + Source string `json:"source"` +} + +// podVolumeMountHashEntry captures drift-relevant volumeMount fields. +type podVolumeMountHashEntry struct { + Name string `json:"name"` + MountPath string `json:"mountPath"` + SubPath string `json:"subPath,omitempty"` + ReadOnly bool `json:"readOnly,omitempty"` +} + +// podVolumesHashPayload is the canonical form hashed by podVolumesHash. +type podVolumesHashPayload struct { + Volumes []podVolumeHashEntry `json:"volumes"` + VolumeMounts []podVolumeMountHashEntry `json:"volumeMounts"` +} + +// volumeSourceFingerprint returns a stable identifier for the backing resource +// referenced by a volume, ignoring K8s-defaulted fields like defaultMode. +func volumeSourceFingerprint(vol corev1.Volume) string { + switch { + case vol.ConfigMap != nil: + src := "configmap:" + vol.ConfigMap.Name + if len(vol.ConfigMap.Items) > 0 { + items := make([]string, len(vol.ConfigMap.Items)) + for i, item := range vol.ConfigMap.Items { + items[i] = item.Key + "=" + item.Path + } + sort.Strings(items) + src += ":" + strings.Join(items, ",") + } + return src + case vol.Secret != nil: + src := "secret:" + vol.Secret.SecretName + if len(vol.Secret.Items) > 0 { + items := make([]string, len(vol.Secret.Items)) + for i, item := range vol.Secret.Items { + items[i] = item.Key + "=" + item.Path + } + sort.Strings(items) + src += ":" + strings.Join(items, ",") + } + return src + case vol.EmptyDir != nil: + return "emptydir" + default: + return "unknown:" + vol.Name + } +} + +// podVolumesHash returns a deterministic SHA256 hash of the given volumes and +// volumeMounts. Returns an empty string with no error when both lists are empty. +func podVolumesHash(mounts []corev1.VolumeMount, volumes []corev1.Volume) (string, error) { + if len(mounts) == 0 && len(volumes) == 0 { + return "", nil + } + + payload := podVolumesHashPayload{ + Volumes: make([]podVolumeHashEntry, len(volumes)), + VolumeMounts: make([]podVolumeMountHashEntry, len(mounts)), + } + + for i, vol := range volumes { + payload.Volumes[i] = podVolumeHashEntry{ + Name: vol.Name, + Source: volumeSourceFingerprint(vol), + } + } + sort.Slice(payload.Volumes, func(i, j int) bool { + return payload.Volumes[i].Name < payload.Volumes[j].Name + }) + + for i, mount := range mounts { + payload.VolumeMounts[i] = podVolumeMountHashEntry{ + Name: mount.Name, + MountPath: mount.MountPath, + SubPath: mount.SubPath, + ReadOnly: mount.ReadOnly, + } + } + sort.Slice(payload.VolumeMounts, func(i, j int) bool { + if payload.VolumeMounts[i].Name != payload.VolumeMounts[j].Name { + return payload.VolumeMounts[i].Name < payload.VolumeMounts[j].Name + } + return payload.VolumeMounts[i].MountPath < payload.VolumeMounts[j].MountPath + }) + + canonical, err := json.Marshal(payload) + if err != nil { + return "", fmt.Errorf("failed to marshal pod volumes for hashing: %w", err) + } + h := sha256.Sum256(canonical) + return hex.EncodeToString(h[:]), nil +} + // buildPodTemplateMetadata builds pod template labels and annotations for vmcp func (*VirtualMCPServerReconciler) buildPodTemplateMetadata( baseLabels map[string]string, diff --git a/cmd/thv-operator/controllers/virtualmcpserver_deployment_test.go b/cmd/thv-operator/controllers/virtualmcpserver_deployment_test.go index ef3ee1482d..a315832ca6 100644 --- a/cmd/thv-operator/controllers/virtualmcpserver_deployment_test.go +++ b/cmd/thv-operator/controllers/virtualmcpserver_deployment_test.go @@ -333,10 +333,14 @@ func TestBuildDeploymentMetadataForVmcp(t *testing.T) { vmcp := v1beta1test.NewVirtualMCPServer("test-vmcp", "default") r := &VirtualMCPServerReconciler{} - labels, annotations := r.buildDeploymentMetadataForVmcp(baseLabels, vmcp) + labels, annotations := r.buildDeploymentMetadataForVmcp( + t.Context(), baseLabels, vmcp, nil, []workloads.TypedWorkload{}, + ) assert.Equal(t, baseLabels, labels) assert.NotNil(t, annotations) + assert.NotEmpty(t, annotations[podVolumesHashAnnotation], + "podVolumesHashAnnotation must be set for the default vmcp-config volume") } // TestBuildPodTemplateMetadata tests pod template metadata generation @@ -1134,6 +1138,129 @@ func TestImagePullSecretsHash(t *testing.T) { }) } +// TestPodVolumesHash verifies the hash helper normalizes order, treats empty +// lists as the sentinel "" hash, and distinguishes different backing refs. +func TestPodVolumesHash(t *testing.T) { + t.Parallel() + + t.Run("empty lists return empty hash", func(t *testing.T) { + t.Parallel() + hash, err := podVolumesHash(nil, nil) + require.NoError(t, err) + assert.Empty(t, hash) + }) + + t.Run("volume mount order-insensitive", func(t *testing.T) { + t.Parallel() + volumes := []corev1.Volume{{ + Name: "vmcp-config", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{Name: "cfg"}, + }, + }, + }} + mountsA := []corev1.VolumeMount{ + {Name: "a", MountPath: "/a"}, + {Name: "b", MountPath: "/b"}, + } + mountsB := []corev1.VolumeMount{ + {Name: "b", MountPath: "/b"}, + {Name: "a", MountPath: "/a"}, + } + a, err := podVolumesHash(mountsA, volumes) + require.NoError(t, err) + b, err := podVolumesHash(mountsB, volumes) + require.NoError(t, err) + assert.Equal(t, a, b, "reordering must not change the hash") + }) + + t.Run("different secret refs produce different hashes", func(t *testing.T) { + t.Parallel() + baseMounts := []corev1.VolumeMount{{Name: "auth-key-0", MountPath: "/keys/key-0.pem", SubPath: "key-0.pem", ReadOnly: true}} + volA := []corev1.Volume{{ + Name: "auth-key-0", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: "keys-v1", + Items: []corev1.KeyToPath{{Key: "pem", Path: "key-0.pem"}}, + }, + }, + }} + volB := []corev1.Volume{{ + Name: "auth-key-0", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: "keys-v2", + Items: []corev1.KeyToPath{{Key: "pem", Path: "key-0.pem"}}, + }, + }, + }} + a, err := podVolumesHash(baseMounts, volA) + require.NoError(t, err) + b, err := podVolumesHash(baseMounts, volB) + require.NoError(t, err) + assert.NotEqual(t, a, b) + }) +} + +// TestDeploymentForVirtualMCPServer_PodVolumes_UpdatePath verifies that edits +// to volume-only inputs (auth-server signing key secret refs) are detected by +// deploymentNeedsUpdate and propagate to the live Deployment. Regression test +// for #5619 where volume-only changes were silently missed. +func TestDeploymentForVirtualMCPServer_PodVolumes_UpdatePath(t *testing.T) { + t.Parallel() + + scheme := testutil.NewScheme(t) + + r := &VirtualMCPServerReconciler{ + Scheme: scheme, + PlatformDetector: ctrlutil.NewSharedPlatformDetector(), + } + + authConfig := func(secretName string) *mcpv1beta1.EmbeddedAuthServerConfig { + return &mcpv1beta1.EmbeddedAuthServerConfig{ + SigningKeySecretRefs: []mcpv1beta1.SecretKeyRef{ + {Name: secretName, Key: "pem"}, + }, + } + } + + vmcp := v1beta1test.NewVirtualMCPServer("test-vmcp", "default", + v1beta1test.WithVMCPGroupRef("test-group"), + v1beta1test.WithVMCPAuthServerConfig(authConfig("keys-v1")), + ) + + initialDep := r.deploymentForVirtualMCPServer(t.Context(), vmcp, "test-checksum", nil, []workloads.TypedWorkload{}) + require.NotNil(t, initialDep) + require.NotEmpty(t, initialDep.Annotations[podVolumesHashAnnotation]) + + vmcp.Spec.AuthServerConfig = authConfig("keys-v2") + + needsUpdate := r.podVolumesNeedsUpdate(t.Context(), initialDep, vmcp, nil, []workloads.TypedWorkload{}) + assert.True(t, needsUpdate, "signing key secret ref change must be detected as drift") + + parentNeedsUpdate := r.deploymentNeedsUpdate( + t.Context(), initialDep, vmcp, "test-checksum", nil, []workloads.TypedWorkload{}, + ) + assert.True(t, parentNeedsUpdate, "deploymentNeedsUpdate must propagate pod volume drift") + + updatedDep := r.deploymentForVirtualMCPServer(t.Context(), vmcp, "test-checksum", nil, []workloads.TypedWorkload{}) + require.NotNil(t, updatedDep) + + var secretName string + for _, vol := range updatedDep.Spec.Template.Spec.Volumes { + if vol.Secret != nil && strings.HasPrefix(vol.Name, "authserver-signing-key-") { + secretName = vol.Secret.SecretName + break + } + } + assert.Equal(t, "keys-v2", secretName, "rebuilt Deployment must mount the new secret") + + settled := r.podVolumesNeedsUpdate(t.Context(), updatedDep, vmcp, nil, []workloads.TypedWorkload{}) + assert.False(t, settled, "drift check must settle once Deployment is rebuilt") +} + // TestBuildHeaderForwardEnvVarsForEntries verifies the operator emits one env // var per (entry, header) declared on an MCPServerEntry.spec.headerForward, // using literal values for plaintext and valueFrom.secretKeyRef for