From 01bffcf97f38362e5e8cca8a89c26fae0105a45c Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Tue, 23 Jun 2026 23:06:29 +0100 Subject: [PATCH 1/2] Default storage version migrator to enabled The StorageVersionMigrator controller shipped opt-in (chart default false) so it could land without functional impact. It has soaked through a release, so make the operator helm chart enable it by default: clusters get their CRD status.storedVersions kept clean automatically, which is the precondition for a future release to drop deprecated API versions without orphaning etcd objects. - Flip operator.features.storageVersionMigrator to true in values.yaml - Update the default-install test to assert the env var renders "true" - Add explicit opt-out (false) coverage to the feature-flags suite - Refresh the chart README, reference docs, and upgrade-guide notes - Sync the now-stale opt-in comments in the operator's env-var gate (the binary still defaults off when the var is unset; the chart sets it) Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/thv-operator/app/app.go | 18 ++++++++--------- deploy/charts/operator/README.md | 4 ++-- .../operator/tests/default_install_test.yaml | 2 +- .../operator/tests/feature_flags_test.yaml | 20 ++++++++++++++----- deploy/charts/operator/values.yaml | 5 +++-- docs/operator/storage-version-migration.md | 14 ++++++------- docs/operator/upgrade-guide/README.md | 8 ++++---- 7 files changed, 41 insertions(+), 30 deletions(-) diff --git a/cmd/thv-operator/app/app.go b/cmd/thv-operator/app/app.go index 72b704120e..2aad8b882c 100644 --- a/cmd/thv-operator/app/app.go +++ b/cmd/thv-operator/app/app.go @@ -49,11 +49,11 @@ var ( setupLog = log.Log.WithName("setup") ) -// envEnableStorageVersionMigrator is the opt-in for the StorageVersionMigrator -// controller. The controller defaults to OFF in this release so the change can -// ship safely without functional impact. Set to "true" (or "1", "t") to enable. -// A follow-up release will flip the default to true alongside the helm chart -// surface and user docs. +// envEnableStorageVersionMigrator gates the StorageVersionMigrator controller. +// The binary itself defaults to OFF when the var is unset; the operator helm +// chart sets it to "true" by default, so chart-based installs run the migrator +// unless the operator explicitly opts out. Set to "true" (or "1", "t") to +// enable, "false" to disable. const envEnableStorageVersionMigrator = "TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR" func init() { @@ -197,10 +197,10 @@ func setupStorageVersionMigrator(mgr ctrl.Manager) error { } // isStorageVersionMigratorEnabled reports whether the StorageVersionMigrator -// controller should be registered. Defaults to false in this release — admins -// must explicitly opt in via TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR=true. -// An unparsable value returns an error so startup fails loudly rather than -// silently disabling the feature an admin asked to turn on. +// controller should be registered. Defaults to false when +// TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR is unset; the operator helm chart +// sets it to "true" by default. An unparsable value returns an error so startup +// fails loudly rather than silently disabling the feature an admin asked to turn on. func isStorageVersionMigratorEnabled() (bool, error) { value, found := os.LookupEnv(envEnableStorageVersionMigrator) if !found { diff --git a/deploy/charts/operator/README.md b/deploy/charts/operator/README.md index cc3e8125a4..ea70969d5c 100644 --- a/deploy/charts/operator/README.md +++ b/deploy/charts/operator/README.md @@ -46,7 +46,7 @@ The command removes all the Kubernetes components associated with the chart and |-----|------|---------|-------------| | fullnameOverride | string | `"toolhive-operator"` | Provide a fully-qualified name override for resources | | nameOverride | string | `""` | Override the name of the chart | -| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"defaultImagePullSecrets":[],"defaultRedis":{"addr":"","existingSecret":"","existingSecretKey":""},"env":[],"features":{"experimental":false,"storageVersionMigrator":false},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.30.0","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["events.k8s.io"],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.30.0","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.30.0","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | +| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"defaultImagePullSecrets":[],"defaultRedis":{"addr":"","existingSecret":"","existingSecretKey":""},"env":[],"features":{"experimental":false,"storageVersionMigrator":true},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.30.0","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["events.k8s.io"],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.30.0","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.30.0","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | | operator.affinity | object | `{}` | Affinity settings for the operator pod | | operator.autoscaling | object | `{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | Configuration for horizontal pod autoscaling | | operator.autoscaling.enabled | bool | `false` | Enable autoscaling for the operator | @@ -61,7 +61,7 @@ The command removes all the Kubernetes components associated with the chart and | operator.defaultRedis.existingSecretKey | string | `""` | existingSecretKey is the key within existingSecret that holds the password. Empty means use global.redis.existingSecretKey or fall back to "redis-password". | | operator.env | list | `[]` | Environment variables to set in the operator container. Supported toolhive-specific variables include: - TOOLHIVE_SKIP_UPDATE_CHECK: set to "true" to disable the operator's periodic update check against the ToolHive update API. Also disables the usage-metrics collection that is gated on the same check. | | operator.features.experimental | bool | `false` | Enable experimental features | -| operator.features.storageVersionMigrator | bool | `false` | Enable the StorageVersionMigrator controller, which auto-cleans status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a future release can drop deprecated versions (e.g. v1alpha1) without orphaning etcd objects in the cluster. Sets TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR in the operator deployment. | +| operator.features.storageVersionMigrator | bool | `true` | Enable the StorageVersionMigrator controller, which auto-cleans status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a future release can drop deprecated versions (e.g. v1alpha1) without orphaning etcd objects in the cluster. Enabled by default; set to false to opt out and handle storage-version cleanup yourself. Sets TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR in the operator deployment. | | operator.gc | object | `{"gogc":75,"gomemlimit":"110MiB"}` | Go memory limits and garbage collection percentage for the operator container | | operator.gc.gogc | int | `75` | Go garbage collection percentage for the operator container | | operator.gc.gomemlimit | string | `"110MiB"` | Go memory limits for the operator container | diff --git a/deploy/charts/operator/tests/default_install_test.yaml b/deploy/charts/operator/tests/default_install_test.yaml index adc4f2145e..440c0e9f4e 100644 --- a/deploy/charts/operator/tests/default_install_test.yaml +++ b/deploy/charts/operator/tests/default_install_test.yaml @@ -52,7 +52,7 @@ tests: template: deployment.yaml asserts: - contains: { path: 'spec.template.spec.containers[0].env', content: { name: ENABLE_EXPERIMENTAL_FEATURES, value: "false" } } - - contains: { path: 'spec.template.spec.containers[0].env', content: { name: TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR, value: "false" } } + - contains: { path: 'spec.template.spec.containers[0].env', content: { name: TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR, value: "true" } } - contains: { path: 'spec.template.spec.containers[0].env', content: { name: UNSTRUCTURED_LOGS, value: "false" } } - contains: { path: 'spec.template.spec.containers[0].env', content: { name: TOOLHIVE_USE_CONFIGMAP, value: "true" } } - contains: { path: 'spec.template.spec.containers[0].env', content: { name: GOGC, value: "75" } } diff --git a/deploy/charts/operator/tests/feature_flags_test.yaml b/deploy/charts/operator/tests/feature_flags_test.yaml index 6faaf9b521..70b1588386 100644 --- a/deploy/charts/operator/tests/feature_flags_test.yaml +++ b/deploy/charts/operator/tests/feature_flags_test.yaml @@ -1,15 +1,16 @@ suite: opt-in feature flags # The experimental and storage-version-migrator features are quoted-boolean env -# vars. A dropped quote or a renamed var silently disables the feature, so pin -# both the enabled and (in the baseline suite) disabled states. -set: - operator.features.experimental: true - operator.features.storageVersionMigrator: true +# vars. A dropped quote or a renamed var silently flips the feature, so pin both +# the enabled and disabled states. storageVersionMigrator defaults to true, so +# the disabled state is the one the baseline suite can't cover — pin it here. templates: - deployment.yaml tests: - it: flips both feature-flag env vars to the quoted string "true" template: deployment.yaml + set: + operator.features.experimental: true + operator.features.storageVersionMigrator: true asserts: - contains: path: 'spec.template.spec.containers[0].env' @@ -17,3 +18,12 @@ tests: - contains: path: 'spec.template.spec.containers[0].env' content: { name: TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR, value: "true" } + + - it: opts out of the storage version migrator when set to false + template: deployment.yaml + set: + operator.features.storageVersionMigrator: false + asserts: + - contains: + path: 'spec.template.spec.containers[0].env' + content: { name: TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR, value: "false" } diff --git a/deploy/charts/operator/values.yaml b/deploy/charts/operator/values.yaml index c7f71fb2e2..5550dd72fd 100644 --- a/deploy/charts/operator/values.yaml +++ b/deploy/charts/operator/values.yaml @@ -11,9 +11,10 @@ operator: # -- Enable the StorageVersionMigrator controller, which auto-cleans # status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a # future release can drop deprecated versions (e.g. v1alpha1) without - # orphaning etcd objects in the cluster. Sets + # orphaning etcd objects in the cluster. Enabled by default; set to + # false to opt out and handle storage-version cleanup yourself. Sets # TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR in the operator deployment. - storageVersionMigrator: false + storageVersionMigrator: true # -- Number of replicas for the operator deployment replicaCount: 1 diff --git a/docs/operator/storage-version-migration.md b/docs/operator/storage-version-migration.md index 36e4cc6c84..10fcaaa9b9 100644 --- a/docs/operator/storage-version-migration.md +++ b/docs/operator/storage-version-migration.md @@ -2,7 +2,7 @@ The ToolHive operator ships a `StorageVersionMigrator` controller that keeps every ToolHive CRD's `status.storedVersions` list clean, so a future operator release can drop deprecated API versions (e.g. `v1alpha1`) without orphaning objects in etcd. -The controller is **opt-in** via the `operator.features.storageVersionMigrator` chart value (or the `TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR` environment variable if you inject it directly via `operator.env`). +The controller is **enabled by default** via the `operator.features.storageVersionMigrator` chart value (or the `TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR` environment variable if you inject it directly via `operator.env`). Set the value to `false` to opt out. ## Why this exists @@ -71,16 +71,16 @@ Then `task operator-manifests` to regenerate the CRD YAML. CI verifies the marke ## Enabling the controller -Set the Helm feature flag at install or upgrade time: +The controller is enabled by default — `operator.features.storageVersionMigrator` defaults to `true`, which sets `TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR=true` on the operator Deployment and registers the reconciler with the manager. No action is required to use it. + +To opt out, set the Helm feature flag to `false` at install or upgrade time: ```yaml operator: features: - storageVersionMigrator: true + storageVersionMigrator: false ``` -This sets `TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR=true` on the operator Deployment and registers the reconciler with the manager. - Once enabled, the controller is dormant on CRDs whose `storedVersions` already equals `[]` — most of the time, most CRDs. It only does meaningful work when a CRD's stored-versions list is dirty (typically right after a graduation release). ## Per-CRD emergency escape hatch @@ -98,10 +98,10 @@ Intended for incident response only. If you deploy the operator via GitOps (Argo The `StorageVersionMigrator` must have run against your cluster *before* an operator release that drops a deprecated CRD version ships. The typical sequence is: -1. **Release N**: both versions served, newer version is storage, `StorageVersionMigrator` available (opt-in via the chart flag). Operators that enable the migrator have their `storedVersions` trimmed during this deprecation window. +1. **Release N**: both versions served, newer version is storage, `StorageVersionMigrator` enabled by default. Operators running the migrator have their `storedVersions` trimmed during this deprecation window. 2. **Release N+1+**: the deprecated version is removed from `spec.versions`. Because every cluster that enabled the migrator already has clean `storedVersions`, the CRD update applies. -> **⚠ Skip-a-version upgrade trap.** If your cluster upgrades directly from a pre-migrator release to the version-removal release without ever running an intermediate release that runs the migrator, the helm upgrade will **fail** when the API server refuses to remove the deprecated version from `spec.versions`. To recover: deploy [kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator) once to clean `storedVersions`, then retry the upgrade. To avoid the trap entirely, install each release in sequence, and enable `storageVersionMigrator: true` at least one release before any version-removal release. +> **⚠ Skip-a-version upgrade trap.** If your cluster upgrades directly from a pre-migrator release to the version-removal release without ever running an intermediate release that runs the migrator, the helm upgrade will **fail** when the API server refuses to remove the deprecated version from `spec.versions`. To recover: deploy [kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator) once to clean `storedVersions`, then retry the upgrade. To avoid the trap entirely, install each release in sequence, and keep `storageVersionMigrator` enabled (the default) for at least one release before any version-removal release. ## Verification diff --git a/docs/operator/upgrade-guide/README.md b/docs/operator/upgrade-guide/README.md index 4b0b0e3911..3dd1f3b7ce 100644 --- a/docs/operator/upgrade-guide/README.md +++ b/docs/operator/upgrade-guide/README.md @@ -211,10 +211,10 @@ kubectl rollout status deployment -n toolhive-system --timeout=180s # Confirm flag is now true — read off the Deployment spec directly (see step 5 # for why a pod-based check races with the old pod's Terminating state). -# NOTE: the flag must be set explicitly. The chart default is -# storageVersionMigrator=false (the feature is opt-in), and `helm upgrade` does -# not reuse the previous release's --set values, so omitting it here renders the -# env var as false and the migrator never runs. +# NOTE: this is set explicitly here only because step 5 explicitly disabled it +# (and `helm upgrade` does not reuse the previous release's --set values). The +# chart default is storageVersionMigrator=true, so a normal install/upgrade that +# omits the flag entirely already runs the migrator. kubectl get deploy -n toolhive-system toolhive-operator \ -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR")].value}' # expected: true From b3a57f0b3ba000fd18395e2684c17758b634830f Mon Sep 17 00:00:00 2001 From: Chris Burns <29541485+ChrisJBurns@users.noreply.github.com> Date: Thu, 25 Jun 2026 12:53:08 +0100 Subject: [PATCH 2/2] Reject storage version migrator in namespace scope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The StorageVersionMigrator watches cluster-scoped CustomResourceDefinitions and re-stores custom resources across all namespaces, so it requires cluster-scoped RBAC and a cluster-scoped manager cache. A namespace-scoped operator gets only per-namespace RoleBindings and a namespace-restricted cache, so the controller cannot sync its CRD informer and wedges the manager — which is what broke the multi-tenancy chainsaw E2E once the feature was defaulted on. Add a Helm validation that fails the render when storageVersionMigrator is true under operator.rbac.scope=namespace, with an actionable message. Fail loudly rather than silently dropping the feature: a silent drop would let a namespace-scoped admin believe storedVersions are being kept clean when they are not, leading to the skip-a-version upgrade trap later. - Add validateStorageVersionMigrator helper; invoke it from deployment.yaml - Opt the namespace-scoped multi-tenancy chainsaw setup out of the migrator - Pin the rejection and the supported namespace opt-out in helm unittests; disable the migrator in the namespace-scope suite's shared values - Document the cluster-scope requirement and the standalone-migrator alternative for namespace-scoped clusters Co-Authored-By: Claude Opus 4.8 (1M context) --- deploy/charts/operator/README.md | 2 +- deploy/charts/operator/templates/_helpers.tpl | 22 +++++++++++++- .../charts/operator/templates/deployment.yaml | 1 + .../operator/tests/feature_flags_test.yaml | 29 +++++++++++++++++++ .../operator/tests/namespace_scope_test.yaml | 3 ++ deploy/charts/operator/values.yaml | 3 ++ docs/operator/storage-version-migration.md | 13 +++++++++ .../multi-tenancy/setup/chainsaw-test.yaml | 4 +++ 8 files changed, 75 insertions(+), 2 deletions(-) diff --git a/deploy/charts/operator/README.md b/deploy/charts/operator/README.md index ea70969d5c..8055d76c18 100644 --- a/deploy/charts/operator/README.md +++ b/deploy/charts/operator/README.md @@ -61,7 +61,7 @@ The command removes all the Kubernetes components associated with the chart and | operator.defaultRedis.existingSecretKey | string | `""` | existingSecretKey is the key within existingSecret that holds the password. Empty means use global.redis.existingSecretKey or fall back to "redis-password". | | operator.env | list | `[]` | Environment variables to set in the operator container. Supported toolhive-specific variables include: - TOOLHIVE_SKIP_UPDATE_CHECK: set to "true" to disable the operator's periodic update check against the ToolHive update API. Also disables the usage-metrics collection that is gated on the same check. | | operator.features.experimental | bool | `false` | Enable experimental features | -| operator.features.storageVersionMigrator | bool | `true` | Enable the StorageVersionMigrator controller, which auto-cleans status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a future release can drop deprecated versions (e.g. v1alpha1) without orphaning etcd objects in the cluster. Enabled by default; set to false to opt out and handle storage-version cleanup yourself. Sets TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR in the operator deployment. | +| operator.features.storageVersionMigrator | bool | `true` | Enable the StorageVersionMigrator controller, which auto-cleans status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a future release can drop deprecated versions (e.g. v1alpha1) without orphaning etcd objects in the cluster. Enabled by default; set to false to opt out and handle storage-version cleanup yourself. Sets TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR in the operator deployment. Requires `operator.rbac.scope=cluster` — the controller watches cluster-scoped CRDs and re-stores resources across all namespaces, so the chart rejects this being true when scope is namespace. | | operator.gc | object | `{"gogc":75,"gomemlimit":"110MiB"}` | Go memory limits and garbage collection percentage for the operator container | | operator.gc.gogc | int | `75` | Go garbage collection percentage for the operator container | | operator.gc.gomemlimit | string | `"110MiB"` | Go memory limits for the operator container | diff --git a/deploy/charts/operator/templates/_helpers.tpl b/deploy/charts/operator/templates/_helpers.tpl index c158a32423..8b78e6742a 100644 --- a/deploy/charts/operator/templates/_helpers.tpl +++ b/deploy/charts/operator/templates/_helpers.tpl @@ -68,4 +68,24 @@ Common labels for the toolhive resources {{- define "toolhive.labels" -}} app: toolhive app.kubernetes.io/name: toolhive -{{- end }} \ No newline at end of file +{{- end }} + +{{/* +Validate feature-flag / RBAC-scope combinations and fail the render early with +an actionable message rather than deploying a wedged operator. + +The StorageVersionMigrator controller watches cluster-scoped +CustomResourceDefinition objects and re-stores custom resources across every +namespace, so it requires cluster-scoped RBAC and a cluster-scoped manager +cache. A namespace-scoped operator (operator.rbac.scope=namespace) gets only +per-namespace RoleBindings and a namespace-restricted cache, so the controller +cannot sync its CRD informer and would prevent the manager from starting. We +fail loudly here instead of silently dropping the feature, because a silent +drop would let a namespace-scoped admin believe storedVersions are being kept +clean when they are not. +*/}} +{{- define "toolhive-operator.validateStorageVersionMigrator" -}} +{{- if and .Values.operator.features.storageVersionMigrator (ne .Values.operator.rbac.scope "cluster") -}} +{{- fail "operator.features.storageVersionMigrator requires operator.rbac.scope=cluster: the StorageVersionMigrator controller watches cluster-scoped CustomResourceDefinitions and re-stores resources across all namespaces, which a namespace-scoped operator cannot do. Set operator.features.storageVersionMigrator=false for namespace-scoped installs." -}} +{{- end -}} +{{- end -}} \ No newline at end of file diff --git a/deploy/charts/operator/templates/deployment.yaml b/deploy/charts/operator/templates/deployment.yaml index c83ba99db0..8961a5bb5a 100644 --- a/deploy/charts/operator/templates/deployment.yaml +++ b/deploy/charts/operator/templates/deployment.yaml @@ -1,3 +1,4 @@ +{{- include "toolhive-operator.validateStorageVersionMigrator" . }} apiVersion: apps/v1 kind: Deployment metadata: diff --git a/deploy/charts/operator/tests/feature_flags_test.yaml b/deploy/charts/operator/tests/feature_flags_test.yaml index 70b1588386..3f78eced9a 100644 --- a/deploy/charts/operator/tests/feature_flags_test.yaml +++ b/deploy/charts/operator/tests/feature_flags_test.yaml @@ -3,6 +3,8 @@ suite: opt-in feature flags # vars. A dropped quote or a renamed var silently flips the feature, so pin both # the enabled and disabled states. storageVersionMigrator defaults to true, so # the disabled state is the one the baseline suite can't cover — pin it here. +# The migrator also requires cluster-scoped RBAC, so the chart rejects it in +# namespace scope; pin that validation too. templates: - deployment.yaml tests: @@ -27,3 +29,30 @@ tests: - contains: path: 'spec.template.spec.containers[0].env' content: { name: TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR, value: "false" } + + # The migrator watches cluster-scoped CRDs, so it must never be enabled in a + # namespace-scoped install. The chart fails the render rather than deploying a + # wedged operator. Pin both the rejection and the supported opt-out. + - it: rejects the storage version migrator in namespace scope + template: deployment.yaml + set: + operator.rbac.scope: namespace + operator.rbac.allowedNamespaces: [toolhive-system] + operator.features.storageVersionMigrator: true + asserts: + - failedTemplate: + errorPattern: "operator.features.storageVersionMigrator requires operator.rbac.scope=cluster" + + - it: allows namespace scope when the storage version migrator is disabled + template: deployment.yaml + set: + operator.rbac.scope: namespace + operator.rbac.allowedNamespaces: [toolhive-system] + operator.features.storageVersionMigrator: false + asserts: + - contains: + path: 'spec.template.spec.containers[0].env' + content: { name: TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR, value: "false" } + - contains: + path: 'spec.template.spec.containers[0].env' + content: { name: WATCH_NAMESPACE, value: "toolhive-system" } diff --git a/deploy/charts/operator/tests/namespace_scope_test.yaml b/deploy/charts/operator/tests/namespace_scope_test.yaml index b45858e15b..957f9421eb 100644 --- a/deploy/charts/operator/tests/namespace_scope_test.yaml +++ b/deploy/charts/operator/tests/namespace_scope_test.yaml @@ -6,6 +6,9 @@ suite: namespace-scoped install set: operator.rbac.scope: namespace operator.rbac.allowedNamespaces: [team-a, team-b] + # storageVersionMigrator requires cluster scope; the chart rejects it under + # namespace scope (see feature_flags_test.yaml), so disable it here. + operator.features.storageVersionMigrator: false templates: - deployment.yaml - clusterrole/rolebinding.yaml diff --git a/deploy/charts/operator/values.yaml b/deploy/charts/operator/values.yaml index 5550dd72fd..c645385537 100644 --- a/deploy/charts/operator/values.yaml +++ b/deploy/charts/operator/values.yaml @@ -14,6 +14,9 @@ operator: # orphaning etcd objects in the cluster. Enabled by default; set to # false to opt out and handle storage-version cleanup yourself. Sets # TOOLHIVE_ENABLE_STORAGE_VERSION_MIGRATOR in the operator deployment. + # Requires `operator.rbac.scope=cluster` — the controller watches + # cluster-scoped CRDs and re-stores resources across all namespaces, so + # the chart rejects this being true when scope is namespace. storageVersionMigrator: true # -- Number of replicas for the operator deployment replicaCount: 1 diff --git a/docs/operator/storage-version-migration.md b/docs/operator/storage-version-migration.md index 10fcaaa9b9..794b0bfefc 100644 --- a/docs/operator/storage-version-migration.md +++ b/docs/operator/storage-version-migration.md @@ -83,6 +83,19 @@ operator: Once enabled, the controller is dormant on CRDs whose `storedVersions` already equals `[]` — most of the time, most CRDs. It only does meaningful work when a CRD's stored-versions list is dirty (typically right after a graduation release). +### Requires cluster-scoped RBAC + +The migrator only works when the operator runs cluster-scoped (`operator.rbac.scope=cluster`, the default). It watches cluster-scoped `CustomResourceDefinition` objects and re-stores custom resources across **all** namespaces — neither is possible for a namespace-scoped operator, which gets only per-namespace `RoleBindings` and a namespace-restricted manager cache. To prevent a silently wedged operator, the chart **fails the render** if `storageVersionMigrator: true` is combined with `operator.rbac.scope=namespace`: + +``` +operator.features.storageVersionMigrator requires operator.rbac.scope=cluster: the +StorageVersionMigrator controller watches cluster-scoped CustomResourceDefinitions and +re-stores resources across all namespaces, which a namespace-scoped operator cannot do. +Set operator.features.storageVersionMigrator=false for namespace-scoped installs. +``` + +Namespace-scoped installs must set `storageVersionMigrator: false` and handle storage-version cleanup by other means — e.g. running the standalone [`kube-storage-version-migrator`](https://github.com/kubernetes-sigs/kube-storage-version-migrator), which is a cluster-scoped component with its own identity, before any version-removal release. + ## Per-CRD emergency escape hatch Removing the label on a live cluster excludes that single CRD from migration immediately: diff --git a/test/e2e/chainsaw/operator/multi-tenancy/setup/chainsaw-test.yaml b/test/e2e/chainsaw/operator/multi-tenancy/setup/chainsaw-test.yaml index 4aabcf830a..d7b2cd8877 100644 --- a/test/e2e/chainsaw/operator/multi-tenancy/setup/chainsaw-test.yaml +++ b/test/e2e/chainsaw/operator/multi-tenancy/setup/chainsaw-test.yaml @@ -42,6 +42,10 @@ spec: - operator.rbac.scope=namespace - --set - operator.rbac.allowedNamespaces={toolhive-system,test-namespace,toolhive-test-ns-1,toolhive-test-ns-2} + # The StorageVersionMigrator watches cluster-scoped CRDs, so it is + # unsupported (and chart-rejected) in namespace scope. Opt out here. + - --set + - operator.features.storageVersionMigrator=false - assert: file: assert-operator-ready.yaml - assert: