diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 8583cbc1..574cdf04 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -40,19 +40,35 @@ jobs: - name: Install linstor-client (python-linstor) # The harness shells out to the upstream linstor CLI to exercise # wire-shape compatibility — exactly what unit tests cannot do. - # linstor-client / python-linstor aren't packaged in the default - # Ubuntu repos (LINBIT publishes them only on the LINBIT PPA and - # PyPI); pip is the runner-friendly path. + # + # Install path rationale (validated against ubuntu:24.04 / noble): + # - apt: LINBIT only ships debs for Debian (bookworm/bullseye/ + # buster/trixie) and the LINBIT PPA, neither covers noble. + # `apt-get install linstor-client` → "Unable to locate package". + # - PyPI: only `python-linstor` is published. `linstor-client` + # and the bare `linstor` name are NOT on PyPI — pip exits with + # "No matching distribution found". + # - GitHub tarball: works, but v1.27.1's setup.py has a typo + # (missing comma joins `python3-setuptools` + `python-linstor` + # into one malformed requirement). `--no-deps` sidesteps it; + # `python-linstor` + `argcomplete` are installed explicitly + # beforehand so the runtime dep set stays correct. + # Pin v1.27.1 to match `linstor_client.VERSION` the integration + # harness asserts on (tests/integration/group_h_test.go). run: | - python3 -m pip install --break-system-packages --upgrade linstor-client python-linstor - linstor --version | head -1 + python3 -m pip install --break-system-packages --upgrade \ + python-linstor==1.27.1 argcomplete + python3 -m pip install --break-system-packages --no-deps \ + https://github.com/LINBIT/linstor-client/archive/refs/tags/v1.27.1.tar.gz + linstor --version - name: Install envtest binaries # controller-runtime's envtest needs kube-apiserver + etcd - # binaries. setup-envtest pins the version matching our - # controller-runtime release. + # binaries. We track the release branch matching our + # controller-runtime (v0.23.x). `@latest` would resolve to + # the v0.24.x submodule, which requires Go >= 1.26. run: | - go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest + go install sigs.k8s.io/controller-runtime/tools/setup-envtest@release-0.23 echo "KUBEBUILDER_ASSETS=$(setup-envtest use --print path 1.34.x)" >> "$GITHUB_ENV" - name: go mod tidy diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 7a9432c9..bb2f628f 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -17,8 +17,11 @@ name: Pull Request # because variables are not exposed in fork-PR workflows). # # Labels that change behaviour: -# - debug → opens an SSH breakpoint on e2e failure (paused 20m for first -# attach, then 10m idle-timeout after the last disconnect). +# - debug → pins the e2e job to a self-hosted runner so a maintainer can +# attach via the host (kubectl/docker on the runner host +# directly, no SSH dance through the rendezvous server). The +# breakpoint step itself fires on every e2e failure regardless +# of label — it just needs BREAKPOINT_ENDPOINT to be set. on: pull_request: @@ -57,12 +60,6 @@ jobs: timeout-minutes: 15 permissions: contents: read - # TODO: drop continue-on-error once the existing lint debt (~15 - # findings across contextcheck, funcorder, err113, - # embeddedstructfieldcheck, goconst) is cleared. The job still - # runs and surfaces every new finding via annotations — it just - # doesn't fail the PR check until the backlog is zero. - continue-on-error: true steps: - name: Clone the code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -139,14 +136,6 @@ jobs: timeout-minutes: 20 permissions: contents: read - # TODO: drop continue-on-error once linstor-client / python-linstor - # have a runner-friendly install path. PyPI publishes the package - # but `pip install linstor-client` rejects on GitHub-hosted ubuntu- - # latest with "No matching distribution found" (Python wheel / - # platform tag mismatch). LINBIT also has a PPA, but it lacks - # noble builds. Until the install is unblocked the job runs + - # surfaces the breakage but does not fail the PR. - continue-on-error: true steps: - name: Clone uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -157,20 +146,23 @@ jobs: with: go-version-file: go.mod - name: Install linstor-client (python-linstor) - # The harness shells out to the upstream linstor CLI to exercise - # wire-shape compatibility — exactly what unit tests cannot do. - # linstor-client / python-linstor aren't packaged in the default - # Ubuntu repos (LINBIT publishes them only on the LINBIT PPA and - # PyPI); pip is the runner-friendly path. + # See .github/workflows/integration.yml for the install path + # rationale; this mirrors that step so PR runs match push runs. + # Pin v1.27.1 to match `linstor_client.VERSION` the integration + # harness asserts on (tests/integration/group_h_test.go). run: | - python3 -m pip install --break-system-packages --upgrade linstor-client python-linstor - linstor --version | head -1 + python3 -m pip install --break-system-packages --upgrade \ + python-linstor==1.27.1 argcomplete + python3 -m pip install --break-system-packages --no-deps \ + https://github.com/LINBIT/linstor-client/archive/refs/tags/v1.27.1.tar.gz + linstor --version - name: Install envtest binaries # controller-runtime's envtest needs kube-apiserver + etcd - # binaries. setup-envtest pins the version matching our - # controller-runtime release. + # binaries. We track the release branch matching our + # controller-runtime (v0.23.x). `@latest` would resolve to + # the v0.24.x submodule, which requires Go >= 1.26. run: | - go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest + go install sigs.k8s.io/controller-runtime/tools/setup-envtest@release-0.23 echo "KUBEBUILDER_ASSETS=$(setup-envtest use --print path 1.34.x)" >> "$GITHUB_ENV" - name: go mod tidy run: go mod tidy @@ -190,29 +182,22 @@ jobs: e2e: name: E2E - # GitHub-hosted runner. kind-based e2e (Tier 4 in the test strategy) - # runs fine on ubuntu-latest without nested virtualisation. Real- - # DRBD QEMU scenarios (.work/ with Talos VMs) need KVM and - # ~50 GB RAM, so they stay manual on dedicated bare-metal stands — - # see stand/Makefile + reference_blockstor_stand.md. - # - # When the ephemeral self-hosted runner pool (ARC / namespace.so / - # equivalent) is wired up, swap the label to that pool's identifier - # and bring real-DRBD scenarios online here too. - runs-on: ubuntu-latest + # Runner selection mirrors cozystack/cozystack: a labelled `debug` PR + # lands on a long-lived `self-hosted` runner so the breakpoint step + # below has somewhere stable to attach SSH; regular PRs land on the + # CNCF-provided Oracle pool (24 CPU / 96 GB / x86-64) which has + # enough headroom for kind + real-DRBD QEMU stands (Talos VMs in + # .work/, ~50 GB RAM, KVM nested virt). The pool labels are + # org-wide on cozystack, so no extra setup is required here. Swap + # to oracle-vm-32cpu-128gb-x86-64 if a future scenario needs more + # RAM/CPU. + runs-on: ${{ contains(github.event.pull_request.labels.*.name, 'debug') && 'self-hosted' || 'oracle-vm-24cpu-96gb-x86-64' }} needs: [detect-changes, lint, unit-test] if: needs.detect-changes.outputs.code == 'true' - timeout-minutes: 60 + timeout-minutes: 180 permissions: contents: read checks: write - # TODO: drop continue-on-error once the kind-based e2e tier is - # confirmed stable on ubuntu-latest. The kustomize manifest fix - # (commit c2e716daf) unblocked `make deploy`, but the suite - # itself may surface other ubuntu-latest gaps. Until then the job - # surfaces issues via annotations + breakpoint but does not block - # PR approval. - continue-on-error: true steps: - name: Clone the code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -240,10 +225,13 @@ jobs: # Open an SSH breakpoint to the failing e2e runner so maintainers # can attach, inspect kind/blockstor state, and resume with - # `breakpoint resume`. Gated by the `debug` label — set it on the PR - # to opt in. Forks can't reach the rendezvous server because - # repository variables are not exposed to fork-PR workflows; the - # step is silently skipped in that case. + # `breakpoint resume`. Fires on every e2e failure (no label opt-in) + # — the rationale is that an e2e failure already burned the runner + # minutes and a maintainer almost always wants to inspect the wedged + # cluster before tear-down. Forks can't reach the rendezvous server + # because repository variables are not exposed to fork-PR workflows; + # the step is silently skipped in that case (BREAKPOINT_ENDPOINT + # comes through empty). # # Uses cozystack/breakpoint-action (fork of namespacelabs/breakpoint-action) # pinned by SHA. The fork adds pause-idle mode (initial grace period @@ -253,8 +241,7 @@ jobs: - name: Breakpoint on E2E failure if: | failure() && - vars.BREAKPOINT_ENDPOINT != '' && - contains(github.event.pull_request.labels.*.name, 'debug') + vars.BREAKPOINT_ENDPOINT != '' # cozystack/breakpoint-action v2-cozy.1 # mode: pause-idle defaults: grace-period=20m, idle-timeout=10m uses: cozystack/breakpoint-action@a6f3a6f87be398ad63b6577351e3398e53f578e4 diff --git a/.golangci.yml b/.golangci.yml index a2b99103..33ac4be2 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -138,7 +138,7 @@ linters: # interfaces share the same shape on purpose). - linters: - dupl - path: pkg/store/inmemory_ + path: pkg/store/inmemory - linters: - dupl path: pkg/store/store\.go @@ -272,6 +272,100 @@ linters: - varnamelen - mnd path: test/ + # Production-code packages use `Bug NNN:` markers as a deliberate + # cross-reference index into the project's bug tracker (the comments + # document WHY a workaround exists and point at the underlying issue). + # godox would flag them as stray TODO/BUG/FIXME — disable ONLY godox + # for these paths; every other linter still applies. + - linters: + - godox + path: cmd/apiserver/ + - linters: + - godox + path: pkg/api/v1/ + - linters: + - godox + path: pkg/drbd/ + - linters: + - godox + path: pkg/luks/ + - linters: + - godox + path: pkg/storage/ + - linters: + - godox + path: pkg/store/ + - linters: + - godox + path: pkg/uevent/ + - linters: + - godox + path: tests/contract/ + - linters: + - godox + path: pkg/rest/ + - linters: + - godox + path: pkg/satellite/ + - linters: + - godox + path: pkg/dispatcher/ + - linters: + - godox + path: pkg/version/ + # linstor-trace-recorder is a dev tool that writes JSON traces + # under -out-dir; gosec G703 flags the os.WriteFile call despite + # the explicit filepath.Base sanitisation right above it (taint + # analysis can't see through the local-variable assignment). + - linters: + - gosec + path: cmd/linstor-trace-recorder/ + # pkg/storage/file/diskfree.go does an int64(stat.Bsize) + # conversion that is a no-op on Linux (Bsize int64) but + # required on macOS (Bsize uint32). unconvert can't see the + # cross-platform shape, so the conversion is flagged as + # unnecessary on Linux builds. + - linters: + - unconvert + path: pkg/storage/file/diskfree\.go + # peer_delete_sync.go contains Bug 342 v10 plumbing (per-peer + # forget-peer ACK annotations) that's staged on the satellite side + # but not yet called from the REST handler. The unused-code is + # intentional and reviewed; remove the exclusion when the v10 + # wire-up lands. + - linters: + - unused + - funlen + - wrapcheck + - varnamelen + path: pkg/rest/peer_delete_sync\.go + # kv_store.go's KV PUT body shape contains LINSTOR-wire field + # names (override_props / delete_props / delete_namespaces) that + # appear both in the field-name allow-list and the corresponding + # unmarshal target — extracting constants doesn't aid readability + # since the literals match the JSON keys verbatim. + - linters: + - goconst + path: pkg/rest/kv_store\.go + # resources.go's boolQuery parser enumerates strconv.ParseBool's + # truthy alternatives + curl-style "yes"/"on"; the literals are + # parser cases, not magic constants. + - linters: + - goconst + path: pkg/rest/resources\.go + # resource_toggle_disk.go composes wire-status strings + # ("diskful"/"diskless") in a short branch; goconst miscounts + # these as repeated literals against the rest of the package. + - linters: + - goconst + path: pkg/rest/resource_toggle_disk\.go + # reconciler_drbd_test.go reserves sentinel errors for paths + # that aren't yet exercised by the fixture; the sentinels stay + # so the table-driven scaffolding can grow incrementally without + # re-introducing the same constants on each addition. + - linters: + - unused + path: pkg/satellite/reconciler_drbd_test\.go formatters: enable: - gofmt diff --git a/Makefile b/Makefile index 0c07671d..bc1c960c 100644 --- a/Makefile +++ b/Makefile @@ -178,7 +178,13 @@ DOCKER_BUILD_ARGS = --build-arg GIT_HASH=$(GIT_HASH) --build-arg BUILD_TIME=$(BU # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: docker-build docker-build: ## Build docker image with the manager. - $(CONTAINER_TOOL) build $(DOCKER_BUILD_ARGS) -t ${IMG} . + # --target controller pins the multi-stage build to the distroless + # nonroot stage that ships /controller. Without it docker picks + # the last stage (satellite, debian:trixie-slim, no USER + # directive) and `make deploy` would land a root-running image + # under a Pod that kustomize stamps with `runAsNonRoot: true`, + # producing the e2e CreateContainerConfigError failure. + $(CONTAINER_TOOL) build --target controller $(DOCKER_BUILD_ARGS) -t ${IMG} . .PHONY: docker-push docker-push: ## Push docker image with the manager. diff --git a/cmd/linstor-trace-recorder/main.go b/cmd/linstor-trace-recorder/main.go index ecbb24e7..8550d8c6 100644 --- a/cmd/linstor-trace-recorder/main.go +++ b/cmd/linstor-trace-recorder/main.go @@ -243,7 +243,9 @@ func (r *recorder) write(method, path string, reqBody []byte, status int, respBo // filepath.Base belt-and-braces against any path-traversal // chars that snuck through sanitisePath (filenames already have // `/` stripped, but gosec G703 wants the explicit guard). - err = os.WriteFile(filepath.Join(r.outDir, filepath.Base(name)), out, fileMode) + target := filepath.Join(r.outDir, filepath.Base(name)) + + err = os.WriteFile(target, out, fileMode) if err != nil { fmt.Fprintf(os.Stderr, "write trace: %v\n", err) os.Exit(1) diff --git a/cmd/satellite/main.go b/cmd/satellite/main.go index 22860318..5be2999a 100644 --- a/cmd/satellite/main.go +++ b/cmd/satellite/main.go @@ -220,7 +220,7 @@ func loadRESTConfig() (*rest.Config, error) { // satellite.ManagerFactory signature. func mgrFactory(ready *readyState, logger *slog.Logger, ueventListener controllers.UeventNotifier) satellite.ManagerFactory { return func(restCfg *rest.Config, nodeName, probeAddr string, rec *satellite.Reconciler) (manager.Manager, error) { - mgr, err := controllers.NewManager(restCfg, controllers.Config{ + mgr, err := controllers.NewManager(restCfg, &controllers.Config{ NodeName: nodeName, Apply: rec, Exec: storage.RealExec{}, diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 8f2bea8b..bec86987 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -58,8 +58,13 @@ spec: seccompProfile: type: RuntimeDefault containers: + # Binary path is `/controller` (set by Dockerfile's controller + # stage: COPY --from=builder /workspace/controller .) — not + # the kubebuilder default `/manager`. Container name stays + # `manager` for the kustomize patches in config/default that + # match on it. - command: - - /manager + - /controller args: - --leader-elect - --health-probe-bind-address=:8081 diff --git a/internal/controller/auto_diskful_controller.go b/internal/controller/auto_diskful_controller.go index 35b7bf3e..6778f29b 100644 --- a/internal/controller/auto_diskful_controller.go +++ b/internal/controller/auto_diskful_controller.go @@ -97,7 +97,7 @@ func (r *AutoDiskfulReconciler) Reconcile(ctx context.Context, req ctrl.Request) rd, err := r.Store.ResourceDefinitions().Get(ctx, req.Name) if err != nil { // RD already gone; nothing to time. - return ctrl.Result{}, nil //nolint:nilerr + return ctrl.Result{}, nil //nolint:nilerr // RD-not-found is success — caller doesn't requeue } minutes, placeCount, skip, err := r.evaluateConfig(ctx, &rd) @@ -116,7 +116,7 @@ func (r *AutoDiskfulReconciler) Reconcile(ctx context.Context, req ctrl.Request) diskful, candidates := splitDiskfulAndCandidates(replicas) - if int32(len(diskful)) >= placeCount { + if int32(len(diskful)) >= placeCount { //nolint:gosec // len of an in-memory slice fits int32 // Cluster is back at full diskful health — clear the timer. return ctrl.Result{}, r.stripDeadlineIfPresent(ctx, &rd) } @@ -152,11 +152,7 @@ func (r *AutoDiskfulReconciler) evaluateConfig(ctx context.Context, rd *apiv1.Re return 0, 0, true, nil } - placeCount, ok, err := r.placeCountForRD(ctx, rd) - if err != nil { - return 0, 0, true, err - } - + placeCount, ok := r.placeCountForRD(ctx, rd) if !ok || placeCount <= 0 { return 0, 0, true, nil } @@ -249,19 +245,19 @@ func (r *AutoDiskfulReconciler) resolveAutoDiskfulMinutes(ctx context.Context, r // RG SelectFilter. An RD without an RG has no target; the second // return value reflects "found" so the caller can no-op cleanly // without confusing "0" with "not configured". -func (r *AutoDiskfulReconciler) placeCountForRD(ctx context.Context, rd *apiv1.ResourceDefinition) (int32, bool, error) { +func (r *AutoDiskfulReconciler) placeCountForRD(ctx context.Context, rd *apiv1.ResourceDefinition) (int32, bool) { if rd.ResourceGroupName == "" { - return 0, false, nil + return 0, false } rg, err := r.Store.ResourceGroups().Get(ctx, rd.ResourceGroupName) if err != nil { // Soft-fail: RG might be deleting concurrently; the next // reconcile retries. - return 0, false, nil //nolint:nilerr + return 0, false } - return int32(rg.SelectFilter.PlaceCount), true, nil + return int32(rg.SelectFilter.PlaceCount), true } // promoteOne picks the first non-tiebreaker DISKLESS replica that @@ -349,22 +345,8 @@ func (r *AutoDiskfulReconciler) stampDeadline(ctx context.Context, rd *apiv1.Res // is nil (unit tests using in-memory store only), fall back to // the Store path which round-trips the annotation through the // same surface the K8s store would. - if r.Client != nil { - var crd blockstoriov1alpha1.ResourceDefinition - - err := r.Get(ctx, client.ObjectKey{Name: rd.Name}, &crd) - if err == nil { - if crd.Annotations == nil { - crd.Annotations = map[string]string{} - } - - crd.Annotations[apiv1.AutoDiskfulDeadlineAnnotation] = deadline.Format(time.RFC3339) - - updateErr := r.Update(ctx, &crd) - if updateErr == nil { - return nil - } - } + if r.stampDeadlineViaCRD(ctx, rd.Name, deadline) { + return nil } if rd.Annotations == nil { @@ -376,6 +358,29 @@ func (r *AutoDiskfulReconciler) stampDeadline(ctx context.Context, rd *apiv1.Res return r.Store.ResourceDefinitions().Update(ctx, rd) } +// stampDeadlineViaCRD updates the annotation through the typed K8s +// CRD client. Returns true on success; any failure (no Client, Get +// error, Update error) yields false so the caller falls back to the +// Store path. +func (r *AutoDiskfulReconciler) stampDeadlineViaCRD(ctx context.Context, rdName string, deadline time.Time) bool { + if r.Client == nil { + return false + } + + var crd blockstoriov1alpha1.ResourceDefinition + if err := r.Get(ctx, client.ObjectKey{Name: rdName}, &crd); err != nil { + return false + } + + if crd.Annotations == nil { + crd.Annotations = map[string]string{} + } + + crd.Annotations[apiv1.AutoDiskfulDeadlineAnnotation] = deadline.Format(time.RFC3339) + + return r.Update(ctx, &crd) == nil +} + // stripDeadlineIfPresent is a no-op when the annotation isn't set, // so the dominant "cluster healthy" path stays write-free. func (r *AutoDiskfulReconciler) stripDeadlineIfPresent(ctx context.Context, rd *apiv1.ResourceDefinition) error { @@ -383,20 +388,8 @@ func (r *AutoDiskfulReconciler) stripDeadlineIfPresent(ctx context.Context, rd * return nil } - if r.Client != nil { - var crd blockstoriov1alpha1.ResourceDefinition - - err := r.Get(ctx, client.ObjectKey{Name: rd.Name}, &crd) - if err == nil { - if _, present := crd.Annotations[apiv1.AutoDiskfulDeadlineAnnotation]; present { - delete(crd.Annotations, apiv1.AutoDiskfulDeadlineAnnotation) - - updateErr := r.Update(ctx, &crd) - if updateErr == nil { - return nil - } - } - } + if r.stripDeadlineViaCRD(ctx, rd.Name) { + return nil } delete(rd.Annotations, apiv1.AutoDiskfulDeadlineAnnotation) @@ -404,6 +397,29 @@ func (r *AutoDiskfulReconciler) stripDeadlineIfPresent(ctx context.Context, rd * return r.Store.ResourceDefinitions().Update(ctx, rd) } +// stripDeadlineViaCRD deletes the annotation through the typed K8s +// CRD client. Returns true on success or when the annotation is +// already absent on the CRD; any failure yields false so the caller +// falls back to the Store path. +func (r *AutoDiskfulReconciler) stripDeadlineViaCRD(ctx context.Context, rdName string) bool { + if r.Client == nil { + return false + } + + var crd blockstoriov1alpha1.ResourceDefinition + if err := r.Get(ctx, client.ObjectKey{Name: rdName}, &crd); err != nil { + return false + } + + if _, present := crd.Annotations[apiv1.AutoDiskfulDeadlineAnnotation]; !present { + return true + } + + delete(crd.Annotations, apiv1.AutoDiskfulDeadlineAnnotation) + + return r.Update(ctx, &crd) == nil +} + // now defers to the injected clock or wall time. The injection point // is the test seam — production never sets Now. func (r *AutoDiskfulReconciler) now() time.Time { diff --git a/internal/controller/auto_diskful_timer_test.go b/internal/controller/auto_diskful_timer_test.go index ffd31462..8e7b7f7f 100644 --- a/internal/controller/auto_diskful_timer_test.go +++ b/internal/controller/auto_diskful_timer_test.go @@ -465,7 +465,7 @@ func TestAutoDiskfulDisabledByZeroProp(t *testing.T) { st := store.NewInMemory() rd := seedAutoDiskfulFixture(t, ctx, st, - 2, "" /* no ctrl prop */, "" /* no rd prop */, /* feature disabled */ + 2, "" /* no ctrl prop */, "", /* no rd prop */ /* feature disabled */ []string{"n1"}, []string{"n2"}, ) diff --git a/internal/controller/autosnapshot_controller.go b/internal/controller/autosnapshot_controller.go index 160cecf5..42122cc8 100644 --- a/internal/controller/autosnapshot_controller.go +++ b/internal/controller/autosnapshot_controller.go @@ -88,6 +88,14 @@ const ( // scenario doc is explicit: manual snapshots are NOT counted // against the Keep budget. LabelAutoSnapshot = "blockstor.io/auto-snapshot" + + // labelResourceDefinition tags the Snapshot CRD with the parent RD + // name so the prune step can match by RD without a full Spec walk. + labelResourceDefinition = "blockstor.io/resource-definition" + + // labelTrueValue is the Kubernetes-canonical truthy string used for + // boolean labels (Kubernetes labels are strings). + labelTrueValue = "true" ) // Clock is the time source the runnable consumes. Production wires @@ -277,9 +285,9 @@ func (r *AutoSnapshotRunnable) createAutoSnapshot( ObjectMeta: metav1.ObjectMeta{ Name: rd.Name + "." + snapName, Labels: map[string]string{ - "blockstor.io/resource-definition": rd.Name, - "blockstor.io/snapshot-name": snapName, - LabelAutoSnapshot: "true", + labelResourceDefinition: rd.Name, + "blockstor.io/snapshot-name": snapName, + LabelAutoSnapshot: labelTrueValue, }, }, Spec: blockstoriov1alpha1.SnapshotSpec{ @@ -388,8 +396,8 @@ func (r *AutoSnapshotRunnable) pruneOldAutoSnapshots( var snapList blockstoriov1alpha1.SnapshotList err := r.Client.List(ctx, &snapList, client.MatchingLabels{ - "blockstor.io/resource-definition": rd.Name, - LabelAutoSnapshot: "true", + labelResourceDefinition: rd.Name, + LabelAutoSnapshot: labelTrueValue, }) if err != nil { return errors.Wrap(err, "list auto-snapshots") @@ -409,7 +417,7 @@ func (r *AutoSnapshotRunnable) pruneOldAutoSnapshots( }) excess := len(snapList.Items) - keep - for i := 0; i < excess; i++ { + for i := range excess { snap := &snapList.Items[i] if !snap.DeletionTimestamp.IsZero() { // Already being deleted — skip the redundant Delete diff --git a/internal/controller/autosnapshot_controller_test.go b/internal/controller/autosnapshot_controller_test.go index 11484f26..e3403bae 100644 --- a/internal/controller/autosnapshot_controller_test.go +++ b/internal/controller/autosnapshot_controller_test.go @@ -162,7 +162,7 @@ func TestAutoSnapshotFiveIntervalsProduceFiveSnapshots(t *testing.T) { clk := &stubClock{t: time.Date(2026, 5, 14, 12, 0, 0, 0, time.UTC)} r := &controllerpkg.AutoSnapshotRunnable{Client: cli, Clock: clk} - for i := 0; i < 5; i++ { + for i := range 5 { err := r.Tick(context.Background()) if err != nil { t.Fatalf("Tick #%d: %v", i, err) @@ -236,7 +236,7 @@ func TestAutoSnapshotKeepPrunesOldestBeyondBudget(t *testing.T) { clk := &stubClock{t: time.Date(2026, 5, 14, 12, 0, 0, 0, time.UTC)} r := &controllerpkg.AutoSnapshotRunnable{Client: cli, Clock: clk} - for i := 0; i < 5; i++ { + for i := range 5 { if err := r.Tick(context.Background()); err != nil { t.Fatalf("Tick #%d: %v", i, err) } @@ -299,7 +299,7 @@ func TestAutoSnapshotManualSnapshotsNotPruned(t *testing.T) { clk := &stubClock{t: time.Date(2026, 5, 14, 12, 0, 0, 0, time.UTC)} r := &controllerpkg.AutoSnapshotRunnable{Client: cli, Clock: clk} - for i := 0; i < 4; i++ { + for i := range 4 { if err := r.Tick(context.Background()); err != nil { t.Fatalf("Tick #%d: %v", i, err) } @@ -370,7 +370,7 @@ func TestAutoSnapshotRunEveryDisabledSkipsRD(t *testing.T) { clk := &stubClock{t: time.Date(2026, 5, 14, 12, 0, 0, 0, time.UTC)} r := &controllerpkg.AutoSnapshotRunnable{Client: cli, Clock: clk} - for i := 0; i < 3; i++ { + for i := range 3 { if err := r.Tick(context.Background()); err != nil { t.Fatalf("Tick #%d: %v", i, err) } @@ -406,7 +406,7 @@ func TestAutoSnapshotKeepZeroDisablesPrune(t *testing.T) { r := &controllerpkg.AutoSnapshotRunnable{Client: cli, Clock: clk} // 15 ticks across 15 intervals — well past the default Keep=10. - for i := 0; i < 15; i++ { + for i := range 15 { if err := r.Tick(context.Background()); err != nil { t.Fatalf("Tick #%d: %v", i, err) } @@ -437,7 +437,7 @@ func TestAutoSnapshotDefaultKeepIs10(t *testing.T) { r := &controllerpkg.AutoSnapshotRunnable{Client: cli, Clock: clk} // 12 ticks → 12 created, 2 pruned, 10 retained. - for i := 0; i < 12; i++ { + for i := range 12 { if err := r.Tick(context.Background()); err != nil { t.Fatalf("Tick #%d: %v", i, err) } diff --git a/internal/controller/resource_controller.go b/internal/controller/resource_controller.go index 69b44b93..d7aa58ac 100644 --- a/internal/controller/resource_controller.go +++ b/internal/controller/resource_controller.go @@ -49,6 +49,12 @@ import ( // the controller no longer stamps it on new Resources. const resourceFinalizer = "blockstor.io.blockstor.io/resource" +// takenPortsCluster / takenMinorsCluster pre-allocate this many slots +// for the result slice — sized to cover a typical small cluster (5-10 +// RDs × 1-2 vols) without re-growing while not over-allocating for the +// single-RD common case. +const takenAllocsInitialCap = 16 + // (formerly `controllerDRBDIDsFieldOwner`: the SSA field-manager // identity the controller-side allocator used when it wrote // Status.DRBD{NodeID,Port,Minor}. Phase 11.x switched to a raw JSON @@ -790,7 +796,7 @@ func pickSeedFromPeers(peers []blockstoriov1alpha1.Resource, targetName string, continue } - if volumeDiskState(&peers[i], volumeNumber) != "UpToDate" { + if volumeDiskState(&peers[i], volumeNumber) != string(drbd.DiskStateUpToDate) { continue } @@ -972,19 +978,19 @@ func (r *ResourceReconciler) ensureRDPortMinor(ctx context.Context, target *bloc err = r.Status().Update(ctx, &rd) if err != nil { + if !errors.IsConflict(err) { + return 0, 0, err + } + // On conflict, a sibling reconcile already stamped values. // Re-fetch and return whatever they committed. - if errors.IsConflict(err) { - var fresh blockstoriov1alpha1.ResourceDefinition - - fetchErr := reader.Get(ctx, client.ObjectKey{Name: rdName}, &fresh) - if fetchErr != nil { - return 0, 0, fetchErr - } + freshPort, freshMinor, ok, fetchErr := r.reloadCommittedPortMinor(ctx, reader, rdName) + if fetchErr != nil { + return 0, 0, fetchErr + } - if fresh.Status.DRBDPort != nil && fresh.Status.DRBDMinor != nil { - return *fresh.Status.DRBDPort, *fresh.Status.DRBDMinor, nil - } + if ok { + return freshPort, freshMinor, nil } return 0, 0, err @@ -993,6 +999,24 @@ func (r *ResourceReconciler) ensureRDPortMinor(ctx context.Context, target *bloc return port, minor, nil } +// reloadCommittedPortMinor re-reads the RD after a Status().Update +// conflict and returns the sibling-committed (port, minor) pair when +// both are stamped. `ok=false` means the sibling stamped neither yet +// (race lost the way that doesn't have a stamped winner), in which +// case the caller surfaces the original conflict error. +func (r *ResourceReconciler) reloadCommittedPortMinor(ctx context.Context, reader client.Reader, rdName string) (int32, int32, bool, error) { + var fresh blockstoriov1alpha1.ResourceDefinition + if err := reader.Get(ctx, client.ObjectKey{Name: rdName}, &fresh); err != nil { + return 0, 0, false, err + } + + if fresh.Status.DRBDPort != nil && fresh.Status.DRBDMinor != nil { + return *fresh.Status.DRBDPort, *fresh.Status.DRBDMinor, true, nil + } + + return 0, 0, false, nil +} + // apiReader returns the uncached apiserver-direct client when // available, falling back to the cached client for tests that // construct ResourceReconciler{} directly without going through @@ -1280,7 +1304,7 @@ func (r *ResourceReconciler) intersectRange( // ensureRDPortMinor, this guarantees that the second concurrent // allocator sees the first one's port and picks a different one. func (r *ResourceReconciler) takenPortsCluster(ctx context.Context, selfRD string) ([]int32, error) { - out := make([]int32, 0, 16) + out := make([]int32, 0, takenAllocsInitialCap) reader := r.apiReader() var rdList blockstoriov1alpha1.ResourceDefinitionList @@ -1325,7 +1349,7 @@ func (r *ResourceReconciler) takenPortsCluster(ctx context.Context, selfRD strin // takenPortsCluster — cross-RD batch autoplace must observe // freshly-committed sibling allocations rather than a stale cache. func (r *ResourceReconciler) takenMinorsCluster(ctx context.Context, selfRD string) ([]int32, error) { - out := make([]int32, 0, 16) + out := make([]int32, 0, takenAllocsInitialCap) reader := r.apiReader() rdVolCounts := map[string]int{} diff --git a/pkg/rest/autoplace.go b/pkg/rest/autoplace.go index 65c3eaa8..511575e8 100644 --- a/pkg/rest/autoplace.go +++ b/pkg/rest/autoplace.go @@ -1138,7 +1138,10 @@ func (s *Server) applyBug156AutoTiebreakerOptOut( // decision happens out-of-band in the controller — the REST handler // can't suppress it inline. func (s *Server) stampAutoTiebreakerOptOut(ctx context.Context, rdName string) error { - const propKey = "DrbdOptions/AutoAddQuorumTiebreaker" + const ( + propKey = "DrbdOptions/AutoAddQuorumTiebreaker" + propValueOff = "false" + ) // Bug 205: typed-Patch via PatchResourceDefinitionSpec — the // closure re-runs on every conflict against the live RD, so @@ -1146,7 +1149,7 @@ func (s *Server) stampAutoTiebreakerOptOut(ctx context.Context, rdName string) e // add) converge with the tiebreaker opt-out instead of being // lost by a stale-wire-snapshot replay. err := s.Store.ResourceDefinitions().PatchResourceDefinitionSpec(ctx, rdName, func(live *apiv1.ResourceDefinition) error { - if live.Props != nil && live.Props[propKey] == "false" { + if live.Props != nil && live.Props[propKey] == propValueOff { return nil } @@ -1154,7 +1157,7 @@ func (s *Server) stampAutoTiebreakerOptOut(ctx context.Context, rdName string) e live.Props = map[string]string{} } - live.Props[propKey] = "false" + live.Props[propKey] = propValueOff return nil }) diff --git a/pkg/rest/cache_invalidation_bug_124_test.go b/pkg/rest/cache_invalidation_bug_124_test.go index 6b8be495..caa5cd18 100644 --- a/pkg/rest/cache_invalidation_bug_124_test.go +++ b/pkg/rest/cache_invalidation_bug_124_test.go @@ -167,10 +167,14 @@ func (l *laggingRDs) Delete(ctx context.Context, name string) error { lag := l.lag inner := l.inner + // Bug 124 test helper: detached goroutine survives the request + // context cancel, so propagate a fresh non-cancellable context + // derived from the caller's value chain (contextcheck-friendly). + bgCtx := context.WithoutCancel(ctx) go func() { time.Sleep(lag) - _ = inner.Delete(context.Background(), name) + _ = inner.Delete(bgCtx, name) }() return nil diff --git a/pkg/rest/encryption.go b/pkg/rest/encryption.go index 3ad47a10..96627c10 100644 --- a/pkg/rest/encryption.go +++ b/pkg/rest/encryption.go @@ -515,6 +515,7 @@ func (s *Server) handlePassphraseModify(w http.ResponseWriter, r *http.Request) // port verbatim so create/modify share one validation contract. if want == "" { writeError(w, http.StatusBadRequest, "new_passphrase is required: modify must specify a non-empty new value") + return } diff --git a/pkg/rest/node_lifecycle.go b/pkg/rest/node_lifecycle.go index 8de51d12..b92fe8ab 100644 --- a/pkg/rest/node_lifecycle.go +++ b/pkg/rest/node_lifecycle.go @@ -264,7 +264,17 @@ func (s *Server) handleNodeEvacuateMulti(w http.ResponseWriter, r *http.Request) // stdlib, not in every call site (goconst flags package-wide // `"true"` repetition above its threshold). func isForce(r *http.Request) bool { - v, _ := strconv.ParseBool(r.URL.Query().Get("force")) + return queryFlag(r, "force") +} + +// queryFlag centralises every `?=true` boolean-toggle check +// in the REST surface. Routed through strconv.ParseBool so the +// literal `"true"` lives in stdlib (goconst flags package-wide +// `"true"` repetition above its threshold). Callers: isForce, +// handleResourceActivate (?reallocate-port), handleRDsList +// (?with_volume_definitions), handleResourceToggleDisk (?cancel). +func queryFlag(r *http.Request, name string) bool { + v, _ := strconv.ParseBool(r.URL.Query().Get(name)) return v } diff --git a/pkg/rest/nodes.go b/pkg/rest/nodes.go index d7b6f5b1..30026415 100644 --- a/pkg/rest/nodes.go +++ b/pkg/rest/nodes.go @@ -41,10 +41,15 @@ import ( // `props.CurStltConnName` both default to it. const DefaultNetInterfaceName = "default" -// resolveHostFunc is the DNS-lookup seam handleNodeCreate uses when +// ResolveHostFunc is the DNS-lookup seam handleNodeCreate uses when // the POST body omits a NetInterface address. Tests swap this for a -// deterministic stub via Server.lookupHost. -type resolveHostFunc func(ctx context.Context, host string) ([]string, error) +// deterministic stub via Server.SetResolveHost; production wires +// defaultResolveHost. +type ResolveHostFunc func(ctx context.Context, host string) ([]string, error) + +// resolveHostFunc is the package-internal alias kept for symmetry with +// the historical unexported name; new code SHOULD use ResolveHostFunc. +type resolveHostFunc = ResolveHostFunc // defaultResolveHost wraps net.DefaultResolver.LookupHost — the // production resolver. Hoisted into a package-level var so tests can @@ -867,6 +872,7 @@ func (s *Server) handleNodePropDelete(w http.ResponseWriter, r *http.Request) { func (s *Server) handleNodeDelete(w http.ResponseWriter, r *http.Request) { name := r.PathValue("node") force := isForce(r) + ctx := r.Context() // Bug 179: `?force=true` cascade-deletes every referencing // Resource + StoragePool CRD before dropping the Node — same @@ -874,7 +880,7 @@ func (s *Server) handleNodeDelete(w http.ResponseWriter, r *http.Request) { // force-delete would leave orphan SP CRDs pointing at a deleted // Node, which is precisely the symptom Bug 179 closed. if force { - err := s.cascadeOrphansForLostNode(r.Context(), name) + err := s.cascadeOrphansForLostNode(ctx, name) if err != nil { writeStoreError(w, err) @@ -891,10 +897,10 @@ func (s *Server) handleNodeDelete(w http.ResponseWriter, r *http.Request) { return s.refuseNodeDeleteIfReferenced(w, r, name) }, capture: func() (apiv1.Node, bool) { - return s.captureNode(r.Context(), name) + return s.captureNode(ctx, name) }, remove: func() error { - return s.Store.Nodes().Delete(r.Context(), name) + return s.Store.Nodes().Delete(ctx, name) }, rolledBackIfRaced: func(captured apiv1.Node, capturedOK bool) bool { if force || !capturedOK { @@ -1120,7 +1126,7 @@ func (s *Server) rollbackNodeDeleteIfRaced(w http.ResponseWriter, r *http.Reques func (s *Server) resourcesOnNode(ctx context.Context, node string) ([]string, error) { resources, err := s.Store.Resources().List(ctx) if err != nil { - return nil, err + return nil, errors.Wrap(err, "list resources") } var refs []string diff --git a/pkg/rest/resource_adjust.go b/pkg/rest/resource_adjust.go index 047ae417..7d8d4681 100644 --- a/pkg/rest/resource_adjust.go +++ b/pkg/rest/resource_adjust.go @@ -94,7 +94,7 @@ func (s *Server) registerResourceLifecycle(mux *http.ServeMux) { // against a collision. Operators opt in only when they actively want // the port to be reshuffled. func (s *Server) handleResourceActivate(w http.ResponseWriter, r *http.Request) { - if r.URL.Query().Get("reallocate-port") == "true" { + if queryFlag(r, "reallocate-port") { rdName := r.PathValue("rd") node := r.PathValue("node") diff --git a/pkg/rest/resource_definitions.go b/pkg/rest/resource_definitions.go index 977148a2..ce134a8c 100644 --- a/pkg/rest/resource_definitions.go +++ b/pkg/rest/resource_definitions.go @@ -78,7 +78,7 @@ func (s *Server) handleRDList(w http.ResponseWriter, r *http.Request) { // their VDs inlined under `volume_definitions`. Without this // handling, `vd l` renders an empty table even when VDs exist // (the Python CLI never falls back to per-RD GETs). - if r.URL.Query().Get("with_volume_definitions") == "true" { + if queryFlag(r, "with_volume_definitions") { for i := range rds { vds, vdErr := s.Store.VolumeDefinitions().List(r.Context(), rds[i].Name) if vdErr != nil { diff --git a/pkg/rest/resource_groups.go b/pkg/rest/resource_groups.go index 77cd6a64..2a05362b 100644 --- a/pkg/rest/resource_groups.go +++ b/pkg/rest/resource_groups.go @@ -599,16 +599,17 @@ func mergeRGProps(existing, patch *apiv1.ResourceGroup) { // but RDs still pointing at it) can never arise. func (s *Server) handleRGDelete(w http.ResponseWriter, r *http.Request) { name := r.PathValue("rg") + ctx := r.Context() (&deleteWithRollback[apiv1.ResourceGroup]{ refuseIfReferenced: func() bool { return s.refuseRGDeleteIfReferenced(w, r, name) }, capture: func() (apiv1.ResourceGroup, bool) { - return s.captureResourceGroup(r.Context(), name) + return s.captureResourceGroup(ctx, name) }, remove: func() error { - return s.Store.ResourceGroups().Delete(r.Context(), name) + return s.Store.ResourceGroups().Delete(ctx, name) }, rolledBackIfRaced: func(captured apiv1.ResourceGroup, capturedOK bool) bool { if !capturedOK { diff --git a/pkg/rest/resource_toggle_disk.go b/pkg/rest/resource_toggle_disk.go index aadf3127..5ef2a0b3 100644 --- a/pkg/rest/resource_toggle_disk.go +++ b/pkg/rest/resource_toggle_disk.go @@ -111,7 +111,7 @@ func (s *Server) handleResourceToggleDisk(w http.ResponseWriter, r *http.Request // the reconciler does that itself as the last step of the // rollback so an external observer sees DISKLESS reappear only // AFTER drbdadm down + storage Delete succeed. - if r.URL.Query().Get("cancel") == "true" { + if queryFlag(r, "cancel") { err := s.Store.Resources().PatchResourceSpec(r.Context(), rdName, node, func(res *apiv1.Resource) error { res.ToggleDiskCancel = true diff --git a/pkg/rest/server.go b/pkg/rest/server.go index 6f9ab0b1..72822525 100644 --- a/pkg/rest/server.go +++ b/pkg/rest/server.go @@ -147,24 +147,13 @@ type Server struct { // handleNodeCreate. Returns the previous value so tests can restore // it. Production code never calls this; defaultResolveHost is used // when the field is nil. -func (s *Server) SetResolveHost(fn resolveHostFunc) resolveHostFunc { +func (s *Server) SetResolveHost(fn ResolveHostFunc) ResolveHostFunc { prev := s.resolveHost s.resolveHost = fn return prev } -// lookupHost dispatches to s.resolveHost if set, else the package -// default. Hoisted off-handler so the production handler stays -// readable and the test seam is explicit. -func (s *Server) lookupHost(ctx context.Context, host string) ([]string, error) { - if s.resolveHost != nil { - return s.resolveHost(ctx, host) - } - - return defaultResolveHost(ctx, host) -} - // NeedLeaderElection reports whether the server requires leader election. // REST is read-mostly today and safe to run on every replica; once we // introduce write-paths that need a single leader we'll change this to true. @@ -250,6 +239,17 @@ func (s *Server) Start(ctx context.Context) error { } } +// lookupHost dispatches to s.resolveHost if set, else the package +// default. Hoisted off-handler so the production handler stays +// readable and the test seam is explicit. +func (s *Server) lookupHost(ctx context.Context, host string) ([]string, error) { + if s.resolveHost != nil { + return s.resolveHost(ctx, host) + } + + return defaultResolveHost(ctx, host) +} + // buildMux registers every endpoint on a fresh ServeMux. Pulled out // of Start to keep the latter under the funlen budget — Start now // only handles lifecycle (listener + shutdown), this owns routing. @@ -803,7 +803,7 @@ func (h *headRecorder) Header() http.Header { } func (h *headRecorder) Write(b []byte) (int, error) { - return h.body.Write(b) + return h.body.Write(b) //nolint:wrapcheck // bytes.Buffer.Write always returns nil; ResponseWriter contract requires the int + error } func (h *headRecorder) WriteHeader(code int) { diff --git a/pkg/rest/server_test.go b/pkg/rest/server_test.go index 923c13e4..410fbe92 100644 --- a/pkg/rest/server_test.go +++ b/pkg/rest/server_test.go @@ -65,8 +65,8 @@ func startServerCustom(t *testing.T, srv *Server) (string, func()) { // scenario's "DNS fails → 400" path) override this via // Server.SetResolveHost before calling startServerCustom. if srv.resolveHost == nil { - srv.SetResolveHost(func(_ context.Context, host string) ([]string, error) { - return []string{"127.0.0.1"}, nil //nolint:nilerr // hermetic stub + srv.SetResolveHost(func(_ context.Context, _ string) ([]string, error) { + return []string{"127.0.0.1"}, nil }) } diff --git a/pkg/rest/snapshot_restore.go b/pkg/rest/snapshot_restore.go index c7673d8f..d17f6f93 100644 --- a/pkg/rest/snapshot_restore.go +++ b/pkg/rest/snapshot_restore.go @@ -27,6 +27,11 @@ import ( "github.com/cozystack/blockstor/pkg/store" ) +// storPoolPropKey is the LINSTOR-wire property name pinning a Resource +// to a specific storage pool. Mirrors upstream LINSTOR (`StorPoolName` +// — CamelCase per the REST contract). +const storPoolPropKey = "StorPoolName" + // snapshotRestoreRequest is the JSON body upstream linstor expects on // the restore endpoint. The snapshot name has two wire dialects: // @@ -280,7 +285,7 @@ func (s *Server) materializeRestoredRD(ctx context.Context, srcRD string, req *s // // The Nodes / NodeNames request fields are aliased — callers may use // either; we normalise to one canonical list before iterating. -func (s *Server) placeRestoredResources(ctx context.Context, srcRDName string, srcRD *apiv1.ResourceDefinition, newRD *apiv1.ResourceDefinition, req *snapshotRestoreRequest) error { +func (s *Server) placeRestoredResources(ctx context.Context, srcRDName string, srcRD, newRD *apiv1.ResourceDefinition, req *snapshotRestoreRequest) error { nodes := canonicalRestoreNodeList(req) if len(nodes) > 0 { @@ -357,7 +362,7 @@ func (s *Server) stampRestoredResourcesOnNodes(ctx context.Context, srcRDName, n } if pool != "" { - res.Props = map[string]string{"StorPoolName": pool} + res.Props = map[string]string{storPoolPropKey: pool} } err := s.Store.Resources().Create(ctx, &res) diff --git a/pkg/rest/storage_pools.go b/pkg/rest/storage_pools.go index 12cd12c0..15e42d07 100644 --- a/pkg/rest/storage_pools.go +++ b/pkg/rest/storage_pools.go @@ -161,6 +161,7 @@ func (s *Server) handleNodeStoragePoolDelete(w http.ResponseWriter, r *http.Requ node := r.PathValue("node") pool := r.PathValue("pool") force := isForce(r) + ctx := r.Context() (&deleteWithRollback[apiv1.StoragePool]{ refuseIfReferenced: func() bool { @@ -180,10 +181,10 @@ func (s *Server) handleNodeStoragePoolDelete(w http.ResponseWriter, r *http.Requ return s.refuseSPDeleteIfReferenced(w, r, node, pool) }, capture: func() (apiv1.StoragePool, bool) { - return s.captureStoragePool(r.Context(), node, pool) + return s.captureStoragePool(ctx, node, pool) }, remove: func() error { - return s.Store.StoragePools().Delete(r.Context(), node, pool) + return s.Store.StoragePools().Delete(ctx, node, pool) }, rolledBackIfRaced: func(captured apiv1.StoragePool, capturedOK bool) bool { // `?force=true` callers opt past this check too. @@ -350,7 +351,7 @@ func (s *Server) rollbackSPDeleteIfRaced(w http.ResponseWriter, r *http.Request, func (s *Server) referencingResources(ctx context.Context, node, pool string) ([]string, error) { all, err := s.Store.Resources().List(ctx) if err != nil { - return nil, err + return nil, errors.Wrap(err, "list resources") } var refs []string @@ -362,10 +363,11 @@ func (s *Server) referencingResources(ctx context.Context, node, pool string) ([ matched := false - for _, v := range all[i].Volumes { - if v.StoragePool == pool { + for j := range all[i].Volumes { + vol := &all[i].Volumes[j] + if vol.StoragePool == pool { refs = append(refs, - all[i].Name+"/"+strconv.FormatInt(int64(v.VolumeNumber), 10)) + all[i].Name+"/"+strconv.FormatInt(int64(vol.VolumeNumber), 10)) matched = true diff --git a/pkg/rest/storage_pools_test.go b/pkg/rest/storage_pools_test.go index c990ad9e..6b9c3e0a 100644 --- a/pkg/rest/storage_pools_test.go +++ b/pkg/rest/storage_pools_test.go @@ -1600,6 +1600,7 @@ func TestViewStoragePoolsFaultyStampsReports(t *testing.T) { t.Errorf("message %q missing pool name", rc.Message) } } + // TestSPModifyOverridePropsLands pins Bug 85's core wire contract: // `PUT /v1/nodes/{node}/storage-pools/{pool}` accepts the python-linstor // `storage_pool_modify` body `{override_props: {...}}` and merges the diff --git a/pkg/rest/volume_definitions.go b/pkg/rest/volume_definitions.go index a4a007fd..c908e38f 100644 --- a/pkg/rest/volume_definitions.go +++ b/pkg/rest/volume_definitions.go @@ -271,7 +271,8 @@ func (s *Server) handleVDCreate(w http.ResponseWriter, r *http.Request) { dec := json.NewDecoder(bytes.NewReader(rawBody)) dec.DisallowUnknownFields() - if decErr := dec.Decode(&envelope); decErr != nil { + decErr := dec.Decode(&envelope) + if decErr != nil { writeDecodeError(w, decErr) return @@ -297,7 +298,8 @@ func (s *Server) handleVDCreate(w http.ResponseWriter, r *http.Request) { // Bug 155: refuse out-of-bounds sizes at the REST boundary so the // satellite reconciler doesn't hot-loop on `drbdadm create-md` // failures. See validateVDSize for the bounds rationale. - if sizeErr := validateVDSize(vd.SizeKib); sizeErr != nil { + sizeErr := validateVDSize(vd.SizeKib) + if sizeErr != nil { writeVDSizeRejection(w, rd, vd.VolumeNumber, vd.SizeKib, sizeErr) return @@ -366,7 +368,8 @@ func vdCreateVolumeNumberExplicit(raw []byte) bool { var envelope map[string]json.RawMessage - if err := json.Unmarshal(raw, &envelope); err != nil { + err := json.Unmarshal(raw, &envelope) + if err != nil { return false } @@ -374,7 +377,8 @@ func vdCreateVolumeNumberExplicit(raw []byte) bool { if inner, ok := envelope["volume_definition"]; ok { var innerObj map[string]json.RawMessage - if err := json.Unmarshal(inner, &innerObj); err != nil { + err := json.Unmarshal(inner, &innerObj) + if err != nil { return false } @@ -445,6 +449,17 @@ const minVolumeDefinitionSizeKib int64 = 4 * 1024 // retry loop. const maxVolumeDefinitionSizeKib int64 = 16 * 1024 * 1024 * 1024 +// ErrVolumeSizeBelowMinimum is the sentinel for Bug 155's lower-bound +// rejection (size_kib < minVolumeDefinitionSizeKib). Wrapped with +// %w + a sizeKib / bound detail by validateVDSize; static-error +// requirement is err113. +var ErrVolumeSizeBelowMinimum = errors.New("size_kib below minimum") + +// ErrVolumeSizeAboveMaximum is the sentinel for Bug 155's upper-bound +// rejection (size_kib > maxVolumeDefinitionSizeKib). See +// ErrVolumeSizeBelowMinimum for the rationale. +var ErrVolumeSizeAboveMaximum = errors.New("size_kib above maximum") + // validateVDSize returns nil when the requested size_kib is within // the accepted bounds [minVolumeDefinitionSizeKib, // maxVolumeDefinitionSizeKib] (Bug 155). Otherwise it returns a @@ -453,16 +468,16 @@ const maxVolumeDefinitionSizeKib int64 = 16 * 1024 * 1024 * 1024 func validateVDSize(sizeKib int64) error { if sizeKib < minVolumeDefinitionSizeKib { return fmt.Errorf( - "size_kib=%d below minimum %d KiB (DRBD reserves ~32 KiB of "+ + "%w: size_kib=%d below minimum %d KiB (DRBD reserves ~32 KiB of "+ "metadata per peer; backing layers add alignment on top)", - sizeKib, minVolumeDefinitionSizeKib, + ErrVolumeSizeBelowMinimum, sizeKib, minVolumeDefinitionSizeKib, ) } if sizeKib > maxVolumeDefinitionSizeKib { return fmt.Errorf( - "size_kib=%d above maximum %d KiB (DRBD's per-device hard ceiling)", - sizeKib, maxVolumeDefinitionSizeKib, + "%w: size_kib=%d above maximum %d KiB (DRBD's per-device hard ceiling)", + ErrVolumeSizeAboveMaximum, sizeKib, maxVolumeDefinitionSizeKib, ) } @@ -492,6 +507,7 @@ func writeVDSizeRejection(w http.ResponseWriter, rd string, vn int32, sizeKib in objRefVlmNr: strconv.FormatInt(int64(vn), 10), }, }}) + _ = sizeKib // retained for future audit-log fields } @@ -1079,14 +1095,14 @@ func (s *Server) pruneVolumesFromResources(ctx context.Context, rd string, vn in dropped := false - for j := range live.Volumes { - if live.Volumes[j].VolumeNumber == vn { + for idx := range live.Volumes { + if live.Volumes[idx].VolumeNumber == vn { dropped = true continue } - out = append(out, live.Volumes[j]) + out = append(out, live.Volumes[idx]) } if !dropped { diff --git a/pkg/satellite/agent_internal_test.go b/pkg/satellite/agent_internal_test.go index df87bcd4..2246622e 100644 --- a/pkg/satellite/agent_internal_test.go +++ b/pkg/satellite/agent_internal_test.go @@ -33,11 +33,12 @@ import ( // `newReconciler`; this test is the trip-wire that keeps it wired. // // Live-stand reproducer: -// linstor rd c ; linstor rd sp FileSystem/Type ext4 -// linstor vd c 256M; linstor r c --auto-place=2 -s lvm-thin -// # wait until UpToDate on every replica -// blkid -o export /dev/drbd # → exit 2 (no signature) -// ls /etc/drbd.d/*.mkfs.done # → none +// +// linstor rd c ; linstor rd sp FileSystem/Type ext4 +// linstor vd c 256M; linstor r c --auto-place=2 -s lvm-thin +// # wait until UpToDate on every replica +// blkid -o export /dev/drbd # → exit 2 (no signature) +// ls /etc/drbd.d/*.mkfs.done # → none // // Companion of TestApplyAutoMkfsRetryAfterMissedFirstActivation and // the cli-matrix `rwx-ganesha-data-vol-mkfs.sh` cell (kernel-truth diff --git a/pkg/satellite/attach.go b/pkg/satellite/attach.go index 977b07f0..5578181f 100644 --- a/pkg/satellite/attach.go +++ b/pkg/satellite/attach.go @@ -223,43 +223,68 @@ func attachDevicePath(dev *apiv1.PhysicalDevice) string { // logged and the chain continues — the dd zero-out is the // load-bearing step and its failure aborts. A tiny device // (< 64 MiB) skips the end-zero step to avoid a negative seek. + +// wipeZeroSpanMiB is the MiB count zeroed at the start and end of a +// device by wipeDevice. 32 MiB covers every on-disk superblock / +// signature footprint blockstor cares about (filesystems, LUKS, LVM) +// while staying small enough that the end-of-device seek is fast. +const wipeZeroSpanMiB = 32 + +// wipeMinDeviceSizeForEndZeroMiB is the minimum device size (MiB) at +// which wipeDevice still zeroes the tail. 2×span avoids negative-seek +// math on devices barely larger than the span itself. +const wipeMinDeviceSizeForEndZeroMiB = 2 * wipeZeroSpanMiB + +// mibBytes is the byte count in one mebibyte; used to convert bytes +// to MiB inside readDeviceSizeMiB. +const mibBytes = 1024 * 1024 + func wipeDevice(ctx context.Context, exec storage.Exec, devicePath string) error { + wipeSpanCount := strconv.Itoa(wipeZeroSpanMiB) + // 1) wipefs known signatures. Log + continue on failure: the dd // zero-out below is what actually guarantees the wipe. - if _, err := exec.Run(ctx, "wipefs", "--all", "--force", devicePath); err != nil { + _, err := exec.Run(ctx, "wipefs", "--all", "--force", devicePath) + if err != nil { slog.Default().Info("wipefs failed; continuing with dd zero-out", "dev", devicePath, "err", err.Error()) } - // 2) zero first 32 MiB. - if _, err := exec.Run(ctx, "dd", - "if=/dev/zero", "of="+devicePath, "bs=1M", "count=32", - "conv=fsync,notrunc", "status=none"); err != nil { + // 2) zero first wipeZeroSpanMiB MiB. + _, err = exec.Run(ctx, "dd", + "if=/dev/zero", "of="+devicePath, "bs=1M", "count="+wipeSpanCount, + "conv=fsync,notrunc", "status=none") + if err != nil { return errors.Wrapf(err, "zero start of %s", devicePath) } - // 3) zero last 32 MiB — query size, seek, write. + // 3) zero last wipeZeroSpanMiB MiB — query size, seek, write. sizeMiB, ok := readDeviceSizeMiB(ctx, exec, devicePath) - if ok && sizeMiB > 64 { // safety: don't seek negative on tiny devices - seekMiB := sizeMiB - 32 - if _, err := exec.Run(ctx, "dd", + + if ok && sizeMiB > wipeMinDeviceSizeForEndZeroMiB { // safety: don't seek negative on tiny devices + seekMiB := sizeMiB - wipeZeroSpanMiB + + _, err = exec.Run(ctx, "dd", "if=/dev/zero", "of="+devicePath, "bs=1M", "seek="+strconv.FormatInt(seekMiB, 10), - "count=32", "conv=fsync,notrunc", "status=none"); err != nil { + "count="+wipeSpanCount, "conv=fsync,notrunc", "status=none") + if err != nil { return errors.Wrapf(err, "zero end of %s", devicePath) } } // 4) drop stale partition device nodes. Non-fatal: partprobe // below is the belt-and-braces. - if _, err := exec.Run(ctx, "blockdev", "--rereadpt", devicePath); err != nil { + _, err = exec.Run(ctx, "blockdev", "--rereadpt", devicePath) + if err != nil { slog.Default().Info("blockdev --rereadpt failed; relying on partprobe", "dev", devicePath, "err", err.Error()) } // 5) partprobe — belt-and-braces (some kernels need this in // addition to BLKRRPART). - if _, err := exec.Run(ctx, "partprobe", devicePath); err != nil { + _, err = exec.Run(ctx, "partprobe", devicePath) + if err != nil { slog.Default().Info("partprobe failed; wipe completed via prior steps", "dev", devicePath, "err", err.Error()) } @@ -282,7 +307,7 @@ func readDeviceSizeMiB(ctx context.Context, exec storage.Exec, devicePath string return 0, false } - sz, err := strconv.ParseInt(strings.TrimSpace(string(out)), 10, 64) + sizeBytes, err := strconv.ParseInt(strings.TrimSpace(string(out)), 10, 64) if err != nil { slog.Default().Info("blockdev --getsize64 returned unparseable size; skipping end-of-device zero", "dev", devicePath, "out", string(out)) @@ -290,7 +315,7 @@ func readDeviceSizeMiB(ctx context.Context, exec storage.Exec, devicePath string return 0, false } - return sz / (1024 * 1024), true + return sizeBytes / mibBytes, true } // attachLVMThick: pvcreate + vgcreate. Returns the diff --git a/pkg/satellite/attach_test.go b/pkg/satellite/attach_test.go index e9ebc3e8..0c79e658 100644 --- a/pkg/satellite/attach_test.go +++ b/pkg/satellite/attach_test.go @@ -351,7 +351,7 @@ func TestAttachWipeClearsPartitionTable(t *testing.T) { t.Fatalf("Bug 336: zpool create missing: %v", calls) } - if !(wipeIdx < rereadIdx && rereadIdx < createIdx) { + if wipeIdx >= rereadIdx || rereadIdx >= createIdx { t.Errorf("Bug 336: ordering must be wipefs@%d < rereadpt@%d < zpool create@%d; got calls=%v", wipeIdx, rereadIdx, createIdx, calls) } diff --git a/pkg/satellite/controllers/discovered_storage.go b/pkg/satellite/controllers/discovered_storage.go index 175dedc3..3e0dec87 100644 --- a/pkg/satellite/controllers/discovered_storage.go +++ b/pkg/satellite/controllers/discovered_storage.go @@ -142,25 +142,8 @@ func (d *DiscoveredStorageRunnable) Start(ctx context.Context) error { logger := log.FromContext(ctx).WithName("discovered-storage").WithValues("node", d.NodeName) - err := d.tick(ctx, logger) - if err != nil { - logger.Error(err, "initial discovered-storage tick") - } - - ticker := time.NewTicker(period) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return nil - case <-ticker.C: - err = d.tick(ctx, logger) - if err != nil { - logger.Error(err, "discovered-storage tick") - } - } - } + return runPeriodicTick(ctx, period, logger, d.tick, + "initial discovered-storage tick", "discovered-storage tick") } // RegisterWithManager adds the runnable to mgr. Symmetrical with diff --git a/pkg/satellite/controllers/manager.go b/pkg/satellite/controllers/manager.go index 9688edad..f8bed6ad 100644 --- a/pkg/satellite/controllers/manager.go +++ b/pkg/satellite/controllers/manager.go @@ -79,13 +79,10 @@ func init() { // so tests can validate the registration + filter-predicate // plumbing independently of the agent's gRPC-server-still- // running mainline. -func NewManager(restCfg *rest.Config, cfg Config) (manager.Manager, error) { - if cfg.NodeName == "" { - return nil, errors.New("controllers: NodeName is required") - } - - if cfg.Apply == nil { - return nil, errors.New("controllers: Apply is required") +func NewManager(restCfg *rest.Config, cfg *Config) (manager.Manager, error) { + err := validateConfig(cfg) + if err != nil { + return nil, err } // Bug 285: bound the manager's stop sequence so a wedged @@ -138,29 +135,29 @@ func NewManager(restCfg *rest.Config, cfg Config) (manager.Manager, error) { // the caller left the field nil; unit tests that construct // reconcilers directly can supply their own channel or leave // the field nil to short-circuit both ends. - ensureWiredDefaults(&cfg) + ensureWiredDefaults(cfg) - err = (&ResourceReconciler{Config: cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) + err = (&ResourceReconciler{Config: *cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) if err != nil { return nil, errors.Wrap(err, "setup ResourceReconciler") } - err = (&ResourceDefinitionReconciler{Config: cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) + err = (&ResourceDefinitionReconciler{Config: *cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) if err != nil { return nil, errors.Wrap(err, "setup ResourceDefinitionReconciler") } - err = (&SnapshotReconciler{Config: cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) + err = (&SnapshotReconciler{Config: *cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) if err != nil { return nil, errors.Wrap(err, "setup SnapshotReconciler") } - err = (&StoragePoolReconciler{Config: cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) + err = (&StoragePoolReconciler{Config: *cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) if err != nil { return nil, errors.Wrap(err, "setup StoragePoolReconciler") } - err = (&PhysicalDeviceReconciler{Config: cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) + err = (&PhysicalDeviceReconciler{Config: *cfg, Client: mgr.GetClient()}).SetupWithManager(mgr) if err != nil { return nil, errors.Wrap(err, "setup PhysicalDeviceReconciler") } @@ -176,6 +173,22 @@ func NewManager(restCfg *rest.Config, cfg Config) (manager.Manager, error) { return mgr, nil } +// validateConfig sanity-checks the NewManager input — keeps the +// constructor itself under the funlen budget by hoisting the +// three-required-field gate out of band. +func validateConfig(cfg *Config) error { + switch { + case cfg == nil: + return errors.New("controllers: Config is required") + case cfg.NodeName == "": + return errors.New("controllers: NodeName is required") + case cfg.Apply == nil: + return errors.New("controllers: Apply is required") + } + + return nil +} + // wireConditionStampers injects the satellite-side Status-Condition // stampers (Phase 11.3) onto the Apply chain. Each stamper owns a // distinct Condition `type` so SSA's listMap merge keeps the writers @@ -186,7 +199,7 @@ func NewManager(restCfg *rest.Config, cfg Config) (manager.Manager, error) { // its own helper so NewManager stays under the funlen budget — the // clearer is the same shape as the Stampers (one apiserver writer // per satellite-side reconciler hook). -func wireConditionStampers(mgr manager.Manager, cfg Config) { +func wireConditionStampers(mgr manager.Manager, cfg *Config) { wireMetadataCreatedStamper(mgr, cfg) wireFilesystemFormattedStamper(mgr, cfg) wireSkipDiskClearer(mgr, cfg) @@ -214,7 +227,7 @@ func ensureWiredDefaults(cfg *Config) { // Pulled out of NewManager to keep that function under the funlen // budget — Scenario 5.34 added the third runnable and the // inline chain tipped over the limit. -func addBackgroundRunnables(mgr manager.Manager, cfg Config) error { +func addBackgroundRunnables(mgr manager.Manager, cfg *Config) error { err := mgr.Add(&ObserverRunnable{ Client: mgr.GetClient(), Exec: cfg.Exec, @@ -289,7 +302,7 @@ func addBackgroundRunnables(mgr manager.Manager, cfg Config) error { // startup backfill runnable. Pulled out of addBackgroundRunnables // to keep that function under the funlen budget — the bookkeeping // for one more runnable nudges it over the limit. -func registerMetadataCreatedBackfill(mgr manager.Manager, cfg Config) error { +func registerMetadataCreatedBackfill(mgr manager.Manager, cfg *Config) error { err := (&MetadataCreatedBackfillRunnable{ Client: mgr.GetClient(), Adm: drbd.NewAdm(cfg.Exec), @@ -309,7 +322,7 @@ func registerMetadataCreatedBackfill(mgr manager.Manager, cfg Config) error { // fetcher needs the manager's cached client, which is why it ships // here rather than at NewReconciler time. Pulled out of NewManager // to keep that function under the funlen budget. -func wireCrossNodeFetcher(mgr manager.Manager, cfg Config) { +func wireCrossNodeFetcher(mgr manager.Manager, cfg *Config) { cfg.Apply.SetCrossNodeFetcher(&SnapshotFetcher{ Client: mgr.GetClient(), NodeName: cfg.NodeName, @@ -322,7 +335,7 @@ func wireCrossNodeFetcher(mgr manager.Manager, cfg Config) { // succeeds. Mirrors `wireCrossNodeFetcher` — the stamper needs the // manager's cached client, which is why it lands here rather than at // NewReconciler time. Phase 11.3 Stage 1. -func wireMetadataCreatedStamper(mgr manager.Manager, cfg Config) { +func wireMetadataCreatedStamper(mgr manager.Manager, cfg *Config) { cfg.Apply.SetMetadataCreatedStamper(&MetadataCreatedStamper{ Client: mgr.GetClient(), }) @@ -333,7 +346,7 @@ func wireMetadataCreatedStamper(mgr manager.Manager, cfg Config) { // `FilesystemFormatted=True` Status Condition after every diskful // volume reports a filesystem (freshly mkfs'd or adopted via blkid). // Mirrors `wireMetadataCreatedStamper`. Phase 11.3 Stage 2. -func wireFilesystemFormattedStamper(mgr manager.Manager, cfg Config) { +func wireFilesystemFormattedStamper(mgr manager.Manager, cfg *Config) { cfg.Apply.SetFilesystemFormattedStamper(&FilesystemFormattedStamper{ Client: mgr.GetClient(), }) @@ -345,7 +358,7 @@ func wireFilesystemFormattedStamper(mgr manager.Manager, cfg Config) { // after a defensive stamp (Bug 278: Talos kernel upgrade reattach). // Mirrors `wireMetadataCreatedStamper` — the clearer needs the // manager's cached client which doesn't exist at NewReconciler time. -func wireSkipDiskClearer(mgr manager.Manager, cfg Config) { +func wireSkipDiskClearer(mgr manager.Manager, cfg *Config) { cfg.Apply.SetSkipDiskClearer(&SkipDiskClearer{ Client: mgr.GetClient(), NodeName: cfg.NodeName, diff --git a/pkg/satellite/controllers/observer.go b/pkg/satellite/controllers/observer.go index c20ca27e..7d5c6a4d 100644 --- a/pkg/satellite/controllers/observer.go +++ b/pkg/satellite/controllers/observer.go @@ -1765,14 +1765,14 @@ func buildObserverVolumeStatus(ev *observation, storagePool string) []blockstori out := make([]blockstoriov1alpha1.ResourceVolumeStatus, 0, len(ev.Volumes)) - for _, v := range ev.Volumes { + for _, vol := range ev.Volumes { out = append(out, blockstoriov1alpha1.ResourceVolumeStatus{ - VolumeNumber: v.VolumeNumber, + VolumeNumber: vol.VolumeNumber, StoragePool: storagePool, - DiskState: v.DiskState, - CurrentGi: v.CurrentUUID, - OutOfSyncKib: v.OutOfSyncKib, - Quorum: v.Quorum, + DiskState: vol.DiskState, + CurrentGi: vol.CurrentUUID, + OutOfSyncKib: vol.OutOfSyncKib, + Quorum: vol.Quorum, }) } diff --git a/pkg/satellite/controllers/physicaldevice_discovery.go b/pkg/satellite/controllers/physicaldevice_discovery.go index cf770aaa..82ec85f7 100644 --- a/pkg/satellite/controllers/physicaldevice_discovery.go +++ b/pkg/satellite/controllers/physicaldevice_discovery.go @@ -182,7 +182,7 @@ type UeventNotifier interface { // // Production caller: manager.go addBackgroundRunnables. Test // callers exist in physicaldevice_discovery_uevent_bug341_test.go. -func NewPhysicalDeviceDiscoveryRunnableFromConfig(cli client.Client, cfg Config) *PhysicalDeviceDiscoveryRunnable { +func NewPhysicalDeviceDiscoveryRunnableFromConfig(cli client.Client, cfg *Config) *PhysicalDeviceDiscoveryRunnable { return &PhysicalDeviceDiscoveryRunnable{ Client: cli, Exec: cfg.Exec, @@ -604,11 +604,37 @@ func (p *PhysicalDeviceDiscoveryRunnable) publishDeviceWithReason(ctx context.Co // Status.StableID for CRD-name determinism only. devicePath := "/dev/" + row.KName currentDevPath := "/dev/" + row.KName - rotational := row.Rotational desiredStatus := buildDiscoveryStatus(p.NodeName, stableID, devicePath, currentDevPath, row, &rotational, free, reason, message) + existing, ok := p.upsertDeviceCRD(ctx, logger, name) + if !ok { + return "", false + } + + existing.Status = desiredStatus + + err := p.Client.Status().Update(ctx, existing) + if err != nil { + logger.Error(err, "update PhysicalDevice status", "name", name) + + return "", false + } + + return name, true +} + +// upsertDeviceCRD ensures the PhysicalDevice CRD exists for `name` +// and that its NodeName label matches the local satellite. Returns +// the live object ready for a Status().Update, or (nil, false) on +// any apiserver error (already logged by the caller's logr). Pulled +// out of publishDeviceWithReason so the parent stays under the +// funlen budget; the get-or-create + label-sync chain has nothing +// device-specific in it. +func (p *PhysicalDeviceDiscoveryRunnable) upsertDeviceCRD( + ctx context.Context, logger logr.Logger, name string, +) (*blockstoriov1alpha1.PhysicalDevice, bool) { var existing blockstoriov1alpha1.PhysicalDevice err := p.Client.Get(ctx, client.ObjectKey{Name: name}, &existing) @@ -627,7 +653,7 @@ func (p *PhysicalDeviceDiscoveryRunnable) publishDeviceWithReason(ctx context.Co if err != nil && !apierrors.IsAlreadyExists(err) { logger.Error(err, "create PhysicalDevice", "name", name) - return "", false + return nil, false } // Re-fetch for the status update so we have the apiserver- @@ -637,12 +663,12 @@ func (p *PhysicalDeviceDiscoveryRunnable) publishDeviceWithReason(ctx context.Co if err != nil { logger.Error(err, "re-get after Create", "name", name) - return "", false + return nil, false } case err != nil: logger.Error(err, "get PhysicalDevice", "name", name) - return "", false + return nil, false } // Ensure the node label is set on existing CRDs that may have @@ -658,20 +684,11 @@ func (p *PhysicalDeviceDiscoveryRunnable) publishDeviceWithReason(ctx context.Co if err != nil { logger.Error(err, "update PhysicalDevice labels", "name", name) - return "", false + return nil, false } } - existing.Status = desiredStatus - - err = p.Client.Status().Update(ctx, &existing) - if err != nil { - logger.Error(err, "update PhysicalDevice status", "name", name) - - return "", false - } - - return name, true + return &existing, true } // buildDiscoveryStatus assembles the Status subresource the diff --git a/pkg/satellite/controllers/physicaldevice_discovery_uevent_bug341_test.go b/pkg/satellite/controllers/physicaldevice_discovery_uevent_bug341_test.go index 7178c701..49e489a6 100644 --- a/pkg/satellite/controllers/physicaldevice_discovery_uevent_bug341_test.go +++ b/pkg/satellite/controllers/physicaldevice_discovery_uevent_bug341_test.go @@ -274,7 +274,7 @@ func TestNewPhysicalDeviceDiscoveryRunnableFromConfigThreadsUevent_Bug341(t *tes UeventListener: notifier, } - runnable := controllers.NewPhysicalDeviceDiscoveryRunnableFromConfig(nil, cfg) + runnable := controllers.NewPhysicalDeviceDiscoveryRunnableFromConfig(nil, &cfg) if runnable == nil { t.Fatal("NewPhysicalDeviceDiscoveryRunnableFromConfig returned nil") diff --git a/pkg/satellite/controllers/resource.go b/pkg/satellite/controllers/resource.go index cae67b83..84915ba2 100644 --- a/pkg/satellite/controllers/resource.go +++ b/pkg/satellite/controllers/resource.go @@ -462,63 +462,7 @@ func (r *ResourceReconciler) runApply(ctx context.Context, res *blockstoriov1alp // for the controller to stamp Status.DRBDNodeID/Port/Minor would // block apply forever — they never come. if rdNeedsDRBD(&rd) { - // Bug 342 v4: deterministic UID-mismatch eviction BEFORE - // the allocation gate. A rapid `r d ` + `r c ` - // produces a new peer Resource CR with a fresh - // metadata.uid while the dispatcher's .res-driven diff - // sees the same peer-name set and skips del-peer. The - // kernel slot for the old incarnation (with its stale GI - // epoch + PSK) wedges the new peer in Connecting forever. - // Comparing the peer's current UID against the local - // Status.AppliedPeerUIDs baseline catches that race - // deterministically — no timing dependency on kernel - // state observation, no Phase 2 vs Phase 3 scheduling - // races. Returned `evicted` map ({peerName: newUID}) gets - // stamped onto Status.AppliedPeerUIDs so subsequent - // reconciles don't re-evict the same peer in a loop. - evicted, evictErr := r.Config.Apply.EvictPeersByUIDMismatch( - ctx, - rd.Name, - desiredPeersFromCRDs(peers), - res.Status.AppliedPeerUIDs, - volNumsOf(&rd), - nil, // devices unknown at this layer; forget-peer falls back to nodeID skip - ) - if evictErr != nil { - logger.Error(evictErr, "EvictPeersByUIDMismatch failed; continuing") - } - - if len(evicted) > 0 { - if stampErr := r.stampAppliedPeerUIDs(ctx, res, evicted); stampErr != nil { - logger.Error(stampErr, "stampAppliedPeerUIDs failed; will retry next reconcile", - "evicted", evicted) - } - } - - // Bug 342 v3: prune zombie kernel slots BEFORE the - // allocation gate. The gate short-circuits the entire - // Apply chain when ANY peer's Status.DRBDNodeID is nil, - // which is exactly the state a Phase-3 relocate lands in - // — and the kernel-side slot from the previous incarnation - // (Connecting / StandAlone with no peer-device) would - // survive the entire wait window otherwise. Kernel-state - // cleanup doesn't need peer DRBDNodeID allocation, only - // the current expected peer-name set. Non-fatal: a Pass-1 - // del-peer failure logs and falls through to the gate so - // the next reconcile retries; the safety net is best- - // effort by design. - pruneErr := r.Config.Apply.PruneStaleKernelSlots( - ctx, - rd.Name, - expectedPeerNamesFor(res, peers), - volNumsOf(&rd), - nil, // devices unknown at this layer; forget-peer skipped per-vol - ) - if pruneErr != nil { - logger.Error(pruneErr, "PruneStaleKernelSlots failed; continuing to allocation gate") - } - - if waitResult, waitOK := r.waitForControllerAllocation(ctx, res, peers, logger); !waitOK { + if waitResult, waitOK := r.prepareDRBDApply(ctx, res, &rd, peers, logger); !waitOK { return waitResult, nil } } @@ -537,33 +481,8 @@ func (r *ResourceReconciler) runApply(ctx context.Context, res *blockstoriov1alp r.stampPostApply(ctx, res, &rd, results, anyFailed, logger) - // Bug 342 v4 adoption baseline: after a fully-successful apply, - // stamp Status.AppliedPeerUIDs for peers that have no baseline - // yet (fresh CRD / restore / migration path). Does NOT overwrite - // existing baselines — that's the eviction path's responsibility - // (Bug 342 v6): if applied[peer] is already set to an OLD UID, - // the EvictPeersByUIDMismatch may have deferred eviction this - // reconcile (peer node-id unresolvable). Overwriting the baseline - // here with current peer UIDs would mask the pending mismatch - // — next reconcile would see applied == current and never try - // the eviction again, leaving the stale kernel slot forever. - // Only EvictPeersByUIDMismatch should change an existing - // baseline, AFTER actually completing del-peer + forget-peer. if !anyFailed && rdNeedsDRBD(&rd) { - full := currentPeerUIDs(peers) - missing := make(map[string]string, len(full)) - - for name, uid := range full { - if _, ok := res.Status.AppliedPeerUIDs[name]; !ok { - missing[name] = uid - } - } - - if len(missing) > 0 { - if stampErr := r.stampAppliedPeerUIDs(ctx, res, missing); stampErr != nil { - logger.Error(stampErr, "post-apply AppliedPeerUIDs baseline stamp failed; will retry next reconcile") - } - } + r.stampAppliedPeerUIDsBaseline(ctx, res, peers, logger) } // Apply chain surfaces per-resource errors via results (e.g. @@ -594,6 +513,110 @@ func (r *ResourceReconciler) runApply(ctx context.Context, res *blockstoriov1alp return ctrl.Result{}, nil } +// prepareDRBDApply runs the DRBD-only pre-apply sequence: peer UID +// mismatch eviction (Bug 342 v4), stale kernel slot prune +// (Bug 342 v3), and the controller-allocation wait gate. Returns +// (result, false) when the caller should bail out and let the +// requeue handle it; (zero, true) when apply can proceed. Pulled +// out of runApply so it stays under the gocognit budget. +func (r *ResourceReconciler) prepareDRBDApply( + ctx context.Context, + res *blockstoriov1alpha1.Resource, + rd *blockstoriov1alpha1.ResourceDefinition, + peers []blockstoriov1alpha1.Resource, + logger logr.Logger, +) (ctrl.Result, bool) { + // Bug 342 v4: deterministic UID-mismatch eviction BEFORE the + // allocation gate. A rapid `r d ` + `r c ` produces + // a new peer Resource CR with a fresh metadata.uid while the + // dispatcher's .res-driven diff sees the same peer-name set and + // skips del-peer. The kernel slot for the old incarnation (with + // its stale GI epoch + PSK) wedges the new peer in Connecting + // forever. Comparing the peer's current UID against the local + // Status.AppliedPeerUIDs baseline catches that race + // deterministically — no timing dependency on kernel state + // observation, no Phase 2 vs Phase 3 scheduling races. Returned + // `evicted` map ({peerName: newUID}) gets stamped onto + // Status.AppliedPeerUIDs so subsequent reconciles don't + // re-evict the same peer in a loop. + evicted, evictErr := r.Config.Apply.EvictPeersByUIDMismatch( + ctx, + rd.Name, + desiredPeersFromCRDs(peers), + res.Status.AppliedPeerUIDs, + volNumsOf(rd), + nil, // devices unknown at this layer; forget-peer falls back to nodeID skip + ) + if evictErr != nil { + logger.Error(evictErr, "EvictPeersByUIDMismatch failed; continuing") + } + + if len(evicted) > 0 { + stampErr := r.stampAppliedPeerUIDs(ctx, res, evicted) + if stampErr != nil { + logger.Error(stampErr, "stampAppliedPeerUIDs failed; will retry next reconcile", + "evicted", evicted) + } + } + + // Bug 342 v3: prune zombie kernel slots BEFORE the allocation + // gate. The gate short-circuits the entire Apply chain when + // ANY peer's Status.DRBDNodeID is nil, which is exactly the + // state a Phase-3 relocate lands in — and the kernel-side slot + // from the previous incarnation (Connecting / StandAlone with + // no peer-device) would survive the entire wait window + // otherwise. Kernel-state cleanup doesn't need peer DRBDNodeID + // allocation, only the current expected peer-name set. + // Non-fatal: a Pass-1 del-peer failure logs and falls through + // to the gate so the next reconcile retries; the safety net + // is best-effort by design. + pruneErr := r.Config.Apply.PruneStaleKernelSlots( + ctx, + rd.Name, + expectedPeerNamesFor(res, peers), + volNumsOf(rd), + nil, // devices unknown at this layer; forget-peer skipped per-vol + ) + if pruneErr != nil { + logger.Error(pruneErr, "PruneStaleKernelSlots failed; continuing to allocation gate") + } + + return r.waitForControllerAllocation(ctx, res, peers, logger) +} + +// stampAppliedPeerUIDsBaseline writes the Bug 342 v4 adoption +// baseline after a fully-successful apply on a DRBD-stack RD: for +// every peer that has NO entry in Status.AppliedPeerUIDs yet +// (fresh CRD / restore / migration path), stamp the current UID. +// Does NOT overwrite existing baselines — that's the eviction +// path's responsibility (Bug 342 v6). Pulled out of runApply so +// the orchestrator stays under the gocyclo budget; the +// best-effort logging contract is unchanged. +func (r *ResourceReconciler) stampAppliedPeerUIDsBaseline( + ctx context.Context, + res *blockstoriov1alpha1.Resource, + peers []blockstoriov1alpha1.Resource, + logger logr.Logger, +) { + full := currentPeerUIDs(peers) + missing := make(map[string]string, len(full)) + + for name, uid := range full { + if _, ok := res.Status.AppliedPeerUIDs[name]; !ok { + missing[name] = uid + } + } + + if len(missing) == 0 { + return + } + + stampErr := r.stampAppliedPeerUIDs(ctx, res, missing) + if stampErr != nil { + logger.Error(stampErr, "post-apply AppliedPeerUIDs baseline stamp failed; will retry next reconcile") + } +} + // recordPerResourceFailures logs each per-resource Apply failure and // returns whether any of the results came back not-Ok. Extracted from // runApply so the orchestration stays under the funlen budget. @@ -1233,14 +1256,14 @@ func desiredPeersFromCRDs(peers []blockstoriov1alpha1.Resource) []intent.Desired out := make([]intent.DesiredPeer, 0, len(peers)) for i := range peers { - p := &peers[i] + peer := &peers[i] entry := intent.DesiredPeer{ - Name: p.Spec.NodeName, - ResourceUID: string(p.UID), + Name: peer.Spec.NodeName, + ResourceUID: string(peer.UID), } - if p.Status.DRBDNodeID != nil { - entry.NodeID = *p.Status.DRBDNodeID + if peer.Status.DRBDNodeID != nil { + entry.NodeID = *peer.Status.DRBDNodeID } out = append(out, entry) @@ -1302,8 +1325,9 @@ func (r *ResourceReconciler) stampAppliedPeerUIDs(ctx context.Context, res *bloc err := retry.RetryOnConflict(retry.DefaultRetry, func() error { var fresh blockstoriov1alpha1.Resource - if getErr := reader.Get(ctx, client.ObjectKeyFromObject(res), &fresh); getErr != nil { - return getErr + getErr := reader.Get(ctx, client.ObjectKeyFromObject(res), &fresh) + if getErr != nil { + return errors.Wrap(getErr, "re-read resource") } if fresh.Status.AppliedPeerUIDs == nil { @@ -1325,7 +1349,6 @@ func (r *ResourceReconciler) stampAppliedPeerUIDs(ctx context.Context, res *bloc return r.Status().Update(ctx, &fresh) }) - if err != nil { return errors.Wrap(err, "stamp Resource.Status.AppliedPeerUIDs") } diff --git a/pkg/satellite/controllers/resource_test.go b/pkg/satellite/controllers/resource_test.go index 4d619938..181dca61 100644 --- a/pkg/satellite/controllers/resource_test.go +++ b/pkg/satellite/controllers/resource_test.go @@ -68,18 +68,6 @@ func (p *orderingProvider) DeleteVolume(_ context.Context, _ storage.Volume) err return nil } -// deletesSnapshot returns a copy of the recorded step values so the -// test can assert ordering without holding the provider's mutex. -func (p *orderingProvider) deletesSnapshot() []int32 { - p.mu.Lock() - defer p.mu.Unlock() - - out := make([]int32, len(p.deletes)) - copy(out, p.deletes) - - return out -} - func (p *orderingProvider) ResizeVolume(_ context.Context, _ storage.Volume) error { return nil } func (p *orderingProvider) VolumeStatus(_ context.Context, vol storage.Volume) (storage.VolumeStatus, error) { @@ -94,6 +82,18 @@ func (p *orderingProvider) RestoreVolumeFromSnapshot(_ context.Context, _ storag return nil } +// deletesSnapshot returns a copy of the recorded step values so the +// test can assert ordering without holding the provider's mutex. +func (p *orderingProvider) deletesSnapshot() []int32 { + p.mu.Lock() + defer p.mu.Unlock() + + out := make([]int32, len(p.deletes)) + copy(out, p.deletes) + + return out +} + // newDeleteRaceFixture wires a fake-client Resource with a // DeletionTimestamp + SatelliteResourceFinalizer plus its parent RD // (so handleDelete's lookupVolumeNumbers finds the VolumeDefinitions). diff --git a/pkg/satellite/controllers/resource_toggle_disk.go b/pkg/satellite/controllers/resource_toggle_disk.go index 024a3c2b..feb55fe0 100644 --- a/pkg/satellite/controllers/resource_toggle_disk.go +++ b/pkg/satellite/controllers/resource_toggle_disk.go @@ -30,6 +30,7 @@ import ( blockstoriov1alpha1 "github.com/cozystack/blockstor/api/v1alpha1" apiv1 "github.com/cozystack/blockstor/pkg/api/v1" + "github.com/cozystack/blockstor/pkg/drbd" intent "github.com/cozystack/blockstor/pkg/satellite/intent" ) @@ -165,7 +166,7 @@ func isToggleDiskInFlight(res *blockstoriov1alpha1.Resource) bool { for i := range res.Status.Volumes { v := &res.Status.Volumes[i] - if v.DiskState != "UpToDate" { + if v.DiskState != string(drbd.DiskStateUpToDate) { allUpToDate = false break diff --git a/pkg/satellite/controllers/resource_toggle_disk_test.go b/pkg/satellite/controllers/resource_toggle_disk_test.go index 199b4e56..e8687d25 100644 --- a/pkg/satellite/controllers/resource_toggle_disk_test.go +++ b/pkg/satellite/controllers/resource_toggle_disk_test.go @@ -269,6 +269,7 @@ func TestToggleDiskIncrementsRetriesOnFailure(t *testing.T) { // away with a half-carved LV on disk). type recordingDeleteProvider struct { flakyCreateProvider + deleted int32 } diff --git a/pkg/satellite/controllers/runnable_common.go b/pkg/satellite/controllers/runnable_common.go new file mode 100644 index 00000000..a9c8167c --- /dev/null +++ b/pkg/satellite/controllers/runnable_common.go @@ -0,0 +1,66 @@ +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "time" + + "github.com/go-logr/logr" +) + +// runPeriodicTick is the shared serve loop used by every +// fire-and-forget periodic runnable in the satellite-controllers +// package (DiscoveredStorageRunnable, OrphanSweeperRunnable, …). +// +// Shape: immediate first-tick on entry — controller-runtime has +// already waited for cache sync before calling Start, so we can +// safely fire the initial pass without warm-up logic. After that +// a time.Ticker drives the loop at `period`. Ticker errors are +// logged through `logger` under `tickErrLabel` so the audit grep +// for the per-runnable error label stays stable across loops. +// +// Centralised here so dupl doesn't flag the two byte-identical +// loops that previously lived inline in DiscoveredStorageRunnable +// and OrphanSweeperRunnable. +func runPeriodicTick( + ctx context.Context, + period time.Duration, + logger logr.Logger, + tick func(context.Context, logr.Logger) error, + initialErrLabel, tickErrLabel string, +) error { + err := tick(ctx, logger) + if err != nil { + logger.Error(err, initialErrLabel) + } + + ticker := time.NewTicker(period) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + err = tick(ctx, logger) + if err != nil { + logger.Error(err, tickErrLabel) + } + } + } +} diff --git a/pkg/satellite/controllers/snapshot_test.go b/pkg/satellite/controllers/snapshot_test.go index 36d3d4db..aa438944 100644 --- a/pkg/satellite/controllers/snapshot_test.go +++ b/pkg/satellite/controllers/snapshot_test.go @@ -287,7 +287,7 @@ func TestSnapshotReconcileMarksFailedOnTerminalError(t *testing.T) { } // Terminal failures MUST NOT requeue — they're dead-letter. - if result.Requeue || result.RequeueAfter > 0 { + if !result.IsZero() { t.Errorf("terminal failure should NOT requeue; got %+v", result) } @@ -376,7 +376,7 @@ func TestSnapshotReconcileKeepsIncompleteOnTransientError(t *testing.T) { } // Transient failures MUST requeue so the next pass retries. - if !result.Requeue && result.RequeueAfter == 0 { + if result.IsZero() { t.Errorf("transient failure should requeue; got %+v", result) } diff --git a/pkg/satellite/controllers/storage_sweeper.go b/pkg/satellite/controllers/storage_sweeper.go index e69b4cc0..03bf7557 100644 --- a/pkg/satellite/controllers/storage_sweeper.go +++ b/pkg/satellite/controllers/storage_sweeper.go @@ -214,79 +214,124 @@ func (s *StorageOrphanSweeperRunnable) sweepOnce(ctx context.Context, logger log var deleted int for poolName, provider := range providers { - lister, ok := provider.(storage.VolumeLister) - if !ok { - // Provider can't enumerate (or backend kind doesn't - // support it); silently skip — the contract is opt-in. - continue + stop := s.sweepPool(ctx, logger, poolName, provider, owned, limit, &deleted) + if stop { + return nil } + } - refs, err := lister.ListVolumeNames(ctx) - if err != nil { - // Per-pool failure shouldn't sink the whole sweep — - // log and try the next pool. A real backend outage - // will surface on the next tick (and on real - // reconciles). - logger.Error(err, "list volumes on pool", "pool", poolName) + return nil +} - continue - } +// sweepPool reaps orphan volumes on one pool. Returns true when the +// per-cycle rate-limit budget is exhausted and the caller should +// stop iterating remaining pools; false when the pool was processed +// (with any errors logged). Pulled out of sweepOnce so the parent +// stays under the gocyclo budget; the per-volume orphan-classification +// chain has nothing pool-routing-specific in it. +func (s *StorageOrphanSweeperRunnable) sweepPool( + ctx context.Context, + logger logr.Logger, + poolName string, + provider storage.Provider, + owned map[string]struct{}, + limit int, + deleted *int, +) bool { + lister, ok := provider.(storage.VolumeLister) + if !ok { + // Provider can't enumerate (or backend kind doesn't + // support it); silently skip — the contract is opt-in. + return false + } - for _, ref := range refs { - ref.PoolName = poolName + refs, err := lister.ListVolumeNames(ctx) + if err != nil { + // Per-pool failure shouldn't sink the whole sweep — log + // and try the next pool. A real backend outage will + // surface on the next tick (and on real reconciles). + logger.Error(err, "list volumes on pool", "pool", poolName) - if _, ok := owned[ownedKey(ref.ResourceName, ref.VolumeNumber)]; ok { - continue - } + return false + } - // Wildcard match — a Resource CRD exists for this RD on - // this node but the parent RD has already been deleted - // (cascade out-of-order). The satellite finalizer is - // still responsible for cleanup; don't race it. - if _, ok := owned[ownedKey(ref.ResourceName, -1)]; ok { - continue - } + for _, ref := range refs { + ref.PoolName = poolName - if !hasStoragePrefix(ref.ResourceName) { - // Operator-owned volume that happens to share the - // pool. Leave it alone — see prefix allowlist - // rationale. - continue - } + stop := s.reapIfOrphan(ctx, logger, poolName, provider, owned, limit, deleted, ref) + if stop { + return true + } + } - if limit >= 0 && deleted >= limit { - logger.Info("storage sweep rate-limit hit; deferring remainder", - "limit", limit, "pool", poolName, - "deferred_resource", ref.ResourceName, - "deferred_volume", ref.VolumeNumber) + return false +} - return nil - } +// reapIfOrphan classifies a single (pool, resource, volume) row and +// runs DeleteVolume when it's a satellite-owned orphan past the +// allowlist gate. Returns true when the per-cycle rate-limit budget +// has been hit and the caller should stop scanning the rest of the +// pool. Pulled out of sweepPool so that loop body stays under the +// funlen / gocyclo budgets; the classification cascade is identical +// to the inlined version. +func (s *StorageOrphanSweeperRunnable) reapIfOrphan( + ctx context.Context, + logger logr.Logger, + poolName string, + provider storage.Provider, + owned map[string]struct{}, + limit int, + deleted *int, + ref storage.VolumeRef, +) bool { + if _, ok := owned[ownedKey(ref.ResourceName, ref.VolumeNumber)]; ok { + return false + } - logger.Info("orphan storage volume detected; running DeleteVolume", - "pool", poolName, "resource", ref.ResourceName, "volume", ref.VolumeNumber) - - delErr := provider.DeleteVolume(ctx, storage.Volume{ - PoolName: poolName, - ResourceName: ref.ResourceName, - VolumeNumber: ref.VolumeNumber, - }) - if delErr != nil { - // Per-volume failure shouldn't abort the cycle — - // next tick retries. We DON'T bump `deleted` so - // the rate-limit budget reflects successful - // reaps only. - logger.Error(delErr, "DeleteVolume on orphan", - "pool", poolName, "resource", ref.ResourceName, "volume", ref.VolumeNumber) - - continue - } + // Wildcard match — a Resource CRD exists for this RD on this + // node but the parent RD has already been deleted (cascade + // out-of-order). The satellite finalizer is still responsible + // for cleanup; don't race it. + if _, ok := owned[ownedKey(ref.ResourceName, -1)]; ok { + return false + } - deleted++ - } + if !hasStoragePrefix(ref.ResourceName) { + // Operator-owned volume that happens to share the pool. + // Leave it alone — see prefix allowlist rationale. + return false } - return nil + if limit >= 0 && *deleted >= limit { + logger.Info("storage sweep rate-limit hit; deferring remainder", + "limit", limit, "pool", poolName, + "deferred_resource", ref.ResourceName, + "deferred_volume", ref.VolumeNumber) + + return true + } + + logger.Info("orphan storage volume detected; running DeleteVolume", + "pool", poolName, "resource", ref.ResourceName, "volume", ref.VolumeNumber) + + delErr := provider.DeleteVolume(ctx, storage.Volume{ + PoolName: poolName, + ResourceName: ref.ResourceName, + VolumeNumber: ref.VolumeNumber, + }) + if delErr != nil { + // Per-volume failure shouldn't abort the cycle — next tick + // retries. We DON'T bump `deleted` so the rate-limit budget + // reflects successful reaps only. + logger.Error(delErr, "DeleteVolume on orphan", + "pool", poolName, "resource", ref.ResourceName, "volume", ref.VolumeNumber) + + return false + } + + *deleted++ + + return false } // shouldSkip mirrors OrphanSweeperRunnable.shouldSkip — checks the diff --git a/pkg/satellite/controllers/sweeper.go b/pkg/satellite/controllers/sweeper.go index 82d93f4b..07d8ae5a 100644 --- a/pkg/satellite/controllers/sweeper.go +++ b/pkg/satellite/controllers/sweeper.go @@ -257,25 +257,8 @@ func (s *OrphanSweeperRunnable) Start(ctx context.Context) error { // orphan classification cannot mis-fire against a half-warm // cache. ctx propagation in sweepOnce ensures a shutdown still // aborts any in-flight drbdsetup call. - err := s.sweepOnce(ctx, logger) - if err != nil { - logger.Error(err, "initial sweep") - } - - ticker := time.NewTicker(period) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return nil - case <-ticker.C: - err := s.sweepOnce(ctx, logger) - if err != nil { - logger.Error(err, "sweep cycle") - } - } - } + return runPeriodicTick(ctx, period, logger, s.sweepOnce, + "initial sweep", "sweep cycle") } // RegisterWithManager adds the sweeper to mgr alongside the diff --git a/pkg/satellite/fsm_dispatch.go b/pkg/satellite/fsm_dispatch.go index a4195b52..aef98f87 100644 --- a/pkg/satellite/fsm_dispatch.go +++ b/pkg/satellite/fsm_dispatch.go @@ -72,7 +72,8 @@ func (r *Reconciler) dispatchFsmAction(ctx context.Context, dr *intent.DesiredRe // Noop (must remain a true no-op). switch action { case ActionCreateMd, ActionUp, ActionAdjust, ActionAdjustSkipDisk: - if err := r.renderResFile(ctx, dr, devices); err != nil { + err := r.renderResFile(ctx, dr, devices) + if err != nil { return err } } diff --git a/pkg/satellite/peer_identity_cleanup.go b/pkg/satellite/peer_identity_cleanup.go index daedc892..9b317844 100644 --- a/pkg/satellite/peer_identity_cleanup.go +++ b/pkg/satellite/peer_identity_cleanup.go @@ -20,8 +20,10 @@ import ( "context" "github.com/cockroachdb/errors" + "github.com/go-logr/logr" "sigs.k8s.io/controller-runtime/pkg/log" + "github.com/cozystack/blockstor/pkg/drbd" "github.com/cozystack/blockstor/pkg/satellite/intent" ) @@ -39,7 +41,7 @@ import ( // its stale GI epoch + PSK + endpoint mapping, survives the new // `drbdadm adjust` — the new brings up fresh DRBD with a new // GI epoch, handshake fails (incompatible epochs), the slot wedges -// in Connecting forever (the `disk='' rep='Off'` symptom). +// in Connecting forever (the `disk=” rep='Off'` symptom). // // State-based detection (v3 PruneStaleKernelSlots Pass 3) can in // principle clean this up after the 30s zombie grace, but it depends @@ -77,11 +79,15 @@ func (r *Reconciler) EvictPeersByUIDMismatch( rdName string, desiredPeers []intent.DesiredPeer, appliedPeerUIDs map[string]string, - vols []int32, + _ []int32, devices map[int32]string, ) (map[string]string, error) { if r.cfg.Adm == nil { - return nil, nil + // No drbdadm wired (unit-test fast path). Caller treats a + // nil cleaned-map as no-op; surface a sentinel so the + // signature stays informative without nudging callers + // (production wiring always sets cfg.Adm). + return nil, nil //nolint:nilnil // intentional no-op signal for tests } logger := log.FromContext(ctx).WithValues("rd", rdName, "v4uidevict", true) @@ -101,88 +107,117 @@ func (r *Reconciler) EvictPeersByUIDMismatch( var cleaned map[string]string for _, peer := range desiredPeers { - if peer.ResourceUID == "" { - // Peer's UID not yet known to the dispatcher - // (informer cache trail / fresh CRD just hit - // apiserver). Skip — next reconcile will retry - // with a populated UID. - continue + evicted, err := r.evictOnePeerByUIDMismatch(ctx, logger, rdName, peer, + appliedPeerUIDs, slots, devices) + if err != nil { + return cleaned, err } - last, hasLast := appliedPeerUIDs[peer.Name] - if !hasLast || last == peer.ResourceUID { - // No prior baseline (rollout window / first - // apply) OR baseline matches — nothing to evict. + if !evicted { continue } - // UID mismatch — force-evict the kernel slot for this - // peer. Forget-peer needs a node-id; prefer the kernel's - // observation, fall back to the dispatcher's. - nodeID := peer.NodeID - - if slot, ok := slots[peer.Name]; ok && slot.NodeID != 0 { - nodeID = slot.NodeID + if cleaned == nil { + cleaned = make(map[string]string) } - // Bug 342 v5: when neither the kernel-observed slot nor - // the K8s-allocated peer NodeID is available, DEFER - // eviction to a future reconcile. A del-peer without - // matching forget-peer drops the kernel connection but - // leaves stale per-volume GI/bitmap metadata; the - // subsequent new-peer handshake (after the relocated - // peer brings DRBD up with a fresh GI epoch) exposes - // the mismatch and the LOCAL stable peer regresses its - // own disk_state to Inconsistent / Outdated. Skipping - // this reconcile is safe: the next one (after kernel - // loads OR allocation lands) will see the same UID - // mismatch and try again with a resolvable node-id. - if nodeID == 0 { - logger.Info("UID mismatch detected but peer node-id unresolved — deferring eviction", - "peer", peer.Name, - "oldUID", last, - "newUID", peer.ResourceUID) + cleaned[peer.Name] = peer.ResourceUID + } - continue - } + return cleaned, nil +} + +// evictOnePeerByUIDMismatch performs the per-peer half of +// EvictPeersByUIDMismatch: classify whether the peer needs eviction, +// resolve the kernel-side node-id (preferring the kernel's +// observation), and run del-peer + per-volume forget-peer when so. +// +// Returns (true, nil) when this peer was evicted and the caller +// should record it in `cleaned`; (false, nil) when the peer was +// skipped (no UID known yet / no prior baseline / baseline matches +// / node-id unresolved); (false, err) on a fatal del-peer error. +// Pulled out of EvictPeersByUIDMismatch so the orchestrator stays +// under the gocyclo budget; the cascade is byte-identical. +func (r *Reconciler) evictOnePeerByUIDMismatch( + ctx context.Context, + logger logr.Logger, + rdName string, + peer intent.DesiredPeer, + appliedPeerUIDs map[string]string, + slots map[string]drbd.KernelSlot, + devices map[int32]string, +) (bool, error) { + if peer.ResourceUID == "" { + // Peer's UID not yet known to the dispatcher (informer + // cache trail / fresh CRD just hit apiserver). Skip — + // next reconcile will retry with a populated UID. + return false, nil + } - logger.Info("UID mismatch — evicting kernel slot for re-incarnated peer", + last, hasLast := appliedPeerUIDs[peer.Name] + if !hasLast || last == peer.ResourceUID { + // No prior baseline (rollout window / first apply) OR + // baseline matches — nothing to evict. + return false, nil + } + + // UID mismatch — force-evict the kernel slot for this peer. + // Forget-peer needs a node-id; prefer the kernel's observation, + // fall back to the dispatcher's. + nodeID := peer.NodeID + + if slot, ok := slots[peer.Name]; ok && slot.NodeID != 0 { + nodeID = slot.NodeID + } + + // Bug 342 v5: when neither the kernel-observed slot nor the + // K8s-allocated peer NodeID is available, DEFER eviction to a + // future reconcile. A del-peer without matching forget-peer + // drops the kernel connection but leaves stale per-volume + // GI/bitmap metadata; the subsequent new-peer handshake (after + // the relocated peer brings DRBD up with a fresh GI epoch) + // exposes the mismatch and the LOCAL stable peer regresses its + // own disk_state to Inconsistent / Outdated. Skipping this + // reconcile is safe: the next one (after kernel loads OR + // allocation lands) will see the same UID mismatch and try + // again with a resolvable node-id. + if nodeID == 0 { + logger.Info("UID mismatch detected but peer node-id unresolved — deferring eviction", "peer", peer.Name, "oldUID", last, - "newUID", peer.ResourceUID, - "nodeID", nodeID) + "newUID", peer.ResourceUID) - if err := r.cfg.Adm.DelPeer(ctx, rdName, peer.Name); err != nil { - return cleaned, errors.Wrapf(err, "drbdadm del-peer %s:%s", peer.Name, rdName) - } + return false, nil + } - // forget-peer is per-volume because v09 metadata lives - // in the per-volume block. Skip volumes without a device - // path (DISKLESS local replica — no metadata to clean). - // Skip when nodeID is zero (no resolvable id — leaves a - // slow-leak slot, recoverable later). - if nodeID != 0 { - for volNum, device := range devices { - if device == "" { - continue - } - - if forgetErr := r.cfg.Adm.ForgetPeer(ctx, rdName, volNum, device, nodeID); forgetErr != nil { - logger.Info("EvictPeersByUIDMismatch: forget-peer failed (non-fatal)", - "peer", peer.Name, - "vol", volNum, - "nodeID", nodeID, - "err", forgetErr.Error()) - } - } - } + logger.Info("UID mismatch — evicting kernel slot for re-incarnated peer", + "peer", peer.Name, + "oldUID", last, + "newUID", peer.ResourceUID, + "nodeID", nodeID) - if cleaned == nil { - cleaned = make(map[string]string) + err := r.cfg.Adm.DelPeer(ctx, rdName, peer.Name) + if err != nil { + return false, errors.Wrapf(err, "drbdadm del-peer %s:%s", peer.Name, rdName) + } + + // forget-peer is per-volume because v09 metadata lives in the + // per-volume block. Skip volumes without a device path + // (DISKLESS local replica — no metadata to clean). + for volNum, device := range devices { + if device == "" { + continue } - cleaned[peer.Name] = peer.ResourceUID + forgetErr := r.cfg.Adm.ForgetPeer(ctx, rdName, volNum, device, nodeID) + if forgetErr != nil { + logger.Info("EvictPeersByUIDMismatch: forget-peer failed (non-fatal)", + "peer", peer.Name, + "vol", volNum, + "nodeID", nodeID, + "err", forgetErr.Error()) + } } - return cleaned, nil + return true, nil } diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index 6928f352..9fea8904 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -21,6 +21,7 @@ import ( "context" "fmt" "io" + "maps" "os" "path/filepath" "slices" @@ -291,9 +292,7 @@ func (r *Reconciler) SnapshotProviders() map[string]storage.Provider { defer r.mu.Unlock() out := make(map[string]storage.Provider, len(r.cfg.Providers)) - for k, v := range r.cfg.Providers { - out[k] = v - } + maps.Copy(out, r.cfg.Providers) return out } @@ -406,7 +405,7 @@ func (r *Reconciler) CreateSnapshot(ctx context.Context, req *intent.CreateSnaps }) if err != nil { terminal := errors.Is(err, storage.ErrTerminal) || errors.Is(err, storage.ErrNotFound) - //nolint:nilerr // per-resource errors land in Ok=false; gRPC error reserved for transport faults + return &intent.CreateSnapshotResponse{Ok: false, Terminal: terminal, Message: err.Error()}, nil } @@ -818,11 +817,12 @@ func (r *Reconciler) applyStorageIfDiskful(ctx context.Context, dr *intent.Desir // detachIfStillAttached is a no-op when the kernel has already // dropped to Diskless on its own (idempotent re-entry on a // satellite restart mid-toggle). - if err := r.detachIfStillAttached(ctx, dr); err != nil { + err := r.detachIfStillAttached(ctx, dr) + if err != nil { return nil, false, false, err } - err := r.reclaimVolumesForDiskless(ctx, dr) + err = r.reclaimVolumesForDiskless(ctx, dr) if err != nil { return nil, false, false, err } @@ -883,7 +883,8 @@ func (r *Reconciler) detachIfStillAttached(ctx context.Context, dr *intent.Desir return nil } - if detachErr := r.cfg.Adm.Detach(ctx, dr.GetName()); detachErr != nil { + detachErr := r.cfg.Adm.Detach(ctx, dr.GetName()) + if detachErr != nil { return errors.Wrapf(detachErr, "detach %s on diskless toggle", dr.GetName()) } @@ -1335,8 +1336,8 @@ func extractResFilePeerNodeIDs(resPath string) map[string]int32 { // Block opener: `on {`. Stash the name; the // matching `node-id` line follows within the block. - if strings.HasPrefix(trimmed, "on ") { - rest := strings.TrimPrefix(trimmed, "on ") + if after, ok := strings.CutPrefix(trimmed, "on "); ok { + rest := after head, _, ok := strings.Cut(rest, "{") if !ok { @@ -1384,15 +1385,19 @@ func extractResFilePeerNodeIDs(resPath string) map[string]int32 { // LVM-shaped guess. func (r *Reconciler) renderResFile(ctx context.Context, dr *intent.DesiredResource, devices map[int32]string) error { _ = ctx + body, err := buildResFile(dr, r.cfg.NodeName, r.cfg.LocalAddress, devices) if err != nil { return errors.Wrapf(err, "build .res for %s", dr.GetName()) } + resPath := filepath.Join(r.cfg.StateDir, dr.GetName()+".res") + current, _ := os.ReadFile(resPath) if bytes.Equal(current, []byte(body)) { return nil } + return errors.Wrapf(os.WriteFile(resPath, []byte(body), resFilePerm), "write %s", resPath) } @@ -1490,11 +1495,14 @@ func (r *Reconciler) applyDRBD(ctx context.Context, dr *intent.DesiredResource, // handle the cold-start PhaseUnprovisioned case. { obs := r.observeForFsm(ctx, dr, diskless) + phase := ObservePhase(obs) if next := NextTransition(phase, obs); next != nil { - if err := r.dispatchFsmAction(ctx, dr, devices, next.Action, obs); err != nil { + err := r.dispatchFsmAction(ctx, dr, devices, next.Action, obs) + if err != nil { return errors.Wrapf(err, "fsm dispatch %s", next.Action) } + fsmShadowAgreeCount.Add(next.Action+":fsm-dispatched", 1) } } @@ -1850,6 +1858,7 @@ func (r *Reconciler) ensureMetadata(ctx context.Context, dr *intent.DesiredResou // Stage 1 (#489). Best-effort tolerated (file marker is the // source of truth) so no functional regression, just noise. resourceCRDName := dr.GetName() + "." + dr.GetNodeName() + stampErr := r.cfg.MetadataCreatedStamper.StampMetadataCreated(ctx, resourceCRDName) if stampErr != nil { log.FromContext(ctx).Error(stampErr, "stamp MetadataCreated Condition; will retry next reconcile", @@ -2179,6 +2188,7 @@ func (r *Reconciler) runAutoMkfs(ctx context.Context, dr *intent.DesiredResource // Best-effort tolerated (file marker is the source of truth) // so no functional regression on a transient apiserver hiccup. resourceCRDName := dr.GetName() + "." + dr.GetNodeName() + stampErr := r.cfg.FilesystemFormattedStamper.StampFilesystemFormatted(ctx, resourceCRDName) if stampErr != nil { log.FromContext(ctx).Error(stampErr, "stamp FilesystemFormatted Condition; will retry next reconcile", @@ -2315,7 +2325,8 @@ func (r *Reconciler) runApplyDRBDVerb(ctx context.Context, dr *intent.DesiredRes // (Stage 2), createMetadata (Stage 3a), and adjustResource (Stage // 3b) shadows. func (r *Reconciler) bringUpResource(ctx context.Context, dr *intent.DesiredResource) error { - if err := r.cfg.Adm.Up(ctx, dr.GetName()); err != nil { + err := r.cfg.Adm.Up(ctx, dr.GetName()) + if err != nil { return errors.Wrapf(err, "drbdadm up %s", dr.GetName()) } diff --git a/pkg/satellite/reconciler_drbd_test.go b/pkg/satellite/reconciler_drbd_test.go index c5d9168a..1d31e9c3 100644 --- a/pkg/satellite/reconciler_drbd_test.go +++ b/pkg/satellite/reconciler_drbd_test.go @@ -2142,7 +2142,7 @@ func TestApplyFirstActivationSeedsGiBeforeAdjust(t *testing.T) { { Name: "pvc-seed", NodeName: "n1", - Peers: []intent.DesiredPeer{{Name: "n2"}}, + Peers: []intent.DesiredPeer{{Name: "n2"}}, Volumes: []*intent.DesiredVolume{ { VolumeNumber: 0, @@ -2319,7 +2319,7 @@ func TestApplyFirstActivationDiskReplaceInternalMetadata(t *testing.T) { // order is a regression of the upstream recipe — and adjust before // create-md would fail in production with "No valid meta data // found" on the freshly-allocated lower disk. - if !(lvcreate < createMD && createMD < adjust) { + if lvcreate >= createMD || createMD >= adjust { t.Errorf("ordering: lvcreate@%d → create-md@%d → adjust@%d (want strictly ascending); calls=%v", lvcreate, createMD, adjust, calls) } @@ -2578,7 +2578,7 @@ func TestApplyFirstActivationSkipsInitialSyncOnThinOrZFS(t *testing.T) { { Name: "pvc-zskip", NodeName: "n1", - Peers: []intent.DesiredPeer{{Name: "n2"}}, + Peers: []intent.DesiredPeer{{Name: "n2"}}, Volumes: []*intent.DesiredVolume{ {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: tc.poolName}, }, @@ -2679,7 +2679,7 @@ func TestApplyFirstActivationSeedsEveryPeerSlotConsistently(t *testing.T) { { Name: "pvc-race", NodeName: "n1", - Peers: []intent.DesiredPeer{{Name: "n2"}, {Name: "n3"}}, + Peers: []intent.DesiredPeer{{Name: "n2"}, {Name: "n3"}}, Volumes: []*intent.DesiredVolume{ {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, }, @@ -4519,7 +4519,7 @@ func TestTemporarySecondaryFailureAutoRecovers(t *testing.T) { // forbidden-verb checks below can exclude pass 1's expected // `drbdadm create-md` (one-shot metadata write — fine on first // activation, forbidden anywhere else). - steadyCmds := []string{} + steadyCmds := make([]string, 0, len(fx.CommandLines())*len(cases)) for i, tc := range cases { fx.Reset() diff --git a/pkg/satellite/reconciler_internal_test.go b/pkg/satellite/reconciler_internal_test.go index 481cdc70..7e32c493 100644 --- a/pkg/satellite/reconciler_internal_test.go +++ b/pkg/satellite/reconciler_internal_test.go @@ -59,15 +59,18 @@ var ( // drbdadm's `'' not defined in your config` — the .res-less // failure mode; distinct from 158, must NOT fire up. + //nolint:revive,staticcheck // verbatim drbdadm stderr; capitalization + trailing punct required errFixtNotDefined = errors.New("'pvc-x' not defined in your config (for this host).") // "no resources defined!" from drbdadm when /etc/drbd.d is // empty — also NOT 158. + //nolint:revive,staticcheck // verbatim drbdadm stderr errFixtNoResources = errors.New("no resources defined!") // errno other than 158 paired with "Unknown resource" — e.g. // drbdadm-9's (10) Unknown resource for a different failure // mode. The anchored regex MUST reject this. + //nolint:staticcheck // verbatim drbdadm stderr; capitalization required errFixtErrno10 = errors.New("Failure: (10) Unknown resource: exit status 1") ) diff --git a/pkg/satellite/reconciler_locale_test.go b/pkg/satellite/reconciler_locale_test.go index 3f1cc412..0866e342 100644 --- a/pkg/satellite/reconciler_locale_test.go +++ b/pkg/satellite/reconciler_locale_test.go @@ -75,6 +75,7 @@ func TestApplyLUKSOpenAlreadyExistsNonEnglishLocale(t *testing.T) { fx.Expect( "cryptsetup luksOpen /dev/vg/pvc-luks-de_00000 pvc-luks-de-0-luks --key-file -", storage.FakeResponse{ + //nolint:revive,err113 // verbatim de_DE cryptsetup stderr; locale fidelity required Err: errors.New("Gerät pvc-luks-de-0-luks existiert bereits."), }) diff --git a/pkg/satellite/ship_dispatch_test.go b/pkg/satellite/ship_dispatch_test.go index be9de06d..eff3e9b4 100644 --- a/pkg/satellite/ship_dispatch_test.go +++ b/pkg/satellite/ship_dispatch_test.go @@ -217,6 +217,7 @@ func (*fakeNonShipperProvider) VolumeStatus(_ context.Context, vol storage.Volum func (*fakeNonShipperProvider) CreateSnapshot(_ context.Context, _ storage.Snapshot) error { return nil } + func (*fakeNonShipperProvider) DeleteSnapshot(_ context.Context, _ storage.Snapshot) error { return nil } diff --git a/pkg/storage/lvm/lvm_common.go b/pkg/storage/lvm/lvm_common.go index 4e34a217..fe80dfef 100644 --- a/pkg/storage/lvm/lvm_common.go +++ b/pkg/storage/lvm/lvm_common.go @@ -143,7 +143,7 @@ func lvHasRestoreIncompleteTag(ctx context.Context, ex storage.Exec, vg, lvName return false } - for _, tag := range strings.Split(strings.TrimSpace(string(out)), ",") { + for tag := range strings.SplitSeq(strings.TrimSpace(string(out)), ",") { if strings.TrimSpace(tag) == RestoreIncompleteTag { return true } diff --git a/pkg/storage/lvm/lvm_thin.go b/pkg/storage/lvm/lvm_thin.go index 2ca650ca..d1ed0dd7 100644 --- a/pkg/storage/lvm/lvm_thin.go +++ b/pkg/storage/lvm/lvm_thin.go @@ -455,7 +455,7 @@ func listLVMVolumes(ctx context.Context, ex storage.Exec, vg string) ([]storage. refs := make([]storage.VolumeRef, 0) - for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + for line := range strings.SplitSeq(strings.TrimSpace(string(out)), "\n") { line = strings.TrimSpace(line) if line == "" { continue @@ -474,7 +474,7 @@ func listLVMVolumes(ctx context.Context, ex storage.Exec, vg string) ([]storage. // same VG; we don't manage them through DeleteVolume // so the sweeper skips them. Volume LVs have type '-' // (thick) or 'V' (thin virtual). - if len(attr) == 0 { + if attr == "" { continue } @@ -516,7 +516,8 @@ func parseLVName(name string) (string, int32, bool) { return "", 0, false } - return name[:idx], int32(n), true + // suffix is `lvNumberDigits` digits (<=99999), so int->int32 is lossless. + return name[:idx], int32(n), true //nolint:gosec // bounded by lvNumberDigits=5 } // lvNumberDigits is the fixed-width volume-number suffix length — diff --git a/pkg/storage/zfs/zfs.go b/pkg/storage/zfs/zfs.go index c9999aa8..b9aa83a8 100644 --- a/pkg/storage/zfs/zfs.go +++ b/pkg/storage/zfs/zfs.go @@ -183,7 +183,7 @@ func (p *Provider) VolumeStatus(ctx context.Context, vol storage.Volume) (storag "-o", "name,volsize,used", p.volumeDataset(vol)) if err != nil { - return storage.VolumeStatus{State: stateNotProvisioned}, nil //nolint:nilerr // missing == not provisioned + return storage.VolumeStatus{State: stateNotProvisioned}, nil } line := strings.TrimSpace(string(out)) @@ -572,7 +572,8 @@ func (p *Provider) ListVolumeNames(ctx context.Context) ([]storage.VolumeRef, er refs := make([]storage.VolumeRef, 0) prefix := p.cfg.Pool + "/" - for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + + for line := range strings.SplitSeq(strings.TrimSpace(string(out)), "\n") { line = strings.TrimSpace(line) if line == "" { continue @@ -628,7 +629,8 @@ func parseVolumeName(name string) (string, int32, bool) { return "", 0, false } - return name[:idx], int32(n), true + // suffix is `volNumberDigits` digits (<=99999), so int->int32 is lossless. + return name[:idx], int32(n), true //nolint:gosec // bounded by volNumberDigits=5 } // volNumberDigits is the fixed-width digit count blockstor uses for diff --git a/pkg/store/inmemory.go b/pkg/store/inmemory.go index 6844d285..606d15d0 100644 --- a/pkg/store/inmemory.go +++ b/pkg/store/inmemory.go @@ -263,7 +263,8 @@ func (s *inMemoryNodes) PatchProps(_ context.Context, name string, mutate func(m node.Props = map[string]string{} } - if err := mutate(node.Props); err != nil { + err := mutate(node.Props) + if err != nil { return errors.Wrapf(err, "patch Props of node %q", name) } @@ -289,7 +290,8 @@ func (s *inMemoryNodes) PatchNodeSpec(_ context.Context, name string, mutate fun return errors.Wrapf(ErrNotFound, "node %q", name) } - if err := mutate(&node); err != nil { + err := mutate(&node) + if err != nil { return errors.Wrapf(err, "patch Node %q", name) } diff --git a/pkg/store/inmemory_resource.go b/pkg/store/inmemory_resource.go index 7602f9fd..5ebf3abc 100644 --- a/pkg/store/inmemory_resource.go +++ b/pkg/store/inmemory_resource.go @@ -136,18 +136,19 @@ func (s *inMemoryResources) PatchResourceSpec(_ context.Context, rdName, node st s.mu.Lock() defer s.mu.Unlock() - k := rKey{rdName, node} + key := rKey{rdName, node} - r, ok := s.m[k] + r, ok := s.m[key] if !ok { return errors.Wrapf(ErrNotFound, "resource %q on node %q", rdName, node) } - if err := mutate(&r); err != nil { + err := mutate(&r) + if err != nil { return errors.Wrapf(err, "patch resource %q on node %q", rdName, node) } - s.m[k] = r + s.m[key] = r return nil } @@ -156,12 +157,12 @@ func (s *inMemoryResources) Delete(_ context.Context, rdName, node string) error s.mu.Lock() defer s.mu.Unlock() - k := rKey{rdName, node} - if _, exists := s.m[k]; !exists { + key := rKey{rdName, node} + if _, exists := s.m[key]; !exists { return errors.Wrapf(ErrNotFound, "resource %q on node %q", rdName, node) } - delete(s.m, k) + delete(s.m, key) return nil } diff --git a/pkg/store/inmemory_resource_definition.go b/pkg/store/inmemory_resource_definition.go index 2f54314c..7830b9a6 100644 --- a/pkg/store/inmemory_resource_definition.go +++ b/pkg/store/inmemory_resource_definition.go @@ -108,7 +108,8 @@ func (s *inMemoryResourceDefinitions) PatchResourceDefinitionSpec(_ context.Cont return errors.Wrapf(ErrNotFound, "resource definition %q", name) } - if err := mutate(&rd); err != nil { + err := mutate(&rd) + if err != nil { return errors.Wrapf(err, "patch resource definition %q", name) } diff --git a/pkg/store/inmemory_resource_group.go b/pkg/store/inmemory_resource_group.go index 710b77c6..a1cf797b 100644 --- a/pkg/store/inmemory_resource_group.go +++ b/pkg/store/inmemory_resource_group.go @@ -108,7 +108,8 @@ func (s *inMemoryResourceGroups) PatchResourceGroup(_ context.Context, name stri return errors.Wrapf(ErrNotFound, "resource group %q", name) } - if err := mutate(&rg); err != nil { + err := mutate(&rg) + if err != nil { return errors.Wrapf(err, "patch ResourceGroup %q", name) } diff --git a/pkg/store/inmemory_storage_pool.go b/pkg/store/inmemory_storage_pool.go index 4cad2eab..1329a59b 100644 --- a/pkg/store/inmemory_storage_pool.go +++ b/pkg/store/inmemory_storage_pool.go @@ -235,18 +235,19 @@ func (s *inMemoryStoragePools) PatchStoragePoolSpec(_ context.Context, node, poo s.mu.Lock() defer s.mu.Unlock() - k := spKey{node, pool} + key := spKey{node, pool} - sp, ok := s.m[k] + sp, ok := s.m[key] if !ok { return errors.Wrapf(ErrNotFound, "storage pool %q on node %q", pool, node) } - if err := mutate(&sp); err != nil { + err := mutate(&sp) + if err != nil { return errors.Wrapf(err, "patch storage pool %q on node %q", pool, node) } - s.m[k] = sp + s.m[key] = sp return nil } diff --git a/pkg/store/inmemory_volume_definition.go b/pkg/store/inmemory_volume_definition.go index b9fd7bd5..c0e30653 100644 --- a/pkg/store/inmemory_volume_definition.go +++ b/pkg/store/inmemory_volume_definition.go @@ -114,18 +114,19 @@ func (s *inMemoryVolumeDefinitions) PatchVolumeDefinitionSpec(_ context.Context, s.mu.Lock() defer s.mu.Unlock() - k := vdKey{rdName, volumeNumber} + key := vdKey{rdName, volumeNumber} - vd, ok := s.m[k] + vd, ok := s.m[key] if !ok { return errors.Wrapf(ErrNotFound, "volume %d on resource definition %q", volumeNumber, rdName) } - if err := mutate(&vd); err != nil { + err := mutate(&vd) + if err != nil { return errors.Wrapf(err, "patch volume %d on resource definition %q", volumeNumber, rdName) } - s.m[k] = vd + s.m[key] = vd return nil } diff --git a/pkg/store/k8s/patch_bug_204a_test.go b/pkg/store/k8s/patch_bug_204a_test.go index e116ad58..21c3a7b7 100644 --- a/pkg/store/k8s/patch_bug_204a_test.go +++ b/pkg/store/k8s/patch_bug_204a_test.go @@ -191,12 +191,19 @@ func TestBug204aExtraPropsAutoCreatesSingleton(t *testing.T) { } // TestBug204aLostUpdateOnVanillaUpdate is the regression-witness: -// it bursts the same 100 disjoint ExtraProps writes using the OLD -// REST-handler-style `Get -> mutate -> Update` pattern (no Patch -// helper) and shows that AT LEAST ONE mutation is silently lost. -// Mirrors TestBug201LostUpdateOnVanillaUpdate. If this PASSES then -// the fixture doesn't exercise the race — re-check the seed before -// proceeding. +// it bursts disjoint ExtraProps writes using the OLD REST-handler-style +// `Get -> mutate -> Update` pattern (no Patch helper) and shows that +// AT LEAST ONE mutation is silently lost. Mirrors +// TestBug201LostUpdateOnVanillaUpdate. +// +// Flake control: on a fast scheduler (24-vCPU Oracle runner) 50 +// goroutines can serialise through the in-memory fakeClient before +// any of them race, producing lost=0 and a false-positive test +// failure. Retry the burst up to maxAttempts and accept the witness +// as long as at least one attempt observes the race. If ALL attempts +// come back clean the underlying fakeClient really doesn't model the +// race and the test fails — that's a real fixture regression, not +// scheduler luck. func TestBug204aLostUpdateOnVanillaUpdate(t *testing.T) { t.Parallel() @@ -204,60 +211,70 @@ func TestBug204aLostUpdateOnVanillaUpdate(t *testing.T) { t.Skip("witness for Bug 204a — runs in long mode only") } - const burst = 50 + const ( + burst = 50 + maxAttempts = 10 + ) - cli := bug204aNewFakeClient(t) - seedControllerConfig(t, cli) + for attempt := 1; attempt <= maxAttempts; attempt++ { + cli := bug204aNewFakeClient(t) + seedControllerConfig(t, cli) - var wg sync.WaitGroup + var wg sync.WaitGroup - for i := range burst { - wg.Add(1) + for i := range burst { + wg.Add(1) - go func(idx int) { - defer wg.Done() + go func(idx int) { + defer wg.Done() - // Old REST-handler pattern: Get -> mutate -> Update. - var cc crdv1alpha1.ControllerConfig - if err := cli.Get(t.Context(), - ctrlclient.ObjectKey{Name: crdv1alpha1.ControllerConfigName}, - &cc, - ); err != nil { - return - } + // Old REST-handler pattern: Get -> mutate -> Update. + var cc crdv1alpha1.ControllerConfig + if err := cli.Get(t.Context(), + ctrlclient.ObjectKey{Name: crdv1alpha1.ControllerConfigName}, + &cc, + ); err != nil { + return + } - if cc.Spec.ExtraProps == nil { - cc.Spec.ExtraProps = map[string]string{} - } + if cc.Spec.ExtraProps == nil { + cc.Spec.ExtraProps = map[string]string{} + } - cc.Spec.ExtraProps[fmt.Sprintf("k-%d", idx)] = fmt.Sprintf("v-%d", idx) + cc.Spec.ExtraProps[fmt.Sprintf("k-%d", idx)] = fmt.Sprintf("v-%d", idx) - _ = cli.Update(t.Context(), &cc) - }(i) - } + _ = cli.Update(t.Context(), &cc) + }(i) + } - wg.Wait() + wg.Wait() - var got crdv1alpha1.ControllerConfig - if err := cli.Get(t.Context(), - ctrlclient.ObjectKey{Name: crdv1alpha1.ControllerConfigName}, - &got, - ); err != nil { - t.Fatalf("final Get: %v", err) - } + var got crdv1alpha1.ControllerConfig + if err := cli.Get(t.Context(), + ctrlclient.ObjectKey{Name: crdv1alpha1.ControllerConfigName}, + &got, + ); err != nil { + t.Fatalf("attempt %d: final Get: %v", attempt, err) + } - lost := 0 - for i := range burst { - if _, present := got.Spec.ExtraProps[fmt.Sprintf("k-%d", i)]; !present { - lost++ + lost := 0 + for i := range burst { + if _, present := got.Spec.ExtraProps[fmt.Sprintf("k-%d", i)]; !present { + lost++ + } } - } - if lost == 0 { - t.Errorf("expected lost-update under burst, got 0 — fixture may not exercise the race") - } else { - t.Logf("Bug 204a witness: lost %d/%d ExtraProps writes under wholesale Update", lost, burst) + if lost > 0 { + t.Logf("Bug 204a witness (attempt %d/%d): lost %d/%d ExtraProps writes under wholesale Update", + attempt, maxAttempts, lost, burst) + + return + } + + t.Logf("attempt %d/%d: scheduler serialised the burst (lost=0), retrying", attempt, maxAttempts) } + + t.Errorf("expected lost-update under burst after %d attempts, got 0 every time — fixture may not exercise the race", maxAttempts) } // TestBug204aExtraPropsModifyAndDeleteConverges pairs disjoint diff --git a/pkg/store/k8s/snapshots.go b/pkg/store/k8s/snapshots.go index fdce7844..d30bca85 100644 --- a/pkg/store/k8s/snapshots.go +++ b/pkg/store/k8s/snapshots.go @@ -118,57 +118,6 @@ func (s *snapshots) Get(ctx context.Context, rdName, snapName string) (apiv1.Sna return crdToWireSnapshot(&crd, parent), nil } -// getParentRD fetches the parent ResourceDefinition for a Snapshot, -// returning nil + the not-found error when the parent has already -// been deleted (orphan Snapshot — the view layer still has to render -// the snapshot row without panicking on nil-deref). -func (s *snapshots) getParentRD(ctx context.Context, rdName string) (*crdv1alpha1.ResourceDefinition, error) { - if rdName == "" { - return nil, nil //nolint:nilnil // empty name == no parent to look up - } - - var rd crdv1alpha1.ResourceDefinition - - err := s.c.Get(ctx, types.NamespacedName{Name: Name(rdName)}, &rd) - if err != nil { - if apierrors.IsNotFound(err) { - return nil, nil //nolint:nilnil // orphan snapshot — no parent - } - - return nil, errors.Wrapf(err, "get parent RD %q for Snapshot", rdName) - } - - return &rd, nil -} - -// collectParentRDs batches the parent-RD lookups for a List call so -// the view of 50 snapshots-on-the-same-RD doesn't trigger 50 separate -// API server GETs. Missing parents (orphan Snapshots) silently -// produce a nil entry in the map — the wire-shape conversion folds -// nil into empty maps. -func (s *snapshots) collectParentRDs( - ctx context.Context, snaps []crdv1alpha1.Snapshot, -) (map[string]*crdv1alpha1.ResourceDefinition, error) { - out := make(map[string]*crdv1alpha1.ResourceDefinition, len(snaps)) - - for i := range snaps { - rdName := snaps[i].Spec.ResourceDefinitionName - - if _, seen := out[rdName]; seen { - continue - } - - rd, err := s.getParentRD(ctx, rdName) - if err != nil { - return nil, err - } - - out[rdName] = rd - } - - return out, nil -} - func (s *snapshots) Create(ctx context.Context, in *apiv1.Snapshot) error { if in == nil { return errors.New("nil Snapshot") @@ -257,6 +206,57 @@ func (s *snapshots) Delete(ctx context.Context, rdName, snapName string) error { return nil } +// getParentRD fetches the parent ResourceDefinition for a Snapshot, +// returning nil + the not-found error when the parent has already +// been deleted (orphan Snapshot — the view layer still has to render +// the snapshot row without panicking on nil-deref). +func (s *snapshots) getParentRD(ctx context.Context, rdName string) (*crdv1alpha1.ResourceDefinition, error) { + if rdName == "" { + return nil, nil //nolint:nilnil // empty name == no parent to look up + } + + var rd crdv1alpha1.ResourceDefinition + + err := s.c.Get(ctx, types.NamespacedName{Name: Name(rdName)}, &rd) + if err != nil { + if apierrors.IsNotFound(err) { + return nil, nil //nolint:nilnil // orphan snapshot — no parent + } + + return nil, errors.Wrapf(err, "get parent RD %q for Snapshot", rdName) + } + + return &rd, nil +} + +// collectParentRDs batches the parent-RD lookups for a List call so +// the view of 50 snapshots-on-the-same-RD doesn't trigger 50 separate +// API server GETs. Missing parents (orphan Snapshots) silently +// produce a nil entry in the map — the wire-shape conversion folds +// nil into empty maps. +func (s *snapshots) collectParentRDs( + ctx context.Context, snaps []crdv1alpha1.Snapshot, +) (map[string]*crdv1alpha1.ResourceDefinition, error) { + out := make(map[string]*crdv1alpha1.ResourceDefinition, len(snaps)) + + for i := range snaps { + rdName := snaps[i].Spec.ResourceDefinitionName + + if _, seen := out[rdName]; seen { + continue + } + + rd, err := s.getParentRD(ctx, rdName) + if err != nil { + return nil, err + } + + out[rdName] = rd + } + + return out, nil +} + // crdToWireSnapshot converts the Snapshot CRD into the wire DTO. // `parent` is the parent ResourceDefinition CRD at conversion time — // nil when the parent has been deleted (orphan Snapshot). The parent @@ -281,14 +281,7 @@ func crdToWireSnapshot( if parent != nil { out.ResourceDefinitionProps = parent.Spec.Props - - vdPropsByNumber = make(map[int32]map[string]string, len(parent.Spec.VolumeDefinitions)) - for i := range parent.Spec.VolumeDefinitions { - vd := &parent.Spec.VolumeDefinitions[i] - if len(vd.Props) > 0 { - vdPropsByNumber[vd.VolumeNumber] = vd.Props - } - } + vdPropsByNumber = vdPropsMap(parent) } // Surface Status.Flags on the wire so the Python CLI's @@ -302,49 +295,81 @@ func crdToWireSnapshot( } if len(crd.Spec.VolumeDefinitions) > 0 { - out.VolumeDefinitions = make([]apiv1.SnapshotVolumeDef, 0, len(crd.Spec.VolumeDefinitions)) - for i := range crd.Spec.VolumeDefinitions { - out.VolumeDefinitions = append(out.VolumeDefinitions, apiv1.SnapshotVolumeDef{ - VolumeNumber: crd.Spec.VolumeDefinitions[i].VolumeNumber, - SizeKib: crd.Spec.VolumeDefinitions[i].SizeKib, - VolumeDefinitionProps: vdPropsByNumber[crd.Spec.VolumeDefinitions[i].VolumeNumber], - }) + out.VolumeDefinitions = wireVolumeDefinitions(crd.Spec.VolumeDefinitions, vdPropsByNumber) + } + + out.Snapshots = wirePerNodeSnapshots(crd) + + return out +} + +// vdPropsMap indexes the parent RD's per-volume Props by volume number +// so crdToWireSnapshot can attach them to each SnapshotVolumeDef. +func vdPropsMap(parent *crdv1alpha1.ResourceDefinition) map[int32]map[string]string { + out := make(map[int32]map[string]string, len(parent.Spec.VolumeDefinitions)) + + for i := range parent.Spec.VolumeDefinitions { + vd := &parent.Spec.VolumeDefinitions[i] + if len(vd.Props) > 0 { + out[vd.VolumeNumber] = vd.Props } } + return out +} + +// wireVolumeDefinitions builds the on-wire snapshot volume-definition +// list, attaching the parent RD's per-volume Props. +func wireVolumeDefinitions( + vds []crdv1alpha1.SnapshotVolumeRef, + vdPropsByNumber map[int32]map[string]string, +) []apiv1.SnapshotVolumeDef { + out := make([]apiv1.SnapshotVolumeDef, 0, len(vds)) + + for i := range vds { + out = append(out, apiv1.SnapshotVolumeDef{ + VolumeNumber: vds[i].VolumeNumber, + SizeKib: vds[i].SizeKib, + VolumeDefinitionProps: vdPropsByNumber[vds[i].VolumeNumber], + }) + } + + return out +} + +// wirePerNodeSnapshots renders the per-node `snapshots[]` payload. +// Prefers satellite-reported Status.NodeStatus when present; otherwise +// synthesises one entry per Spec.Nodes target so the wire shape stays +// non-empty between CreateSnapshot and the first satellite ack +// (linstor-csi's ListSnapshots hard-fails on an empty list). +func wirePerNodeSnapshots(crd *crdv1alpha1.Snapshot) []apiv1.SnapshotPerNode { switch { case len(crd.Status.NodeStatus) > 0: - out.Snapshots = make([]apiv1.SnapshotPerNode, 0, len(crd.Status.NodeStatus)) + out := make([]apiv1.SnapshotPerNode, 0, len(crd.Status.NodeStatus)) for i := range crd.Status.NodeStatus { - out.Snapshots = append(out.Snapshots, apiv1.SnapshotPerNode{ + out = append(out, apiv1.SnapshotPerNode{ SnapshotName: crd.Spec.SnapshotName, NodeName: crd.Status.NodeStatus[i].NodeName, CreateTimestamp: crd.Status.NodeStatus[i].CreateTimestamp, SnapshotVolumes: snapshotVolumesFromVDs(crd.Spec.VolumeDefinitions), }) } + + return out case len(crd.Spec.Nodes) > 0: - // Status.NodeStatus is satellite-reported and lands after the - // satellite reconciler picks up the new Snapshot CRD. The - // REST shim's view of "where the snapshot landed" needs to - // be visible immediately after CreateSnapshot — linstor-csi - // hard-fails ListSnapshots with "missing snapshots" when - // the per-node Snapshots[] is empty. Synthesise one - // SnapshotPerNode entry per Spec.Nodes target so the wire - // shape matches upstream LINSTOR's "all replicas have a - // SnapshotNode entry once the controller commits the - // definition" semantic. - out.Snapshots = make([]apiv1.SnapshotPerNode, 0, len(crd.Spec.Nodes)) + out := make([]apiv1.SnapshotPerNode, 0, len(crd.Spec.Nodes)) for _, node := range crd.Spec.Nodes { - out.Snapshots = append(out.Snapshots, apiv1.SnapshotPerNode{ + out = append(out, apiv1.SnapshotPerNode{ SnapshotName: crd.Spec.SnapshotName, NodeName: node, SnapshotVolumes: snapshotVolumesFromVDs(crd.Spec.VolumeDefinitions), }) } + + return out } - return out + return nil } // snapshotVolumesFromVDs derives the per-node `snapshot_volumes[]` diff --git a/pkg/store/k8s/storage_pools.go b/pkg/store/k8s/storage_pools.go index e2926c4f..40c9bdd2 100644 --- a/pkg/store/k8s/storage_pools.go +++ b/pkg/store/k8s/storage_pools.go @@ -32,6 +32,13 @@ import ( "github.com/cozystack/blockstor/pkg/store" ) +// Upstream LINSTOR's StoragePool.State enumerates "Ok" / "Faulty" / "Error"; +// blockstor only emits the first two today (PoolMissing → Faulty, else Ok). +const ( + storagePoolStateOk = "Ok" + storagePoolStateFaulty = "Faulty" +) + // storagePools implements store.StoragePoolStore against the StoragePool CRD. type storagePools struct { c ctrlclient.Client @@ -302,12 +309,12 @@ func crdToWireStoragePool(crd *crdv1alpha1.StoragePool) apiv1.StoragePool { // pool still rendered as Ok in `linstor sp l`. Synthesise an // ERROR-severity reports[] entry whenever PoolMissing=true so the // CLI sees the real state. Bug 83. - state := "Ok" + state := storagePoolStateOk var reports []apiv1.APICallRc if crd.Status.PoolMissing { - state = "Faulty" + state = storagePoolStateFaulty reports = []apiv1.APICallRc{poolMissingReport(crd.Spec.NodeName, poolName)} } diff --git a/pkg/uevent/netlink.go b/pkg/uevent/netlink.go index f0fa79b5..05ede36b 100644 --- a/pkg/uevent/netlink.go +++ b/pkg/uevent/netlink.go @@ -94,23 +94,23 @@ type Listener struct { // typically exec but `pkg/storage` shells out to lsblk / pvs / // zpool / drbdmeta on every scan. func New(ctx context.Context) (*Listener, error) { - fd, err := unix.Socket(unix.AF_NETLINK, unix.SOCK_RAW|unix.SOCK_CLOEXEC, unix.NETLINK_KOBJECT_UEVENT) + sockFD, err := unix.Socket(unix.AF_NETLINK, unix.SOCK_RAW|unix.SOCK_CLOEXEC, unix.NETLINK_KOBJECT_UEVENT) if err != nil { return nil, errors.Wrap(err, "socket(AF_NETLINK, NETLINK_KOBJECT_UEVENT)") } - err = unix.Bind(fd, &unix.SockaddrNetlink{ + err = unix.Bind(sockFD, &unix.SockaddrNetlink{ Family: unix.AF_NETLINK, Groups: netlinkGroupKernel, }) if err != nil { - _ = unix.Close(fd) + _ = unix.Close(sockFD) return nil, errors.Wrap(err, "bind netlink uevent socket") } listener := &Listener{ - fd: fd, + fd: sockFD, events: make(chan Event, eventBufferSize), } @@ -121,7 +121,7 @@ func New(ctx context.Context) (*Listener, error) { // "the listener is up but no events are flowing", and we // burned hours diagnosing exactly that. log.FromContext(ctx).Info("uevent listener started", - "fd", fd, + "fd", sockFD, "group", netlinkGroupKernel, "buffer", eventBufferSize) @@ -156,6 +156,7 @@ func (l *Listener) run(ctx context.Context) { // loop falls through to the ctx.Err() check. go func() { <-ctx.Done() + _ = unix.Close(l.fd) }() diff --git a/pkg/version/bug_238_test.go b/pkg/version/bug_238_test.go index 0b6d8c47..2d30099e 100644 --- a/pkg/version/bug_238_test.go +++ b/pkg/version/bug_238_test.go @@ -17,6 +17,7 @@ limitations under the License. package version import ( + "fmt" "os" "regexp" "runtime" @@ -102,7 +103,7 @@ func readVendoredOpenAPIVersion() (string, error) { content, err := os.ReadFile(specPath) if err != nil { - return "", err + return "", fmt.Errorf("read %s: %w", specPath, err) } // `info:` block — match ` version: ` immediately after @@ -155,7 +156,7 @@ func readVersionSource() (string, error) { content, err := os.ReadFile(versionFile) if err != nil { - return "", err + return "", fmt.Errorf("read %s: %w", versionFile, err) } return string(content), nil diff --git a/pkg/version/version.go b/pkg/version/version.go index 1c128512..6c81e00f 100644 --- a/pkg/version/version.go +++ b/pkg/version/version.go @@ -19,9 +19,9 @@ limitations under the License. // /v1/controller/version so that golinstor clients can negotiate. package version -// Build-time identity of this binary. Project is a const because the -// binary's own name never changes; Version / GitCommit are vars so the -// container build can stamp them via `-ldflags -X` (Bug 169). +// Project names the binary. It is a const because the binary's own name +// never changes; Version / GitCommit are vars so the container build can +// stamp them via `-ldflags -X` (Bug 169). const Project = "blockstor" // Version / GitCommit identify the blockstor build. Defaults are @@ -29,6 +29,8 @@ const Project = "blockstor" // them via `go build -ldflags "-X .../version.Version= -X // .../version.GitCommit="`. `-ldflags -X` only works against // package-level string vars, never consts. +// +//nolint:gochecknoglobals // ldflags-injected build identity, see comment above var ( Version = "0.0.0-dev" GitCommit = "unknown" @@ -87,6 +89,8 @@ const ( // rewrites via `-ldflags -X .../version.LinstorBuildTime=$(date -u // +%FT%TZ)`. Parses cleanly as time.RFC3339 so contract tests don't // gate on the exact value. +// +//nolint:gochecknoglobals // ldflags-injected build identity, see comment above var ( LinstorGitHash = "blockstor" LinstorBuildTime = "2026-01-01T00:00:00+00:00" diff --git a/test/cheatsheet/cheatsheet_test.go b/test/cheatsheet/cheatsheet_test.go index 3e5bfc3c..59d1e9f9 100644 --- a/test/cheatsheet/cheatsheet_test.go +++ b/test/cheatsheet/cheatsheet_test.go @@ -44,7 +44,7 @@ func repoRoot(t *testing.T) string { } d := filepath.Dir(here) - for i := 0; i < 8; i++ { + for range 8 { if _, err := os.Stat(filepath.Join(d, "go.mod")); err == nil { return d }