Skip to content

Commit dcd82cb

Browse files
authored
feat: Change AppliedWork to used foregroundDeletion (#60)
1 parent 224f3c6 commit dcd82cb

File tree

13 files changed

+1621
-231
lines changed

13 files changed

+1621
-231
lines changed

cmd/memberagent/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ var (
9191
driftDetectionInterval = flag.Int("drift-detection-interval", 15, "The interval in seconds between attempts to detect configuration drifts in the cluster.")
9292
watchWorkWithPriorityQueue = flag.Bool("enable-watch-work-with-priority-queue", false, "If set, the apply_work controller will watch/reconcile work objects that are created new or have recent updates")
9393
watchWorkReconcileAgeMinutes = flag.Int("watch-work-reconcile-age", 60, "maximum age (in minutes) of work objects for apply_work controller to watch/reconcile")
94+
deletionWaitTime = flag.Int("deletion-wait-time", 5, "The time the work-applier will wait for work object to be deleted before updating the applied work owner reference")
9495
enablePprof = flag.Bool("enable-pprof", false, "enable pprof profiling")
9596
pprofPort = flag.Int("pprof-port", 6065, "port for pprof profiling")
9697
hubPprofPort = flag.Int("hub-pprof-port", 6066, "port for hub pprof profiling")
@@ -395,6 +396,7 @@ func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memb
395396
parallelizer.DefaultNumOfWorkers,
396397
time.Second*time.Duration(*availabilityCheckInterval),
397398
time.Second*time.Duration(*driftDetectionInterval),
399+
time.Minute*time.Duration(*deletionWaitTime),
398400
*watchWorkWithPriorityQueue,
399401
*watchWorkReconcileAgeMinutes,
400402
)

pkg/controllers/internalmembercluster/v1beta1/member_suite_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ var _ = BeforeSuite(func() {
379379

380380
// This controller is created for testing purposes only; no reconciliation loop is actually
381381
// run.
382-
workApplier1 = workapplier.NewReconciler(hubClient, member1ReservedNSName, nil, nil, nil, nil, 0, 1, time.Second*5, time.Second*5, true, 60)
382+
workApplier1 = workapplier.NewReconciler(hubClient, member1ReservedNSName, nil, nil, nil, nil, 0, 1, time.Second*5, time.Second*5, time.Minute, true, 60)
383383

384384
propertyProvider1 = &manuallyUpdatedProvider{}
385385
member1Reconciler, err := NewReconciler(ctx, hubClient, member1Cfg, member1Client, workApplier1, propertyProvider1)
@@ -402,7 +402,7 @@ var _ = BeforeSuite(func() {
402402

403403
// This controller is created for testing purposes only; no reconciliation loop is actually
404404
// run.
405-
workApplier2 = workapplier.NewReconciler(hubClient, member2ReservedNSName, nil, nil, nil, nil, 0, 1, time.Second*5, time.Second*5, true, 60)
405+
workApplier2 = workapplier.NewReconciler(hubClient, member2ReservedNSName, nil, nil, nil, nil, 0, 1, time.Second*5, time.Second*5, time.Minute, true, 60)
406406

407407
member2Reconciler, err := NewReconciler(ctx, hubClient, member2Cfg, member2Client, workApplier2, nil)
408408
Expect(err).NotTo(HaveOccurred())

pkg/controllers/workapplier/apply.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package workapplier
1919
import (
2020
"context"
2121
"fmt"
22-
"reflect"
2322

2423
corev1 "k8s.io/api/core/v1"
2524
"k8s.io/apimachinery/pkg/api/validation"
@@ -518,7 +517,7 @@ func validateOwnerReferences(
518517
// expected AppliedWork object. For safety reasons, Fleet will still do a sanity check.
519518
found := false
520519
for _, ownerRef := range inMemberClusterObjOwnerRefs {
521-
if reflect.DeepEqual(ownerRef, *expectedAppliedWorkOwnerRef) {
520+
if areOwnerRefsEqual(&ownerRef, expectedAppliedWorkOwnerRef) {
522521
found = true
523522
break
524523
}

pkg/controllers/workapplier/controller.go

Lines changed: 102 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,3 @@
1-
/*
2-
Copyright 2021 The Kubernetes Authors.
3-
4-
Licensed under the Apache License, Version 2.0 (the "License");
5-
you may not use this file except in compliance with the License.
6-
You may obtain a copy of the License at
7-
8-
http://www.apache.org/licenses/LICENSE-2.0
9-
10-
Unless required by applicable law or agreed to in writing, software
11-
distributed under the License is distributed on an "AS IS" BASIS,
12-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
See the License for the specific language governing permissions and
14-
limitations under the License.
15-
*/
16-
171
/*
182
Copyright 2025 The KubeFleet Authors.
193
@@ -34,6 +18,7 @@ package workapplier
3418

3519
import (
3620
"context"
21+
"fmt"
3722
"time"
3823

3924
"go.uber.org/atomic"
@@ -212,6 +197,7 @@ type Reconciler struct {
212197

213198
availabilityCheckRequeueAfter time.Duration
214199
driftCheckRequeueAfter time.Duration
200+
deletionWaitTime time.Duration
215201
}
216202

217203
func NewReconciler(
@@ -222,6 +208,7 @@ func NewReconciler(
222208
workerCount int,
223209
availabilityCheckRequestAfter time.Duration,
224210
driftCheckRequestAfter time.Duration,
211+
deletionWaitTime time.Duration,
225212
watchWorkWithPriorityQueue bool,
226213
watchWorkReconcileAgeMinutes int,
227214
) *Reconciler {
@@ -251,6 +238,7 @@ func NewReconciler(
251238
joined: atomic.NewBool(false),
252239
availabilityCheckRequeueAfter: acRequestAfter,
253240
driftCheckRequeueAfter: dcRequestAfter,
241+
deletionWaitTime: deletionWaitTime,
254242
}
255243
}
256244

@@ -417,7 +405,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
417405
Kind: fleetv1beta1.AppliedWorkKind,
418406
Name: appliedWork.GetName(),
419407
UID: appliedWork.GetUID(),
420-
BlockOwnerDeletion: ptr.To(false),
408+
BlockOwnerDeletion: ptr.To(true),
421409
}
422410

423411
// Set the default values for the Work object to avoid additional validation logic in the
@@ -485,29 +473,112 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
485473
return ctrl.Result{RequeueAfter: r.driftCheckRequeueAfter}, nil
486474
}
487475

488-
// garbageCollectAppliedWork deletes the appliedWork and all the manifests associated with it from the cluster.
489476
func (r *Reconciler) garbageCollectAppliedWork(ctx context.Context, work *fleetv1beta1.Work) (ctrl.Result, error) {
490-
deletePolicy := metav1.DeletePropagationBackground
477+
deletePolicy := metav1.DeletePropagationForeground
491478
if !controllerutil.ContainsFinalizer(work, fleetv1beta1.WorkFinalizer) {
492479
return ctrl.Result{}, nil
493480
}
494-
// delete the appliedWork which will remove all the manifests associated with it
495-
// TODO: allow orphaned manifest
496-
appliedWork := fleetv1beta1.AppliedWork{
481+
appliedWork := &fleetv1beta1.AppliedWork{
497482
ObjectMeta: metav1.ObjectMeta{Name: work.Name},
498483
}
499-
err := r.spokeClient.Delete(ctx, &appliedWork, &client.DeleteOptions{PropagationPolicy: &deletePolicy})
500-
switch {
501-
case apierrors.IsNotFound(err):
502-
klog.V(2).InfoS("The appliedWork is already deleted", "appliedWork", work.Name)
503-
case err != nil:
504-
klog.ErrorS(err, "Failed to delete the appliedWork", "appliedWork", work.Name)
484+
// Get the AppliedWork object
485+
if err := r.spokeClient.Get(ctx, types.NamespacedName{Name: work.Name}, appliedWork); err != nil {
486+
if apierrors.IsNotFound(err) {
487+
klog.V(2).InfoS("The appliedWork is already deleted, removing the finalizer from the work", "appliedWork", work.Name)
488+
return r.removeWorkFinalizer(ctx, work)
489+
}
490+
klog.ErrorS(err, "Failed to get AppliedWork", "appliedWork", work.Name)
505491
return ctrl.Result{}, controller.NewAPIServerError(false, err)
506-
default:
507-
klog.InfoS("Successfully deleted the appliedWork", "appliedWork", work.Name)
508492
}
509-
controllerutil.RemoveFinalizer(work, fleetv1beta1.WorkFinalizer)
510493

494+
// Handle stuck deletion after 5 minutes where the other owner references might not exist or are invalid.
495+
if !appliedWork.DeletionTimestamp.IsZero() && time.Since(appliedWork.DeletionTimestamp.Time) >= r.deletionWaitTime {
496+
klog.V(2).InfoS("AppliedWork deletion appears stuck; attempting to patch owner references", "appliedWork", work.Name)
497+
if err := r.updateOwnerReference(ctx, work, appliedWork); err != nil {
498+
klog.ErrorS(err, "Failed to update owner references for AppliedWork", "appliedWork", work.Name)
499+
return ctrl.Result{}, controller.NewAPIServerError(false, err)
500+
}
501+
return ctrl.Result{}, fmt.Errorf("AppliedWork %s is being deleted, waiting for the deletion to complete", work.Name)
502+
}
503+
504+
if err := r.spokeClient.Delete(ctx, appliedWork, &client.DeleteOptions{PropagationPolicy: &deletePolicy}); err != nil {
505+
if apierrors.IsNotFound(err) {
506+
klog.V(2).InfoS("AppliedWork already deleted", "appliedWork", work.Name)
507+
return r.removeWorkFinalizer(ctx, work)
508+
}
509+
klog.V(2).ErrorS(err, "Failed to delete the appliedWork", "appliedWork", work.Name)
510+
return ctrl.Result{}, controller.NewAPIServerError(false, err)
511+
}
512+
513+
klog.V(2).InfoS("AppliedWork deletion in progress", "appliedWork", work.Name)
514+
return ctrl.Result{}, fmt.Errorf("AppliedWork %s is being deleted, waiting for the deletion to complete", work.Name)
515+
}
516+
517+
// updateOwnerReference updates the AppliedWork owner reference in the manifest objects.
518+
// It changes the blockOwnerDeletion field to false, so that the AppliedWork can be deleted in cases where
519+
// the other owner references do not exist or are invalid.
520+
// https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/#owner-references-in-object-specifications
521+
func (r *Reconciler) updateOwnerReference(ctx context.Context, work *fleetv1beta1.Work, appliedWork *fleetv1beta1.AppliedWork) error {
522+
appliedWorkOwnerRef := &metav1.OwnerReference{
523+
APIVersion: fleetv1beta1.GroupVersion.String(),
524+
Kind: "AppliedWork",
525+
Name: appliedWork.Name,
526+
UID: appliedWork.UID,
527+
}
528+
529+
if err := r.hubClient.Get(ctx, types.NamespacedName{Name: work.Name, Namespace: work.Namespace}, work); err != nil {
530+
if apierrors.IsNotFound(err) {
531+
klog.V(2).InfoS("Work object not found, skipping owner reference update", "work", work.Name, "namespace", work.Namespace)
532+
return nil
533+
}
534+
klog.ErrorS(err, "Failed to get Work object for owner reference update", "work", work.Name, "namespace", work.Namespace)
535+
return controller.NewAPIServerError(false, err)
536+
}
537+
538+
for _, cond := range work.Status.ManifestConditions {
539+
res := cond.Identifier
540+
gvr := schema.GroupVersionResource{
541+
Group: res.Group,
542+
Version: res.Version,
543+
Resource: res.Resource,
544+
}
545+
546+
var obj *unstructured.Unstructured
547+
var err error
548+
if obj, err = r.spokeDynamicClient.Resource(gvr).Namespace(res.Namespace).Get(ctx, res.Name, metav1.GetOptions{}); err != nil {
549+
if apierrors.IsNotFound(err) {
550+
continue
551+
}
552+
klog.ErrorS(err, "Failed to get manifest", "gvr", gvr, "name", res.Name, "namespace", res.Namespace)
553+
return err
554+
}
555+
// Check if there is more than one owner reference. If there is only one owner reference, it is the appliedWork itself.
556+
// Otherwise, at least one other owner reference exists, and we need to leave resource alone.
557+
if len(obj.GetOwnerReferences()) > 1 {
558+
ownerRefs := obj.GetOwnerReferences()
559+
updated := false
560+
for idx := range ownerRefs {
561+
if areOwnerRefsEqual(&ownerRefs[idx], appliedWorkOwnerRef) {
562+
ownerRefs[idx].BlockOwnerDeletion = ptr.To(false)
563+
updated = true
564+
}
565+
}
566+
if updated {
567+
obj.SetOwnerReferences(ownerRefs)
568+
if _, err = r.spokeDynamicClient.Resource(gvr).Namespace(obj.GetNamespace()).Update(ctx, obj, metav1.UpdateOptions{}); err != nil {
569+
klog.ErrorS(err, "Failed to update manifest owner references", "gvr", gvr, "name", res.Name, "namespace", res.Namespace)
570+
return err
571+
}
572+
klog.V(4).InfoS("Patched manifest owner references", "gvr", gvr, "name", res.Name, "namespace", res.Namespace)
573+
}
574+
}
575+
}
576+
return nil
577+
}
578+
579+
// removeWorkFinalizer removes the finalizer from the work and updates it in the hub.
580+
func (r *Reconciler) removeWorkFinalizer(ctx context.Context, work *fleetv1beta1.Work) (ctrl.Result, error) {
581+
controllerutil.RemoveFinalizer(work, fleetv1beta1.WorkFinalizer)
511582
if err := r.hubClient.Update(ctx, work, &client.UpdateOptions{}); err != nil {
512583
klog.ErrorS(err, "Failed to remove the finalizer from the work", "work", klog.KObj(work))
513584
return ctrl.Result{}, controller.NewAPIServerError(false, err)

0 commit comments

Comments
 (0)