Skip to content

Commit 03e265d

Browse files
authored
fix: fix updateRun handling unscheduled bindings (#95)
1 parent 8c83fed commit 03e265d

File tree

5 files changed

+240
-26
lines changed

5 files changed

+240
-26
lines changed

pkg/controllers/updaterun/controller_integration_test.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -410,11 +410,17 @@ func generateTestClusterResourceBindingsAndClusters(policySnapshotIndex int) ([]
410410
}
411411

412412
unscheduledClusters := make([]*clusterv1beta1.MemberCluster, numUnscheduledClusters)
413-
for i := range unscheduledClusters {
413+
// Half of the unscheduled clusters have old policy snapshot.
414+
for i := range numUnscheduledClusters / 2 {
414415
unscheduledClusters[i] = generateTestMemberCluster(i, "unscheduled-cluster-"+strconv.Itoa(i), map[string]string{"group": "staging"})
415-
// update the policySnapshot name so that these clusters are considered to-be-deleted
416+
// Update the policySnapshot name so that these clusters are considered to-be-deleted.
416417
resourceBindings[numTargetClusters+i] = generateTestClusterResourceBinding(policySnapshotName+"a", unscheduledClusters[i].Name, placementv1beta1.BindingStateUnscheduled)
417418
}
419+
// The other half of the unscheduled clusters have latest policy snapshot but still unscheduled.
420+
for i := numUnscheduledClusters / 2; i < numUnscheduledClusters; i++ {
421+
unscheduledClusters[i] = generateTestMemberCluster(i, "unscheduled-cluster-"+strconv.Itoa(i), map[string]string{"group": "staging"})
422+
resourceBindings[numTargetClusters+i] = generateTestClusterResourceBinding(policySnapshotName, unscheduledClusters[i].Name, placementv1beta1.BindingStateUnscheduled)
423+
}
418424
return resourceBindings, targetClusters, unscheduledClusters
419425
}
420426

pkg/controllers/updaterun/initialization.go

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -190,22 +190,25 @@ func (r *Reconciler) collectScheduledClusters(
190190
var toBeDeletedBindings, selectedBindings []*placementv1beta1.ClusterResourceBinding
191191
for i, binding := range bindingList.Items {
192192
if binding.Spec.SchedulingPolicySnapshotName == latestPolicySnapshot.Name {
193-
if binding.Spec.State != placementv1beta1.BindingStateScheduled && binding.Spec.State != placementv1beta1.BindingStateBound {
194-
stateErr := controller.NewUnexpectedBehaviorError(fmt.Errorf("binding `%s`'s state %s is not scheduled or bound", binding.Name, binding.Spec.State))
195-
klog.ErrorS(stateErr, "Failed to collect clusterResourceBindings", "clusterResourcePlacement", placementName, "latestPolicySnapshot", latestPolicySnapshot.Name, "clusterStagedUpdateRun", updateRunRef)
196-
// no more retries here.
197-
return nil, nil, fmt.Errorf("%w: %s", errInitializedFailed, stateErr.Error())
193+
if binding.Spec.State == placementv1beta1.BindingStateUnscheduled {
194+
klog.V(2).InfoS("Found an unscheduled binding with the latest policy snapshot, delete it", "binding", binding.Name, "clusterResourcePlacement", placementName,
195+
"latestPolicySnapshot", latestPolicySnapshot.Name, "clusterStagedUpdateRun", updateRunRef)
196+
toBeDeletedBindings = append(toBeDeletedBindings, &bindingList.Items[i])
197+
} else {
198+
klog.V(2).InfoS("Found a scheduled binding", "binding", binding.Name, "clusterResourcePlacement", placementName,
199+
"latestPolicySnapshot", latestPolicySnapshot.Name, "clusterStagedUpdateRun", updateRunRef)
200+
selectedBindings = append(selectedBindings, &bindingList.Items[i])
198201
}
199-
klog.V(2).InfoS("Found a scheduled binding", "binding", binding.Name, "clusterResourcePlacement", placementName, "latestPolicySnapshot", latestPolicySnapshot.Name, "clusterStagedUpdateRun", updateRunRef)
200-
selectedBindings = append(selectedBindings, &bindingList.Items[i])
201202
} else {
202203
if binding.Spec.State != placementv1beta1.BindingStateUnscheduled {
203-
stateErr := controller.NewUnexpectedBehaviorError(fmt.Errorf("binding `%s` with old policy snapshot %s has state %s, not unscheduled", binding.Name, binding.Spec.SchedulingPolicySnapshotName, binding.Spec.State))
204-
klog.ErrorS(stateErr, "Failed to collect clusterResourceBindings", "clusterResourcePlacement", placementName, "latestPolicySnapshot", latestPolicySnapshot.Name, "clusterStagedUpdateRun", updateRunRef)
205-
// no more retries here.
206-
return nil, nil, fmt.Errorf("%w: %s", errInitializedFailed, stateErr.Error())
204+
stateErr := fmt.Errorf("binding `%s` with old policy snapshot %s has state %s, we might observe a transient state, need retry", binding.Name, binding.Spec.SchedulingPolicySnapshotName, binding.Spec.State)
205+
klog.V(2).InfoS("Found a not-unscheduled binding with old policy snapshot, retrying", "binding", binding.Name, "clusterResourcePlacement", placementName,
206+
"latestPolicySnapshot", latestPolicySnapshot.Name, "clusterStagedUpdateRun", updateRunRef)
207+
// Transient state can be retried.
208+
return nil, nil, stateErr
207209
}
208-
klog.V(2).InfoS("Found a to-be-deleted binding", "binding", binding.Name, "cluster", binding.Spec.TargetCluster, "clusterResourcePlacement", placementName, "latestPolicySnapshot", latestPolicySnapshot.Name, "clusterStagedUpdateRun", updateRunRef)
210+
klog.V(2).InfoS("Found an unscheduled binding with old policy snapshot", "binding", binding.Name, "cluster", binding.Spec.TargetCluster,
211+
"clusterResourcePlacement", placementName, "latestPolicySnapshot", latestPolicySnapshot.Name, "clusterStagedUpdateRun", updateRunRef)
209212
toBeDeletedBindings = append(toBeDeletedBindings, &bindingList.Items[i])
210213
}
211214
}

pkg/controllers/updaterun/initialization_integration_test.go

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -438,31 +438,47 @@ var _ = Describe("Updaterun initialization tests", func() {
438438
Expect(updateRun.Status.PolicyObservedClusterCount).To(Equal(1), "failed to update the updateRun PolicyObservedClusterCount status")
439439
})
440440

441-
It("Should fail to initialize if the bindings with latest policy snapshots are not in Scheduled or Bound state", func() {
441+
It("Should not fail to initialize if the bindings with latest policy snapshots are in Unscheduled state", func() {
442442
By("Creating a not scheduled clusterResourceBinding")
443443
binding := generateTestClusterResourceBinding(policySnapshot.Name, "cluster-1", placementv1beta1.BindingStateUnscheduled)
444444
Expect(k8sClient.Create(ctx, binding)).To(Succeed())
445445

446446
By("Creating a new clusterStagedUpdateRun")
447447
Expect(k8sClient.Create(ctx, updateRun)).To(Succeed())
448448

449-
By("Validating the initialization failed")
450-
validateFailedInitCondition(ctx, updateRun, "state Unscheduled is not scheduled or bound")
449+
By("Validating the initialization failed due to observedClusterCount mismatch, not binding state, and no selected clusters")
450+
validateFailedInitCondition(ctx, updateRun, "the number of selected bindings 0 is not equal to the observed cluster count 10")
451451

452452
By("Deleting the clusterResourceBinding")
453453
Expect(k8sClient.Delete(ctx, binding)).Should(Succeed())
454454
})
455455

456-
It("Should fail to initialize if the bindings with old policy snapshots are not in Unscheduled state", func() {
456+
It("Should retry to initialize if the bindings with old policy snapshots are not in Unscheduled state", func() {
457457
By("Creating a scheduled clusterResourceBinding with old policy snapshot")
458458
binding := generateTestClusterResourceBinding(policySnapshot.Name+"a", "cluster-0", placementv1beta1.BindingStateScheduled)
459459
Expect(k8sClient.Create(ctx, binding)).To(Succeed())
460460

461461
By("Creating a new clusterStagedUpdateRun")
462462
Expect(k8sClient.Create(ctx, updateRun)).To(Succeed())
463463

464-
By("Validating the initialization failed")
465-
validateFailedInitCondition(ctx, updateRun, "has state Scheduled, not unscheduled")
464+
By("Validating the initialization not failed consistently")
465+
// Populate the cache first.
466+
Eventually(func() error {
467+
if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil {
468+
return err
469+
}
470+
return nil
471+
}, timeout, interval).Should(Succeed(), "failed to get the updateRun")
472+
Consistently(func() error {
473+
if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil {
474+
return err
475+
}
476+
initCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized))
477+
if initCond != nil {
478+
return fmt.Errorf("got initialization condition: %v, want nil", initCond)
479+
}
480+
return nil
481+
}, duration, interval).Should(Succeed(), "the initialization should keep retrying, not failed")
466482

467483
By("Deleting the clusterResourceBinding")
468484
Expect(k8sClient.Delete(ctx, binding)).Should(Succeed())

test/e2e/setup_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ var (
243243
}
244244

245245
updateRunStatusCmpOption = cmp.Options{
246+
cmpopts.SortSlices(lessFuncCondition),
246247
utils.IgnoreConditionLTTAndMessageFields,
247248
cmpopts.IgnoreFields(placementv1beta1.StageUpdatingStatus{}, "StartTime", "EndTime"),
248249
cmpopts.EquateEmpty(),

0 commit comments

Comments
 (0)