Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions controllers/clustersummary_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -531,12 +531,23 @@ func (r *ClusterSummaryReconciler) prepareForDeployment(ctx context.Context,
func (r *ClusterSummaryReconciler) proceedDeployingClusterSummary(ctx context.Context,
clusterSummaryScope *scope.ClusterSummaryScope, logger logr.Logger) (reconcile.Result, error) {

// Snapshot existing failure messages before deploying so we can detect
// new conflicts vs. ongoing ones and avoid re-raising an event every retry.
preDeployFailures := collectFailureMessages(clusterSummaryScope.ClusterSummary)

err := r.deploy(ctx, clusterSummaryScope, logger)
if err != nil {
var conflictErr *deployer.ConflictError
ok := errors.As(err, &conflictErr)
if ok {
logger.V(logs.LogInfo).Error(err, "failed to deploy because of conflict")
if _, alreadyKnown := preDeployFailures[conflictErr.Error()]; !alreadyKnown {
clusterSummary := clusterSummaryScope.ClusterSummary
r.eventRecorder.Eventf(clusterSummary, nil, corev1.EventTypeWarning, "Conflict",
configv1beta1.ClusterSummaryKind, "Conflict detected for cluster %s %s/%s: %s",
clusterSummary.Spec.ClusterType, clusterSummary.Spec.ClusterNamespace,
clusterSummary.Spec.ClusterName, conflictErr.Error())
}
r.setNextReconcileTime(clusterSummaryScope, r.ConflictRetryTime)
return reconcile.Result{Requeue: true, RequeueAfter: r.ConflictRetryTime}, nil
}
Expand Down Expand Up @@ -2095,3 +2106,16 @@ func getClusterSummaryWithInstantiatedCharts(ctx context.Context, cs *configv1be

return &csCopy, nil
}

// collectFailureMessages builds a set containing every non-nil FailureMessage
// found in cs.Status.FeatureSummaries. Callers can take this snapshot before a
// deploy attempt and look up a conflict message in it to tell a newly seen
// conflict apart from one already recorded in the status.
//
// The returned map is never nil; duplicate messages collapse to a single key.
func collectFailureMessages(cs *configv1beta1.ClusterSummary) map[string]struct{} {
	summaries := cs.Status.FeatureSummaries
	seen := make(map[string]struct{}, len(summaries))
	for i := range summaries {
		failure := summaries[i].FailureMessage
		if failure == nil {
			// This feature has no recorded failure; nothing to add.
			continue
		}
		seen[*failure] = struct{}{}
	}
	return seen
}
17 changes: 17 additions & 0 deletions controllers/clustersummary_deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,17 +238,29 @@ func (r *ClusterSummaryReconciler) proceedDeployingFeature(ctx context.Context,
func (r *ClusterSummaryReconciler) handleDeployerError(deployerError error, clusterSummaryScope *scope.ClusterSummaryScope,
f feature, currentHash []byte, logger logr.Logger) (bool, error) {

clusterSummary := clusterSummaryScope.ClusterSummary

// Check if error is a NonRetriableError type
var nonRetriableError *configv1beta1.NonRetriableError
if errors.As(deployerError, &nonRetriableError) {
nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger)
r.eventRecorder.Eventf(clusterSummary, nil, corev1.EventTypeWarning, "FailedNonRetriable",
configv1beta1.ClusterSummaryKind,
"Feature %s for cluster %s %s/%s failed with non-retriable error: %s",
f.id, clusterSummary.Spec.ClusterType, clusterSummary.Spec.ClusterNamespace,
clusterSummary.Spec.ClusterName, deployerError.Error())
return true, nil
}
var templateError *configv1beta1.TemplateInstantiationError
if errors.As(deployerError, &templateError) {
nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger)
r.eventRecorder.Eventf(clusterSummary, nil, corev1.EventTypeWarning, "FailedNonRetriable",
configv1beta1.ClusterSummaryKind,
"Feature %s for cluster %s %s/%s failed to instantiate template: %s",
f.id, clusterSummary.Spec.ClusterType, clusterSummary.Spec.ClusterNamespace,
clusterSummary.Spec.ClusterName, deployerError.Error())
return true, nil
}
var healthCheckError *clusterops.HealthCheckError
Expand All @@ -261,6 +273,11 @@ func (r *ClusterSummaryReconciler) handleDeployerError(deployerError error, clus
nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
resultError := errors.New("the maximum number of consecutive errors has been reached")
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger)
r.eventRecorder.Eventf(clusterSummary, nil, corev1.EventTypeWarning, "FailedNonRetriable",
configv1beta1.ClusterSummaryKind,
"Feature %s for cluster %s %s/%s will no longer be retried: maximum consecutive failures reached",
f.id, clusterSummary.Spec.ClusterType, clusterSummary.Spec.ClusterNamespace,
clusterSummary.Spec.ClusterName)
return true, nil
}

Expand Down