Skip to content

Commit 627016b

Browse files
authored
Merge pull request #1572 from gianlucam76/deployment-errors
Implement Randomized Backoff for ClusterSummary Deployment Failures
2 parents 5800dab + d02c855 commit 627016b

2 files changed

Lines changed: 53 additions & 1 deletion

File tree

controllers/clustersummary_controller.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@ package controllers
1818

1919
import (
2020
"context"
21+
"crypto/rand"
2122
"errors"
2223
"fmt"
24+
"math/big"
2325
"sync"
2426
"syscall"
2527
"time"
@@ -423,8 +425,47 @@ func (r *ClusterSummaryReconciler) proceedDeployingClusterSummary(ctx context.Co
423425
logger.V(logs.LogInfo).Error(err, "failed to deploy because of conflict")
424426
return reconcile.Result{Requeue: true, RequeueAfter: r.ConflictRetryTime}, nil
425427
}
428+
429+
requeueAfter := normalRequeueAfter
430+
maxFailures := r.getMaxConsecutiveFailures(clusterSummaryScope)
431+
432+
if maxFailures > 1 {
433+
const (
434+
minMultiplier = 2
435+
maxMultiplier = 6
436+
)
437+
438+
// 1. Determine the ceiling safely.
439+
// By checking if maxFailures (uint) is less than maxMultiplier (6),
440+
// we ensure the value is small enough to cast to int64 without overflow.
441+
currentMax := int64(maxMultiplier)
442+
if uint64(maxFailures) < uint64(maxMultiplier) {
443+
currentMax = int64(maxFailures)
444+
}
445+
446+
// 2. Calculate the range (delta) for the random generator
447+
delta := currentMax - minMultiplier + 1
448+
multiplier := int64(minMultiplier)
449+
450+
if delta > 0 {
451+
// Use crypto/rand for gosec compliance
452+
n, err := rand.Int(rand.Reader, big.NewInt(delta))
453+
if err == nil {
454+
multiplier = n.Int64() + minMultiplier
455+
}
456+
}
457+
458+
// 3. Apply the multiplier to the duration
459+
requeueAfter = time.Duration(multiplier) * normalRequeueAfter
460+
461+
logger.V(logs.LogDebug).Info("increasing backoff due to consecutive failures",
462+
"consecutiveFailures", maxFailures,
463+
"multiplier", multiplier,
464+
"requeueAfter", requeueAfter)
465+
}
466+
426467
logger.V(logs.LogInfo).Error(err, "failed to deploy")
427-
return reconcile.Result{Requeue: true, RequeueAfter: normalRequeueAfter}, nil
468+
return reconcile.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
428469
}
429470

430471
logger.V(logs.LogDebug).Info("Reconciling ClusterSummary success")

controllers/clustersummary_deployer.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -853,6 +853,17 @@ func (r *ClusterSummaryReconciler) maxNumberOfConsecutiveFailureReached(clusterS
853853
return false
854854
}
855855

856+
func (r *ClusterSummaryReconciler) getMaxConsecutiveFailures(clusterSummaryScope *scope.ClusterSummaryScope) uint {
857+
maxVal := uint(0)
858+
for i := range clusterSummaryScope.ClusterSummary.Status.FeatureSummaries {
859+
fs := clusterSummaryScope.ClusterSummary.Status.FeatureSummaries[i]
860+
if fs.ConsecutiveFailures > maxVal {
861+
maxVal = fs.ConsecutiveFailures
862+
}
863+
}
864+
return maxVal
865+
}
866+
856867
func logContentSummary(clusterSummary *configv1beta1.ClusterSummary, fID libsveltosv1beta1.FeatureID, logger logr.Logger) {
857868
switch fID {
858869
case libsveltosv1beta1.FeatureHelm:

0 commit comments

Comments
 (0)