Skip to content

Commit 83d2813

Browse files
committed
(bug) Fix Helm conflict race when chart ownership transfers between ClusterSummaries
When a new ClusterSummary (B) is created before an existing one (A) is deleted, and both reference the same Helm chart, B may receive a NonRetriableError for chart ownership conflict while A is still being withdrawn. The existing recovery path handles the normal case correctly: when A is deleted, it unregisters from the chart manager and requeues all conflicting ClusterSummaries so they retry. The race occurs when B's deploy worker is still running at that moment. IsInProgress(B) returns true, the requeue is skipped, and nobody notifies B once A is gone and B has become the rightful owner. B's deploy worker eventually finishes and stores the NonRetriableError. When B's reconciler reads it, the conflict is already resolved, A has been removed from the chart manager and B now owns the chart. Without this fix, B is permanently marked FailedNonRetriable and never retried, since NonRetriableError intentionally suppresses further reconciliation. The fix detects this at the moment B processes the stale result: if all charts that were in conflict can now be managed by B, the error is treated as resolved, status is reset to Provisioning, and a requeue is returned. The next reconciliation cycle submits a fresh deploy.
1 parent 7d9f9bf commit 83d2813

1 file changed

Lines changed: 55 additions & 3 deletions

File tree

controllers/clustersummary_deployer.go

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"sigs.k8s.io/controller-runtime/pkg/client"
3131

3232
configv1beta1 "github.com/projectsveltos/addon-controller/api/v1beta1"
33+
"github.com/projectsveltos/addon-controller/controllers/chartmanager"
3334
"github.com/projectsveltos/addon-controller/lib/clusterops"
3435
"github.com/projectsveltos/addon-controller/pkg/scope"
3536
libsveltosv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1"
@@ -182,9 +183,22 @@ func (r *ClusterSummaryReconciler) proceedDeployingFeature(ctx context.Context,
182183

183184
r.updateFeatureStatus(clusterSummaryScope, f.id, deployerStatus, currentHash, deployerError, logger)
184185
if deployerError != nil {
185-
shouldReturn, err := r.handleDeployerError(deployerError, clusterSummaryScope, f, currentHash, logger)
186-
if shouldReturn {
187-
return err
186+
// Race: when a blocking profile is deleted while this CS's deploy was still in
187+
// progress (IsInProgress=true prevented requeueClusterSummary from running), the
188+
// conflict may already be gone by the time we process this result. If so, skip
189+
// FailedNonRetriable and fall through to submit a new deploy immediately.
190+
var nonRetriableError *configv1beta1.NonRetriableError
191+
if f.id == libsveltosv1beta1.FeatureHelm && errors.As(deployerError, &nonRetriableError) &&
192+
r.helmConflictResolved(ctx, clusterSummaryScope, logger) {
193+
194+
provisioning := libsveltosv1beta1.FeatureStatusProvisioning
195+
r.updateFeatureStatus(clusterSummaryScope, f.id, &provisioning, currentHash, nil, logger)
196+
return fmt.Errorf("helm conflict resolved, requeuing")
197+
} else {
198+
shouldReturn, err := r.handleDeployerError(deployerError, clusterSummaryScope, f, currentHash, logger)
199+
if shouldReturn {
200+
return err
201+
}
188202
}
189203
}
190204
if *deployerStatus == libsveltosv1beta1.FeatureStatusProvisioning {
@@ -873,6 +887,44 @@ func (r *ClusterSummaryReconciler) shouldRedeploy(ctx context.Context,
873887
return true
874888
}
875889

890+
// helmConflictResolved returns true when all Helm charts previously in HelmChartStatusConflict
891+
// can now be managed by this ClusterSummary according to the in-memory chart manager.
892+
// HelmReleaseSummaries contain instantiated release names, so chart-manager lookups are exact.
893+
func (r *ClusterSummaryReconciler) helmConflictResolved(ctx context.Context,
894+
clusterSummaryScope *scope.ClusterSummaryScope, logger logr.Logger) bool {
895+
896+
cs := clusterSummaryScope.ClusterSummary
897+
898+
var conflicted []configv1beta1.HelmChartSummary
899+
for i := range cs.Status.HelmReleaseSummaries {
900+
if cs.Status.HelmReleaseSummaries[i].Status == configv1beta1.HelmChartStatusConflict {
901+
conflicted = append(conflicted, cs.Status.HelmReleaseSummaries[i])
902+
}
903+
}
904+
if len(conflicted) == 0 {
905+
return false
906+
}
907+
908+
chartMgr, err := chartmanager.GetChartManagerInstance(ctx, r.Client)
909+
if err != nil {
910+
logger.V(logs.LogInfo).Error(err, "failed to get chart manager instance")
911+
return false
912+
}
913+
914+
for i := range conflicted {
915+
chart := &configv1beta1.HelmChart{
916+
ReleaseNamespace: conflicted[i].ReleaseNamespace,
917+
ReleaseName: conflicted[i].ReleaseName,
918+
}
919+
if !chartMgr.CanManageChart(cs, chart) {
920+
return false
921+
}
922+
}
923+
924+
logger.V(logs.LogDebug).Info("all previously conflicting helm charts can now be managed")
925+
return true
926+
}
927+
876928
// maxNumberOfConsecutiveFailureReached returns true if max number of consecutive failures has been reached.
877929
func (r *ClusterSummaryReconciler) maxNumberOfConsecutiveFailureReached(clusterSummaryScope *scope.ClusterSummaryScope, f feature,
878930
logger logr.Logger) bool {

0 commit comments

Comments
 (0)