Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 45 additions & 4 deletions hypershift-operator/controllers/hostedcluster/etcd_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ import (
"github.com/go-logr/logr"
)

func isEtcdStatefulSetHealthy(sts *appsv1.StatefulSet) bool {
return sts.Spec.Replicas != nil &&
sts.Status.ReadyReplicas == *sts.Spec.Replicas &&
sts.Status.AvailableReplicas == *sts.Spec.Replicas
}

type etcdJobStatus struct {
exists bool
finished bool
Expand Down Expand Up @@ -62,12 +68,31 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
// It returns (done, err) where done=true means the caller should return immediately
// without falling through to detectAndTriggerEtcdRecovery.
func (r *HostedClusterReconciler) handleExistingEtcdRecoveryJob(ctx context.Context, log logr.Logger, hcluster *hyperv1.HostedCluster, recoveryJob *batchv1.Job, jobStatus *etcdJobStatus) (bool, error) {
hcpNS := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name)

if !jobStatus.finished {
log.Info("waiting for etcd recovery job to complete")
return true, nil
}

if !jobStatus.successful {
etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS)
if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err != nil {
if !apierrors.IsNotFound(err) {
return false, fmt.Errorf("failed to get etcd statefulset: %w", err)
}
} else if isEtcdStatefulSetHealthy(etcdStatefulSet) {
log.Info("etcd recovered despite failed recovery job, cleaning up")

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] New messages use "ETCD" (all-caps) while existing code at line 93 uses "Etcd" (title-case). Not blocking, but consider unifying the capitalization.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, unified to "Etcd" throughout.

if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
return false, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
}
if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "Etcd recovered despite failed recovery job."); err != nil {
return false, err
}
return true, nil
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[blocking] The fallthrough to this existing setEtcdRecoveryCondition call is the "etcd is still unhealthy" case, but it is easy to miss after the new health-check block above. A short comment would help:

// Etcd is still unhealthy after the failed recovery job; report manual intervention needed.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @bryan-cox,
Done, added the comment.


// Etcd is still unhealthy after the failed recovery job; report manual intervention needed.
if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.EtcdRecoveryJobFailedReason, "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."); err != nil {
return false, err
}
Expand All @@ -80,7 +105,7 @@ func (r *HostedClusterReconciler) handleExistingEtcdRecoveryJob(ctx context.Cont
return false, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
}

if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD Recovery job succeeded."); err != nil {
if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "Etcd recovery job succeeded."); err != nil {
return false, err
}

Expand All @@ -100,8 +125,11 @@ func (r *HostedClusterReconciler) setEtcdRecoveryCondition(ctx context.Context,
LastTransitionTime: r.now(),
}

// Update the condition if the status or reason changed. The reason check
// is needed to transition from EtcdRecoveryJobFailed -> AsExpected when
// etcd self-heals (both use Status=False).
oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
if oldCondition == nil || oldCondition.Status != condition.Status {
if oldCondition == nil || oldCondition.Status != condition.Status || oldCondition.Reason != condition.Reason {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[blocking] This guard change is necessary for the fix (both old and new condition use Status=False), but the intent is non-obvious. Please add a comment explaining why Reason is now compared:

// Update the condition if the status or reason changed. The reason check
// is needed to transition from EtcdRecoveryJobFailed -> AsExpected when
// etcd self-heals (both use Status=False).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @bryan-cox,
Done, added the comment.

meta.SetStatusCondition(&hcluster.Status.Conditions, condition)
if err := r.Client.Status().Update(ctx, hcluster); err != nil {
return fmt.Errorf("failed to update etcd recovery job condition: %w", err)
Expand All @@ -117,8 +145,9 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Conte
log.Info("etcd statefulset does not yet exist")
return nil, nil
}
return nil, fmt.Errorf("failed to get etcd statefulset: %w", err)
}
fullyAvailable := etcdStatefulSet.Status.ReadyReplicas == 3 && etcdStatefulSet.Status.AvailableReplicas == 3
fullyAvailable := isEtcdStatefulSetHealthy(etcdStatefulSet)
if !fullyAvailable {
log.Info("etcd is not reporting fully available, need to watch")
}
Expand All @@ -134,6 +163,18 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Conte
if !fullyAvailable {
return &requeueAfter, nil
}

// Only clear conditions with EtcdRecoveryJobFailedReason. Conditions set
// during active recovery (Status=True/Reason=AsExpected) are managed by
// handleExistingEtcdRecoveryJob when the job completes.
oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
if oldCondition != nil && oldCondition.Reason == hyperv1.EtcdRecoveryJobFailedReason {
log.Info("etcd is healthy but EtcdRecoveryActive has stale failure condition, clearing it")
if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "Etcd cluster is healthy."); err != nil {
return nil, err
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[question] This only clears stale conditions with Reason == EtcdRecoveryJobFailedReason. If someone manually deletes the failed recovery Job while recovery is active, the condition could be stuck at Status=True/Reason=AsExpected — that would not be caught here.

Is manual job deletion during active recovery a scenario worth guarding against? If not, a brief comment explaining the scope would be helpful.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added scope comment. Not a scenario worth guarding against active recovery conditions are managed by handleExistingEtcdRecoveryJob.

}

return nil, nil
}

Expand All @@ -143,7 +184,7 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Conte
return nil, err
}

if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionTrue, hyperv1.AsExpectedReason, "ETCD Recovery job in progress."); err != nil {
if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionTrue, hyperv1.AsExpectedReason, "Etcd recovery job in progress."); err != nil {
return nil, err
}

Expand Down
Loading