From c5ada28bfdde4d0db2de2dea362a39cd0f2ad6aa Mon Sep 17 00:00:00 2001 From: Vimal Solanki Date: Mon, 4 May 2026 16:54:07 +0530 Subject: [PATCH] fix(etcd-recovery): clear stale EtcdRecoveryActive failure condition when etcd is healthy When the etcd recovery job fails but etcd self-heals, the EtcdRecoveryJobFailed condition was never cleared. This caused the OpenShift Console to display a stale error message even when the cluster was fully healthy. This fix adds two checks: - When a failed recovery job exists but etcd StatefulSet is fully available (3/3), clean up the job and clear the condition. Transient API errors are propagated instead of silently falling through to the failure path. - When no failing etcd pods exist and etcd is healthy, clear any stale EtcdRecoveryJobFailed condition. Signed-off-by: Vimal Solanki --- .../hostedcluster/etcd_recovery.go | 49 ++- .../hostedcluster_controller_test.go | 299 ++++++++++++++++++ 2 files changed, 344 insertions(+), 4 deletions(-) diff --git a/hypershift-operator/controllers/hostedcluster/etcd_recovery.go b/hypershift-operator/controllers/hostedcluster/etcd_recovery.go index 3d67b56e52fd..06d8359343cb 100644 --- a/hypershift-operator/controllers/hostedcluster/etcd_recovery.go +++ b/hypershift-operator/controllers/hostedcluster/etcd_recovery.go @@ -29,6 +29,12 @@ import ( "github.com/go-logr/logr" ) +func isEtcdStatefulSetHealthy(sts *appsv1.StatefulSet) bool { + return sts.Spec.Replicas != nil && + sts.Status.ReadyReplicas == *sts.Spec.Replicas && + sts.Status.AvailableReplicas == *sts.Spec.Replicas +} + type etcdJobStatus struct { exists bool finished bool @@ -62,12 +68,31 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex // It returns (done, err) where done=true means the caller should return immediately // without falling through to detectAndTriggerEtcdRecovery. func (r *HostedClusterReconciler) handleExistingEtcdRecoveryJob(ctx context.Context, log logr.Logger, hcluster *hyperv1.HostedCluster, recoveryJob *batchv1.Job, jobStatus *etcdJobStatus) (bool, error) { + hcpNS := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) + if !jobStatus.finished { log.Info("waiting for etcd recovery job to complete") return true, nil } if !jobStatus.successful { + etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS) + if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err != nil { + if !apierrors.IsNotFound(err) { + return false, fmt.Errorf("failed to get etcd statefulset: %w", err) + } + } else if isEtcdStatefulSetHealthy(etcdStatefulSet) { + log.Info("etcd recovered despite failed recovery job, cleaning up") + if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil { + return false, fmt.Errorf("failed to cleanup etcd recovery job: %w", err) + } + if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "Etcd recovered despite failed recovery job."); err != nil { + return false, err + } + return true, nil + } + + // Etcd is still unhealthy after the failed recovery job; report manual intervention needed. if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.EtcdRecoveryJobFailedReason, "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."); err != nil { return false, err } @@ -80,7 +105,7 @@ func (r *HostedClusterReconciler) handleExistingEtcdRecoveryJob(ctx context.Cont return false, fmt.Errorf("failed to cleanup etcd recovery job: %w", err) } - if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD Recovery job succeeded."); err != nil { + if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "Etcd recovery job succeeded."); err != nil { return false, err } @@ -100,8 +125,11 @@ func (r *HostedClusterReconciler) setEtcdRecoveryCondition(ctx context.Context, LastTransitionTime: r.now(), } + // Update the condition if the status or reason changed. The reason check + // is needed to transition from EtcdRecoveryJobFailed -> AsExpected when + // etcd self-heals (both use Status=False). oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive)) - if oldCondition == nil || oldCondition.Status != condition.Status { + if oldCondition == nil || oldCondition.Status != condition.Status || oldCondition.Reason != condition.Reason { meta.SetStatusCondition(&hcluster.Status.Conditions, condition) if err := r.Client.Status().Update(ctx, hcluster); err != nil { return fmt.Errorf("failed to update etcd recovery job condition: %w", err) @@ -117,8 +145,9 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Conte log.Info("etcd statefulset does not yet exist") return nil, nil } + return nil, fmt.Errorf("failed to get etcd statefulset: %w", err) } - fullyAvailable := etcdStatefulSet.Status.ReadyReplicas == 3 && etcdStatefulSet.Status.AvailableReplicas == 3 + fullyAvailable := isEtcdStatefulSetHealthy(etcdStatefulSet) if !fullyAvailable { log.Info("etcd is not reporting fully available, need to watch") } @@ -134,6 +163,18 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Conte if !fullyAvailable { return &requeueAfter, nil } + + // Only clear conditions with EtcdRecoveryJobFailedReason. Conditions set + // during active recovery (Status=True/Reason=AsExpected) are managed by + // handleExistingEtcdRecoveryJob when the job completes. + oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive)) + if oldCondition != nil && oldCondition.Reason == hyperv1.EtcdRecoveryJobFailedReason { + log.Info("etcd is healthy but EtcdRecoveryActive has stale failure condition, clearing it") + if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "Etcd cluster is healthy."); err != nil { + return nil, err + } + } + return nil, nil } @@ -143,7 +184,7 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Conte return nil, err } - if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionTrue, hyperv1.AsExpectedReason, "ETCD Recovery job in progress."); err != nil { + if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionTrue, hyperv1.AsExpectedReason, "Etcd recovery job in progress."); err != nil { return nil, err } diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go index e86cc2236404..66a7d53aaaba 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go @@ -30,6 +30,7 @@ import ( hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics" hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests" "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator" + etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery" kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra" "github.com/openshift/hypershift/support/api" "github.com/openshift/hypershift/support/azureutil" @@ -50,9 +51,11 @@ import ( configv1 "github.com/openshift/api/config/v1" appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" errors2 "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" @@ -7141,3 +7144,299 @@ func TestKasServingCertHashFromEndpoint(t *testing.T) { }) } } + +func TestReconcileETCDMemberRecovery(t *testing.T) { + hcpNS := "clusters-test-hc" + + healthyEtcdPods := func() []crclient.Object { + var pods []crclient.Object + for i := 0; i < 3; i++ { + pods = append(pods, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("etcd-%d", i), + Namespace: hcpNS, + Labels: map[string]string{"app": "etcd"}, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "etcd", + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }, + }, + }, + }) + } + return pods + } + + recoveredEtcdPods := func() []crclient.Object { + var pods []crclient.Object + for i := 0; i < 3; i++ { + pods = append(pods, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("etcd-%d", i), + Namespace: hcpNS, + Labels: map[string]string{"app": "etcd"}, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "etcd", + RestartCount: 3, + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }, + }, + }, + }) + } + return pods + } + + initEtcdStatefulSet := func(specReplicas, readyReplicas, availableReplicas int32) *appsv1.StatefulSet { + return &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd", + Namespace: hcpNS, + }, + Spec: appsv1.StatefulSetSpec{ + Replicas: ptr.To[int32](specReplicas), + }, + Status: appsv1.StatefulSetStatus{ + ReadyReplicas: readyReplicas, + AvailableReplicas: availableReplicas, + }, + } + } + + healthyStatefulSet := func() *appsv1.StatefulSet { return initEtcdStatefulSet(3, 3, 3) } + unhealthyStatefulSet := func() *appsv1.StatefulSet { return initEtcdStatefulSet(3, 2, 2) } + + staleCondition := func() metav1.Condition { + return metav1.Condition{ + Type: string(hyperv1.EtcdRecoveryActive), + Status: metav1.ConditionFalse, + Reason: hyperv1.EtcdRecoveryJobFailedReason, + Message: "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.", + LastTransitionTime: metav1.Now(), + } + } + + failedJob := func() *batchv1.Job { + job := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS) + job.Status = batchv1.JobStatus{ + Conditions: []batchv1.JobCondition{ + { + Type: batchv1.JobFailed, + Status: corev1.ConditionTrue, + }, + }, + } + return job + } + + testCases := []struct { + name string + objects []crclient.Object + conditions []metav1.Condition + expectedReason string + conditionExists bool + expectJobDeleted bool + }{ + { + name: "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition", + conditions: []metav1.Condition{staleCondition()}, + objects: append(healthyEtcdPods(), healthyStatefulSet()), + expectedReason: hyperv1.AsExpectedReason, + conditionExists: true, + }, + { + name: "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one", + conditions: []metav1.Condition{}, + objects: append(healthyEtcdPods(), healthyStatefulSet()), + conditionExists: false, + }, + { + name: "When failed job exists but etcd recovered it should cleanup job and clear condition", + conditions: []metav1.Condition{staleCondition()}, + objects: append(healthyEtcdPods(), healthyStatefulSet(), failedJob()), + expectedReason: hyperv1.AsExpectedReason, + conditionExists: true, + expectJobDeleted: true, + }, + { + name: "When etcd pods have restarted but recovered it should clear the stale condition", + conditions: []metav1.Condition{staleCondition()}, + objects: append(recoveredEtcdPods(), healthyStatefulSet()), + expectedReason: hyperv1.AsExpectedReason, + conditionExists: true, + }, + { + name: "When failed job exists and etcd is still unhealthy it should keep the failure condition", + conditions: []metav1.Condition{staleCondition()}, + objects: append(healthyEtcdPods(), unhealthyStatefulSet(), failedJob()), + expectedReason: hyperv1.EtcdRecoveryJobFailedReason, + conditionExists: true, + }, + { + name: "When failed job exists and etcd statefulset does not exist it should report failure", + conditions: []metav1.Condition{staleCondition()}, + objects: append(healthyEtcdPods(), failedJob()), + expectedReason: hyperv1.EtcdRecoveryJobFailedReason, + conditionExists: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewGomegaWithT(t) + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hc", + Namespace: "clusters", + }, + Spec: hyperv1.HostedClusterSpec{ + Etcd: hyperv1.EtcdSpec{ + ManagementType: hyperv1.Managed, + }, + ControllerAvailabilityPolicy: hyperv1.HighlyAvailable, + }, + Status: hyperv1.HostedClusterStatus{ + Conditions: tc.conditions, + }, + } + + objects := append([]crclient.Object{hcluster}, tc.objects...) + client := fake.NewClientBuilder(). + WithScheme(api.Scheme). + WithObjects(objects...). + WithStatusSubresource(hcluster). + Build() + + r := &HostedClusterReconciler{ + Client: client, + now: metav1.Now, + EnableEtcdRecovery: true, + } + + _, err := r.reconcileETCDMemberRecovery( + ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))), + hcluster, + upsert.New(false).CreateOrUpdate, + ) + g.Expect(err).ToNot(HaveOccurred()) + + updatedHC := &hyperv1.HostedCluster{} + g.Expect(client.Get(t.Context(), crclient.ObjectKeyFromObject(hcluster), updatedHC)).To(Succeed()) + + condition := meta.FindStatusCondition(updatedHC.Status.Conditions, string(hyperv1.EtcdRecoveryActive)) + if tc.conditionExists { + g.Expect(condition).ToNot(BeNil()) + g.Expect(condition.Reason).To(Equal(tc.expectedReason)) + } else { + g.Expect(condition).To(BeNil()) + } + if tc.expectJobDeleted { + job := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS) + err := client.Get(t.Context(), crclient.ObjectKeyFromObject(job), job) + g.Expect(errors2.IsNotFound(err)).To(BeTrue(), "expected failed recovery job to be deleted") + } + }) + } + + t.Run("When StatefulSet Get fails with transient error it should return the error", func(t *testing.T) { + g := NewGomegaWithT(t) + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hc", + Namespace: "clusters", + }, + Spec: hyperv1.HostedClusterSpec{ + Etcd: hyperv1.EtcdSpec{ + ManagementType: hyperv1.Managed, + }, + ControllerAvailabilityPolicy: hyperv1.HighlyAvailable, + }, + } + + objects := append([]crclient.Object{hcluster}, healthyEtcdPods()...) + client := fake.NewClientBuilder(). + WithScheme(api.Scheme). + WithObjects(objects...). + WithStatusSubresource(hcluster). + WithInterceptorFuncs(interceptor.Funcs{ + Get: func(ctx context.Context, client crclient.WithWatch, key crclient.ObjectKey, obj crclient.Object, opts ...crclient.GetOption) error { + if _, ok := obj.(*appsv1.StatefulSet); ok { + return fmt.Errorf("connection refused") + } + return client.Get(ctx, key, obj, opts...) + }, + }). + Build() + + r := &HostedClusterReconciler{ + Client: client, + now: metav1.Now, + EnableEtcdRecovery: true, + } + + _, err := r.reconcileETCDMemberRecovery( + ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))), + hcluster, + upsert.New(false).CreateOrUpdate, + ) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("failed to get etcd statefulset")) + }) + + t.Run("When failed job exists and StatefulSet Get fails with transient error it should return the error", func(t *testing.T) { + g := NewGomegaWithT(t) + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hc", + Namespace: "clusters", + }, + Spec: hyperv1.HostedClusterSpec{ + Etcd: hyperv1.EtcdSpec{ + ManagementType: hyperv1.Managed, + }, + ControllerAvailabilityPolicy: hyperv1.HighlyAvailable, + }, + Status: hyperv1.HostedClusterStatus{ + Conditions: []metav1.Condition{staleCondition()}, + }, + } + + objects := append([]crclient.Object{hcluster, failedJob()}, healthyEtcdPods()...) + client := fake.NewClientBuilder(). + WithScheme(api.Scheme). + WithObjects(objects...). + WithStatusSubresource(hcluster). + WithInterceptorFuncs(interceptor.Funcs{ + Get: func(ctx context.Context, client crclient.WithWatch, key crclient.ObjectKey, obj crclient.Object, opts ...crclient.GetOption) error { + if _, ok := obj.(*appsv1.StatefulSet); ok { + return fmt.Errorf("connection refused") + } + return client.Get(ctx, key, obj, opts...) + }, + }). + Build() + + r := &HostedClusterReconciler{ + Client: client, + now: metav1.Now, + EnableEtcdRecovery: true, + } + + _, err := r.reconcileETCDMemberRecovery( + ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))), + hcluster, + upsert.New(false).CreateOrUpdate, + ) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("failed to get etcd statefulset")) + }) +}