From a0f3033f73bffefea2eb3aef712271b6c0f4ea39 Mon Sep 17 00:00:00 2001 From: Poornima Singour Date: Mon, 25 May 2026 15:58:50 +0530 Subject: [PATCH 1/2] fix(cpo): delete terminated MCD pods and requeue to retry in-place upgrades When an in-place MCD upgrade pod terminates (Failed/Succeeded) but the node still needs an upgrade, the controller now deletes the terminated pod so a fresh one can be recreated on the next reconcile loop. A periodic requeue (upgradeRequeueInterval = 30s) ensures the controller re-evaluates nodes that still need upgrades rather than waiting for an external event. Additionally: - Extract deleteUpgradePodIfExists helper to reduce duplication across reconcileUpgradePods and deleteUpgradeManifests - Add test coverage for PodPending phase, multi-node mixed states, NotFound on Delete, RequeueAfter assertion, and Delete failure scenarios Co-Authored-By: Claude Opus 4.6 (1M context) --- .../inplaceupgrader/inplaceupgrader.go | 80 +- .../inplaceupgrader/inplaceupgrader_test.go | 685 ++++++++++++++++++ 2 files changed, 734 insertions(+), 31 deletions(-) diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go index 3ceb122e833e..e7bdc1de9157 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go @@ -58,6 +58,11 @@ const ( TokenSecretPayloadKey = "payload" TokenSecretReleaseKey = "release" TokenSecretReleaseVersionKey = "release-version" + + // upgradeRequeueInterval is how often the controller rechecks while an + // upgrade is in progress, closing the gap when a force-deleted pod's + // deletion event is missed. + upgradeRequeueInterval = 30 * time.Second ) type Reconciler struct { @@ -147,7 +152,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco return ctrl.Result{}, fmt.Errorf("token secret %s/%s is missing %q key", tokenSecret.Namespace, tokenSecret.Name, TokenSecretReleaseVersionKey) } - return ctrl.Result{}, r.reconcileInPlaceUpgrade(ctx, nodePoolUpgradeAPI, tokenSecret, mcoImage, releaseVersion) + if err := r.reconcileInPlaceUpgrade(ctx, nodePoolUpgradeAPI, tokenSecret, mcoImage, releaseVersion); err != nil { + return ctrl.Result{}, err + } + // Requeue periodically while an upgrade is in progress. The controller only + // watches Nodes and MachineSets, so if an upgrade pod is force-deleted the + // deletion event is missed and the replacement pod is never created. A + // periodic recheck closes that gap. + return ctrl.Result{RequeueAfter: upgradeRequeueInterval}, nil } type nodePoolUpgradeAPI struct { @@ -277,7 +289,7 @@ func (r *Reconciler) reconcileInPlaceUpgrade(ctx context.Context, nodePoolUpgrad err = r.reconcileUpgradePods(ctx, r.guestClusterClient, nodes, nodePoolUpgradeAPI.spec.poolRef.GetName(), mcoImage, nodePoolUpgradeAPI.proxy) if err != nil { - return fmt.Errorf("failed to delete idle upgrade pods: %w", err) + return fmt.Errorf("failed to reconcile upgrade pods: %w", err) } return nil } @@ -312,22 +324,11 @@ func (r *Reconciler) reconcileUpgradePods(ctx context.Context, hostedClusterClie pod := inPlaceUpgradePod(namespace.Name, node.Name) if node.Annotations[CurrentMachineConfigAnnotationKey] == node.Annotations[DesiredMachineConfigAnnotationKey] && - node.Annotations[DesiredDrainerAnnotationKey] == node.Annotations[LastAppliedDrainerAnnotationKey] { + node.Annotations[DesiredDrainerAnnotationKey] == node.Annotations[LastAppliedDrainerAnnotationKey] && + node.Annotations[MachineConfigDaemonStateAnnotationKey] == MachineConfigDaemonStateDone { // the node is updated and does not require a MCD running - if err := hostedClusterClient.Get(ctx, client.ObjectKeyFromObject(pod), pod); err != nil { - if apierrors.IsNotFound(err) { - continue - } - return fmt.Errorf("error getting upgrade MCD pod: %w", err) - } - if pod.DeletionTimestamp != nil { - continue - } - if err := hostedClusterClient.Delete(ctx, pod); err != nil { - if apierrors.IsNotFound(err) { - continue - } - return fmt.Errorf("error deleting upgrade MCD pod: %w", err) + if err := deleteUpgradePodIfExists(ctx, hostedClusterClient, pod); err != nil { + return err } log.Info("Deleted idle upgrade pod") } else { @@ -349,12 +350,41 @@ func (r *Reconciler) reconcileUpgradePods(ctx context.Context, hostedClusterClie } else { log.Info("create upgrade pod", "result", result) } + // A pod with RestartPolicy=OnFailure only reaches a terminal phase after kubelet has exhausted its restart attempts (e.g. eviction or node loss), so deleting it here does not interrupt an active retry. + } else if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed { + if pod.DeletionTimestamp != nil { + continue + } + log.Info("Detected terminated upgrade pod on node that still needs upgrade, deleting for retry", + "node", node.Name, "podPhase", pod.Status.Phase) + if err := hostedClusterClient.Delete(ctx, pod); err != nil { + if apierrors.IsNotFound(err) { + continue + } + return fmt.Errorf("error deleting terminated upgrade MCD pod for node %s: %w", node.Name, err) + } } } } return nil } +func deleteUpgradePodIfExists(ctx context.Context, c client.Client, pod *corev1.Pod) error { + if err := c.Get(ctx, client.ObjectKeyFromObject(pod), pod); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("error getting upgrade MCD pod %s: %w", pod.Name, err) + } + if pod.DeletionTimestamp != nil { + return nil + } + if err := c.Delete(ctx, pod); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("error deleting upgrade MCD pod %s: %w", pod.Name, err) + } + return nil +} + // getPayloadImage gets the specified image reference from the payload. func (r *Reconciler) getPayloadImage(ctx context.Context, imageName string) (string, error) { hcp := manifests.HostedControlPlane(r.hcpNamespace, r.hcpName) @@ -484,20 +514,8 @@ func deleteUpgradeManifests(ctx context.Context, hostedClusterClient client.Clie namespace := inPlaceUpgradeNamespace(poolName) for _, node := range nodes { pod := inPlaceUpgradePod(namespace.Name, node.Name) - if err := hostedClusterClient.Get(ctx, client.ObjectKeyFromObject(pod), pod); err != nil { - if apierrors.IsNotFound(err) { - continue - } - return fmt.Errorf("error getting upgrade MCD pod: %w", err) - } - if pod.DeletionTimestamp != nil { - continue - } - if err := hostedClusterClient.Delete(ctx, pod); err != nil { - if apierrors.IsNotFound(err) { - continue - } - return fmt.Errorf("error deleting upgrade MCD pod: %w", err) + if err := deleteUpgradePodIfExists(ctx, hostedClusterClient, pod); err != nil { + return err } } return nil diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader_test.go b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader_test.go index a2f28becfa44..fb472bff0e53 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader_test.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader_test.go @@ -1,23 +1,33 @@ package inplaceupgrader import ( + "context" + "fmt" "testing" + "time" . "github.com/onsi/gomega" hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/controllers/resources/manifests" + "github.com/openshift/hypershift/support/releaseinfo" "github.com/openshift/hypershift/support/upsert" configv1 "github.com/openshift/api/config/v1" + imageapi "github.com/openshift/api/image/v1" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes/scheme" "k8s.io/utils/ptr" capiv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" + "sigs.k8s.io/controller-runtime/pkg/reconcile" ) func TestGetNodesForMachineSet(t *testing.T) { @@ -590,6 +600,242 @@ func TestCreateUpgradePod(t *testing.T) { } } +func TestReconcileUpgradePods(t *testing.T) { + _ = capiv1.AddToScheme(scheme.Scheme) + + poolName := "test-pool" + mcoImage := "mco-image:latest" + namespace := inPlaceUpgradeNamespace(poolName) + targetConfig := "target-hash" + + testCases := []struct { + name string + node *corev1.Node + existingPod *corev1.Pod + expectPodDeleted bool + expectPodCreated bool + expectPodRetained bool + expectPodSkipped bool + }{ + { + name: "When MCD pod is in Succeeded phase on a node needing upgrade it should delete the terminated pod", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + }, + existingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodSucceeded, + }, + }, + expectPodDeleted: true, + }, + { + name: "When MCD pod is in Failed phase on a node needing upgrade it should delete the terminated pod", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + }, + existingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + }, + }, + expectPodDeleted: true, + }, + { + name: "When MCD pod is in Running phase on a node needing upgrade it should leave the pod alone", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + }, + existingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + }, + }, + expectPodRetained: true, + }, + { + name: "When no MCD pod exists on a node needing upgrade it should create a new pod", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + }, + existingPod: nil, + expectPodCreated: true, + }, + { + name: "When terminated MCD pod is already being deleted it should skip without error", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + }, + existingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + DeletionTimestamp: &metav1.Time{Time: time.Now()}, + Finalizers: []string{"test-finalizer"}, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + }, + }, + expectPodSkipped: true, + }, + { + name: "When MCD pod is in Succeeded phase on a fully updated node it should delete the idle pod", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: targetConfig, + DesiredMachineConfigAnnotationKey: targetConfig, + DesiredDrainerAnnotationKey: "uncordon-xxx", + LastAppliedDrainerAnnotationKey: "uncordon-xxx", + MachineConfigDaemonStateAnnotationKey: MachineConfigDaemonStateDone, + }, + }, + }, + existingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodSucceeded, + }, + }, + expectPodDeleted: true, + }, + { + name: "When MCD pod is in Pending phase on a node needing upgrade it should leave the pod alone", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + }, + existingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + }, + }, + expectPodRetained: true, + }, + { + name: "When node config matches but MCD state is not Done it should delete the terminated pod for retry", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: targetConfig, + DesiredMachineConfigAnnotationKey: targetConfig, + DesiredDrainerAnnotationKey: "uncordon-xxx", + LastAppliedDrainerAnnotationKey: "uncordon-xxx", + MachineConfigDaemonStateAnnotationKey: "Working", + }, + }, + }, + existingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + }, + }, + expectPodDeleted: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + var guestObjects []client.Object + if tc.existingPod != nil { + guestObjects = append(guestObjects, tc.existingPod) + } + guestClient := fake.NewClientBuilder().WithScheme(scheme.Scheme). + WithObjects(guestObjects...).Build() + + r := &Reconciler{ + CreateOrUpdateProvider: upsert.New(false), + } + + err := r.reconcileUpgradePods(t.Context(), guestClient, []*corev1.Node{tc.node}, poolName, mcoImage, nil) + g.Expect(err).ToNot(HaveOccurred()) + + pod := inPlaceUpgradePod(namespace.Name, tc.node.Name) + getErr := guestClient.Get(t.Context(), client.ObjectKeyFromObject(pod), pod) + + if tc.expectPodDeleted { + g.Expect(apierrors.IsNotFound(getErr)).To(BeTrue(), "expected pod to be deleted, got: %v", getErr) + } + if tc.expectPodCreated { + g.Expect(getErr).ToNot(HaveOccurred(), "expected pod to be created") + g.Expect(pod.Spec.Containers).To(HaveLen(1)) + g.Expect(pod.Spec.Containers[0].Image).To(Equal(mcoImage)) + } + if tc.expectPodRetained { + g.Expect(getErr).ToNot(HaveOccurred(), "expected pod to be retained") + g.Expect(pod.Status.Phase).To(Equal(tc.existingPod.Status.Phase)) + } + if tc.expectPodSkipped { + g.Expect(getErr).ToNot(HaveOccurred(), "expected pod to still exist (skip due to DeletionTimestamp)") + g.Expect(pod.DeletionTimestamp).ToNot(BeNil(), "expected DeletionTimestamp to remain from original object") + } + }) + } +} + func TestReconcileInPlaceUpgradeAnnotatesMachineWithNodePoolVersion(t *testing.T) { g := NewWithT(t) _ = capiv1.AddToScheme(scheme.Scheme) @@ -725,3 +971,442 @@ func TestReconcileInPlaceUpgradeAnnotatesMachineWithNodePoolVersion(t *testing.T g.Expect(err).ToNot(HaveOccurred()) g.Expect(updatedMachine.Annotations[hyperv1.NodePoolReleaseVersionAnnotation]).To(Equal(nodePoolVersion)) } + +func TestReconcileInPlaceUpgradeDegradedNodeErrorMessage(t *testing.T) { + _ = capiv1.AddToScheme(scheme.Scheme) + + targetConfigVersion := "target-hash" + currentConfigVersion := "current-hash" + degradedReason := "disk validation failed: node disk usage exceeds threshold" + + selector := map[string]string{"pool": "test"} + + testCases := []struct { + name string + nodeName string + mcdState string + mcdMessage string + expectError bool + }{ + { + name: "when a node is degraded it should include the node name in the error message", + nodeName: "degraded-node-xyz", + mcdState: MachineConfigDaemonStateDegraded, + mcdMessage: degradedReason, + expectError: true, + }, + { + name: "when a node is not degraded it should not return a degraded error", + nodeName: "healthy-node", + mcdState: MachineConfigDaemonStateDone, + mcdMessage: "", + expectError: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + annotations := map[string]string{ + nodePoolAnnotationTargetConfigVersion: targetConfigVersion, + nodePoolAnnotationCurrentConfigVersion: currentConfigVersion, + } + if tc.expectError { + annotations[nodePoolAnnotationUpgradeInProgressTrue] = "upgrade in progress" + } + + machineSet := &capiv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ms", + Namespace: "test-ns", + UID: "ms-uid", + Annotations: annotations, + }, + Spec: capiv1.MachineSetSpec{ + Selector: metav1.LabelSelector{MatchLabels: selector}, + }, + } + + machine := &capiv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machine", + Namespace: "test-ns", + Labels: selector, + OwnerReferences: []metav1.OwnerReference{ + { + Kind: "MachineSet", + Name: machineSet.Name, + UID: machineSet.UID, + Controller: ptr.To(true), + }, + }, + }, + Status: capiv1.MachineStatus{ + NodeRef: &corev1.ObjectReference{Name: tc.nodeName}, + }, + } + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: tc.nodeName, + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: currentConfigVersion, + DesiredMachineConfigAnnotationKey: targetConfigVersion, + MachineConfigDaemonStateAnnotationKey: tc.mcdState, + MachineConfigDaemonMessageAnnotationKey: tc.mcdMessage, + }, + }, + } + + tokenSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "token-test", + Namespace: "test-ns", + }, + Data: map[string][]byte{ + TokenSecretPayloadKey: []byte("payload"), + }, + } + + mgmtClient := fake.NewClientBuilder().WithScheme(scheme.Scheme). + WithObjects(machineSet, machine).Build() + guestClient := fake.NewClientBuilder().WithScheme(scheme.Scheme). + WithObjects(node).Build() + + r := &Reconciler{ + client: mgmtClient, + guestClusterClient: guestClient, + CreateOrUpdateProvider: upsert.New(false), + } + + upgradeAPI := &nodePoolUpgradeAPI{ + spec: struct { + targetConfigVersion string + poolRef *capiv1.MachineSet + }{ + targetConfigVersion: targetConfigVersion, + poolRef: machineSet, + }, + status: struct { + currentConfigVersion string + }{ + currentConfigVersion: currentConfigVersion, + }, + } + + err := r.reconcileInPlaceUpgrade(t.Context(), upgradeAPI, tokenSecret, "mco-image", "4.18.37") + + if tc.expectError { + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring(tc.nodeName), + "error message should contain the degraded node name") + g.Expect(err.Error()).To(ContainSubstring(tc.mcdMessage), + "error message should contain the MCD degraded reason") + + updatedMS := &capiv1.MachineSet{} + err = mgmtClient.Get(t.Context(), client.ObjectKeyFromObject(machineSet), updatedMS) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(updatedMS.Annotations[nodePoolAnnotationUpgradeInProgressFalse]).To( + ContainSubstring(tc.nodeName), + "MachineSet annotation should contain the degraded node name") + _, hasTrue := updatedMS.Annotations[nodePoolAnnotationUpgradeInProgressTrue] + g.Expect(hasTrue).To(BeFalse(), + "upgradeInProgressTrue annotation should be deleted on degradation") + } else { + g.Expect(err).ToNot(HaveOccurred()) + updatedMS := &capiv1.MachineSet{} + err = mgmtClient.Get(t.Context(), client.ObjectKeyFromObject(machineSet), updatedMS) + g.Expect(err).ToNot(HaveOccurred()) + _, hasDegraded := updatedMS.Annotations[nodePoolAnnotationUpgradeInProgressFalse] + g.Expect(hasDegraded).To(BeFalse(), + "degraded annotation should not be set for non-degraded nodes") + } + }) + } +} + +type fakeReleaseProvider struct { + image string +} + +func (f *fakeReleaseProvider) Lookup(_ context.Context, _ string, _ []byte) (*releaseinfo.ReleaseImage, error) { + return &releaseinfo.ReleaseImage{ + ImageStream: &imageapi.ImageStream{ + Spec: imageapi.ImageStreamSpec{ + Tags: []imageapi.TagReference{ + { + Name: MachineConfigOperatorImage, + From: &corev1.ObjectReference{Name: f.image}, + }, + }, + }, + }, + }, nil +} + +func TestReconcileReturnsRequeueAfterDuringUpgrade(t *testing.T) { + g := NewWithT(t) + _ = capiv1.AddToScheme(scheme.Scheme) + _ = hyperv1.AddToScheme(scheme.Scheme) + + hcpNamespace := "test-ns" + hcpName := "test-hcp" + targetConfig := "target-hash" + currentConfig := "current-hash" + selector := map[string]string{"pool": "test"} + + machineSet := &capiv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ms", + Namespace: hcpNamespace, + UID: "ms-uid", + Annotations: map[string]string{ + nodePoolAnnotationTargetConfigVersion: targetConfig, + nodePoolAnnotationCurrentConfigVersion: currentConfig, + }, + }, + Spec: capiv1.MachineSetSpec{ + Selector: metav1.LabelSelector{MatchLabels: selector}, + }, + } + + machine := &capiv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machine", + Namespace: hcpNamespace, + Labels: selector, + OwnerReferences: []metav1.OwnerReference{ + { + Kind: "MachineSet", + Name: machineSet.Name, + UID: machineSet.UID, + Controller: ptr.To(true), + }, + }, + }, + Status: capiv1.MachineStatus{ + NodeRef: &corev1.ObjectReference{Name: "test-node"}, + }, + } + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: currentConfig, + DesiredMachineConfigAnnotationKey: currentConfig, + }, + }, + } + + tokenSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("token-%s-%s", machineSet.Name, targetConfig), + Namespace: hcpNamespace, + }, + Data: map[string][]byte{ + TokenSecretPayloadKey: []byte("payload"), + TokenSecretReleaseVersionKey: []byte("4.18.0"), + }, + } + + hcp := manifests.HostedControlPlane(hcpNamespace, hcpName) + hcp.Spec.ReleaseImage = "release-image:latest" + + pullSecret := manifests.PullSecret(hcpNamespace) + pullSecret.Data = map[string][]byte{ + corev1.DockerConfigJsonKey: []byte(`{}`), + } + + mgmtClient := fake.NewClientBuilder().WithScheme(scheme.Scheme). + WithObjects(machineSet, machine, tokenSecret, hcp, pullSecret).Build() + guestClient := fake.NewClientBuilder().WithScheme(scheme.Scheme). + WithObjects(node).Build() + + r := &Reconciler{ + client: mgmtClient, + guestClusterClient: guestClient, + releaseProvider: &fakeReleaseProvider{image: "mco-image:latest"}, + hcpName: hcpName, + hcpNamespace: hcpNamespace, + CreateOrUpdateProvider: upsert.New(false), + } + + result, err := r.Reconcile(t.Context(), reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: machineSet.Name, + Namespace: machineSet.Namespace, + }, + }) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(Equal(upgradeRequeueInterval)) +} + +func TestReconcileUpgradePodsReturnsErrorWhenDeleteFails(t *testing.T) { + g := NewWithT(t) + _ = capiv1.AddToScheme(scheme.Scheme) + + poolName := "test-pool" + mcoImage := "mco-image:latest" + namespace := inPlaceUpgradeNamespace(poolName) + targetConfig := "target-hash" + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + } + + existingPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + }, + } + + deleteErr := fmt.Errorf("connection refused") + guestClient := fake.NewClientBuilder().WithScheme(scheme.Scheme). + WithObjects(existingPod). + WithInterceptorFuncs(interceptor.Funcs{ + Delete: func(_ context.Context, _ client.WithWatch, obj client.Object, _ ...client.DeleteOption) error { + if obj.GetName() == existingPod.Name { + return deleteErr + } + return nil + }, + }). + Build() + + r := &Reconciler{ + CreateOrUpdateProvider: upsert.New(false), + } + + err := r.reconcileUpgradePods(t.Context(), guestClient, []*corev1.Node{node}, poolName, mcoImage, nil) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("connection refused")) +} + +func TestReconcileUpgradePodsMultiNodeMixedStates(t *testing.T) { + g := NewWithT(t) + _ = capiv1.AddToScheme(scheme.Scheme) + + poolName := "test-pool" + mcoImage := "mco-image:latest" + namespace := inPlaceUpgradeNamespace(poolName) + targetConfig := "target-hash" + + node1 := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + } + node2 := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node2", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + } + + terminatedPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + }, + } + runningPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node2", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + }, + } + + guestClient := fake.NewClientBuilder().WithScheme(scheme.Scheme). + WithObjects(terminatedPod, runningPod).Build() + + r := &Reconciler{ + CreateOrUpdateProvider: upsert.New(false), + } + + err := r.reconcileUpgradePods(t.Context(), guestClient, []*corev1.Node{node1, node2}, poolName, mcoImage, nil) + g.Expect(err).ToNot(HaveOccurred()) + + pod1 := inPlaceUpgradePod(namespace.Name, node1.Name) + getErr1 := guestClient.Get(t.Context(), client.ObjectKeyFromObject(pod1), pod1) + g.Expect(apierrors.IsNotFound(getErr1)).To(BeTrue(), "expected terminated pod on node1 to be deleted") + + pod2 := inPlaceUpgradePod(namespace.Name, node2.Name) + getErr2 := guestClient.Get(t.Context(), client.ObjectKeyFromObject(pod2), pod2) + g.Expect(getErr2).ToNot(HaveOccurred(), "expected running pod on node2 to be retained") + g.Expect(pod2.Status.Phase).To(Equal(corev1.PodRunning)) +} + +func TestReconcileUpgradePodsDeleteNotFoundContinues(t *testing.T) { + g := NewWithT(t) + _ = capiv1.AddToScheme(scheme.Scheme) + + poolName := "test-pool" + mcoImage := "mco-image:latest" + namespace := inPlaceUpgradeNamespace(poolName) + targetConfig := "target-hash" + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Annotations: map[string]string{ + CurrentMachineConfigAnnotationKey: "old-hash", + DesiredMachineConfigAnnotationKey: targetConfig, + }, + }, + } + + existingPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace.Name, + Name: "machine-config-daemon-node1", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + }, + } + + guestClient := fake.NewClientBuilder().WithScheme(scheme.Scheme). + WithObjects(existingPod). + WithInterceptorFuncs(interceptor.Funcs{ + Delete: func(_ context.Context, _ client.WithWatch, obj client.Object, _ ...client.DeleteOption) error { + if obj.GetName() == existingPod.Name { + return apierrors.NewNotFound(corev1.Resource("pods"), existingPod.Name) + } + return nil + }, + }). + Build() + + r := &Reconciler{ + CreateOrUpdateProvider: upsert.New(false), + } + + err := r.reconcileUpgradePods(t.Context(), guestClient, []*corev1.Node{node}, poolName, mcoImage, nil) + g.Expect(err).ToNot(HaveOccurred(), "expected NotFound on Delete to be handled gracefully") +} From 0f1134ecddbfb042e2748c82333ee4a84030f16a Mon Sep 17 00:00:00 2001 From: Poornima Singour Date: Wed, 3 Jun 2026 16:43:06 +0530 Subject: [PATCH 2/2] refactor(cpo): use shared k8sutil.DeleteIfNeeded for upgrade pod cleanup Replace the local deleteUpgradePodIfExists helper with the shared k8sutil.DeleteIfNeeded utility to reduce duplication and improve consistency across the codebase. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../inplaceupgrader/inplaceupgrader.go | 24 ++++--------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go index e7bdc1de9157..0a80656a9187 100644 --- a/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go +++ b/control-plane-operator/hostedclusterconfigoperator/controllers/inplaceupgrader/inplaceupgrader.go @@ -9,6 +9,7 @@ import ( hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" "github.com/openshift/hypershift/control-plane-operator/hostedclusterconfigoperator/controllers/resources/manifests" "github.com/openshift/hypershift/support/globalconfig" + "github.com/openshift/hypershift/support/k8sutil" "github.com/openshift/hypershift/support/releaseinfo" "github.com/openshift/hypershift/support/upsert" @@ -327,10 +328,11 @@ func (r *Reconciler) reconcileUpgradePods(ctx context.Context, hostedClusterClie node.Annotations[DesiredDrainerAnnotationKey] == node.Annotations[LastAppliedDrainerAnnotationKey] && node.Annotations[MachineConfigDaemonStateAnnotationKey] == MachineConfigDaemonStateDone { // the node is updated and does not require a MCD running - if err := deleteUpgradePodIfExists(ctx, hostedClusterClient, pod); err != nil { + if existed, err := k8sutil.DeleteIfNeeded(ctx, hostedClusterClient, pod); err != nil { return err + } else if existed { + log.Info("Deleted idle upgrade pod") } - log.Info("Deleted idle upgrade pod") } else { if err := hostedClusterClient.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}, pod); err != nil { if !apierrors.IsNotFound(err) { @@ -369,22 +371,6 @@ func (r *Reconciler) reconcileUpgradePods(ctx context.Context, hostedClusterClie return nil } -func deleteUpgradePodIfExists(ctx context.Context, c client.Client, pod *corev1.Pod) error { - if err := c.Get(ctx, client.ObjectKeyFromObject(pod), pod); err != nil { - if apierrors.IsNotFound(err) { - return nil - } - return fmt.Errorf("error getting upgrade MCD pod %s: %w", pod.Name, err) - } - if pod.DeletionTimestamp != nil { - return nil - } - if err := c.Delete(ctx, pod); err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("error deleting upgrade MCD pod %s: %w", pod.Name, err) - } - return nil -} - // getPayloadImage gets the specified image reference from the payload. func (r *Reconciler) getPayloadImage(ctx context.Context, imageName string) (string, error) { hcp := manifests.HostedControlPlane(r.hcpNamespace, r.hcpName) @@ -514,7 +500,7 @@ func deleteUpgradeManifests(ctx context.Context, hostedClusterClient client.Clie namespace := inPlaceUpgradeNamespace(poolName) for _, node := range nodes { pod := inPlaceUpgradePod(namespace.Name, node.Name) - if err := deleteUpgradePodIfExists(ctx, hostedClusterClient, pod); err != nil { + if _, err := k8sutil.DeleteIfNeeded(ctx, hostedClusterClient, pod); err != nil { return err } }