From c5dce66d32c610b88ea3d0cf1e5c1eea1ffb0a60 Mon Sep 17 00:00:00 2001 From: Mulham Raee Date: Wed, 10 Jun 2026 12:36:56 +0200 Subject: [PATCH 1/3] refactor(hostedcluster): segregate reconcile loop into error-collecting blocks The reconcile() method executes ~50 sequential operations where every error causes an early return, short-circuiting all remaining work. An unrelated failure (e.g., missing SSH key secret) prevents critical operations like deploying the CPO or reconciling the HCP object. This refactoring: - Extracts 12 inline blocks into named methods - Groups operations into phased error-collecting blocks - Aggregates all errors with utilerrors.NewAggregate at the end - Introduce reconcileReport struct that classifies reconcile operations as critical (blocks Phase 8) or non-critical (error-collecting, never blocks). Replace the sequential error chain where any failure short-circuits the entire loop with structured error collection and blocking rules. After this change, failures in one phase no longer block unrelated phases. For example, a missing SSH key no longer prevents CPO deployment or HCP object creation. Includes the analysis document and integration tests that verify non-blocking behavior across phases. Co-Authored-By: Claude Opus 4.6 --- ...dcluster-reconcile-segregation-analysis.md | 116 ++ ...ostedcluster-reconcile-segregation-plan.md | 51 + .../hostedcluster/hostedcluster_controller.go | 1326 ++++++++++------- .../hostedcluster_controller_test.go | 392 +++++ .../internal/platform/aws/aws.go | 2 +- .../hostedcluster/reconcile_report.go | 169 +++ .../hostedcluster/reconcile_report_test.go | 355 +++++ hypershift-operator/main.go | 2 + support/config/constants.go | 1 + 9 files changed, 1848 insertions(+), 566 deletions(-) create mode 100644 docs/design/hostedcluster-reconcile-segregation-analysis.md create mode 100644 docs/design/hostedcluster-reconcile-segregation-plan.md create mode 100644 hypershift-operator/controllers/hostedcluster/reconcile_report.go create mode 100644 hypershift-operator/controllers/hostedcluster/reconcile_report_test.go diff --git a/docs/design/hostedcluster-reconcile-segregation-analysis.md b/docs/design/hostedcluster-reconcile-segregation-analysis.md new file mode 100644 index 000000000000..bf7047b15521 --- /dev/null +++ b/docs/design/hostedcluster-reconcile-segregation-analysis.md @@ -0,0 +1,116 @@ +# HostedCluster Controller Reconciliation Loop — Segregation Analysis + +## Problem Statement + +The `reconcile()` function in `hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go` +executes ~50 distinct operations in a strictly sequential chain where **every error +causes an early return**, short-circuiting the entire remaining loop. This means an unrelated failure +(e.g., missing SSH key secret) prevents critical operations like deploying the control plane operator +or reconciling the HostedControlPlane object. + +## Implemented Solution + +### Error Categorization + +Operations are classified into two categories using a `reconcileReport` struct +(`reconcile_report.go`): + +- **critical**: Failures block all downstream Phase 8 component operations. + Examples: PullSecretSync, PlatformCredentials, SecretEncryptionSync, CoreHCPChain. +- **nonCritical**: Failures are collected but never block other work. + Examples: SSHKeySync, AuditWebhookSync, MonitoringAndCLISecrets. + +### Phase Structure + +``` +Phase 0-4: Initialization, status conditions, deletion handling — unchanged +Phase 5: Prerequisites — finalizers, pause, validation gates — unchanged (fail-fast) + +Phase 6a: Critical sync (error-collecting within phase) + - PlatformCredentials + - PullSecretSync + - SecretEncryptionSync + +Phase 6b: Non-critical sync (error-collecting, never blocked) + - RestoredFromBackup (TODO: move to Phase 4) + - AuditWebhookSync + - SSHKeySync + - AdditionalTrustBundle + - ServiceAccountSigningKey + - UnmanagedEtcdMTLS + - ETCDMemberRecovery + - GlobalConfigSync + +Phase 7: Core HCP chain — sequential (HCP → InfraCR → CAPI Cluster) + Always runs regardless of Phase 6a failures. + +Phase 8: Component groups — blocked if any critical operation failed. + Uses executeOrBlock/executeOrBlockMulti to automatically skip + when hasCriticalFailure() is true: + - KubeconfigAndPasswordSync + - OperatorDeployments (CPO, CAPI Manager, CAPI Provider, Karpenter) + - RBACAndPolicies (Prometheus RBAC, PKI RBAC, Network Policies) + - PlatformOIDCAndCSI (KubeVirt CSI / AWS OIDC / Azure SecretProviderClass) + - MonitoringAndCLISecrets (CLI secrets, dashboard, SRE metrics, trusted CAs) +``` + +### Blocking Rules + +- **Phase 6a critical failure** → Phase 8 blocked (Phase 7 still runs) +- **Phase 7 failure / nil HCP** → Phase 8 blocked +- **Phase 6b non-critical failure** → never blocks anything + +### Error Aggregation + +When critical failures exist, `aggregate()` returns only critical errors plus a summary of +blocked operations — non-critical errors are suppressed since the user should fix the +critical issue first: + +``` +critical error: failed to get pull secret...; blocked operations: [KubeconfigAndPasswordSync, OperatorDeployments, RBACAndPolicies, PlatformOIDCAndCSI, MonitoringAndCLISecrets] +``` + +When no critical failures exist, all errors are returned as-is via `utilerrors.NewAggregate`. + +### Key Types + +```go +// reconcile_report.go + +type operationCategory int +const ( + critical operationCategory = iota + nonCritical +) + +type reconcileReport struct { + results []operationResult + requeueAfter *time.Duration +} +``` + +Key methods: +- `recordOp(name, category, err)` — record a single operation result +- `recordBlocked(name, category)` — record a skipped operation +- `recordErrors(name, category, []error)` — record a group operation +- `executeOrBlock(name, category, fn)` — run fn or record blocked if critical failure exists +- `executeOrBlockMulti(name, category, fn)` — same for multi-error functions +- `hasCriticalFailure()` — any critical op failed? +- `aggregate()` — final error for reconcile return value +- `conditionMessage()` — structured summary for logging + +## Impact Summary + +**Before**: An error in any of ~50 operations blocks all subsequent operations. Examples: +- SSH key secret missing → CPO never deployed → control plane never starts +- Monitoring dashboard error → karpenter operator never reconciled +- AWS resource tags error → HCP never updated + +**After**: Only genuinely dependent operations block each other. Examples: +- SSH key secret missing → SSH key not synced (reported), but HCP object still created + (Phase 7), CPO still deployed (Phase 8), CAPI still works +- Pull secret sync failure (critical) → Phase 8 components blocked with clear reporting + of what failed and what was skipped, but Phase 7 still runs +- Platform credential error (critical) → same blocking behavior, user sees exactly which + operations were skipped +- Audit webhook failure (non-critical) → reported, but everything else proceeds normally diff --git a/docs/design/hostedcluster-reconcile-segregation-plan.md b/docs/design/hostedcluster-reconcile-segregation-plan.md new file mode 100644 index 000000000000..56ed64fd8dae --- /dev/null +++ b/docs/design/hostedcluster-reconcile-segregation-plan.md @@ -0,0 +1,51 @@ +# HostedCluster Reconciliation Segregation — Implementation Plan + +Companion to: [hostedcluster-reconcile-segregation-analysis.md](./hostedcluster-reconcile-segregation-analysis.md) + +## Status: Implemented + +The reconcile loop has been restructured with categorized error handling. See the analysis +doc for the full design. + +## Files Changed + +| File | Action | +|------|--------| +| `hypershift-operator/controllers/hostedcluster/reconcile_report.go` | **Created** — report types and methods | +| `hypershift-operator/controllers/hostedcluster/reconcile_report_test.go` | **Created** — unit tests for report | +| `hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go` | **Modified** — replaced sequential error chain with report-based flow | +| `hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go` | **Modified** — added non-blocking behavior tests | + +## Operation Classification + +| Operation | Category | Rationale | +|-----------|----------|-----------| +| PlatformCredentials | critical | Platform infra operations fail without credentials | +| PullSecretSync | critical | Image pulls fail, pods crash-loop | +| SecretEncryptionSync | critical | Etcd data encryption fails | +| CoreHCPChain | critical | Produces HCP object needed by Phase 8 | +| RestoredFromBackup | nonCritical | Status-only (TODO: move to Phase 4) | +| AuditWebhookSync | nonCritical | Observability, not functional | +| SSHKeySync | nonCritical | Debug access, not operational | +| AdditionalTrustBundle | nonCritical | Custom CAs, not blocking | +| ServiceAccountSigningKey | nonCritical | Conditional, specific feature | +| UnmanagedEtcdMTLS | nonCritical | Only for unmanaged etcd | +| ETCDMemberRecovery | nonCritical | Recovery mechanism, retries naturally | +| GlobalConfigSync | nonCritical | Config propagation, eventual consistency | +| KubeconfigAndPasswordSync | nonCritical | Consumer convenience | +| OperatorDeployments | nonCritical | CPO/CAPI/Karpenter deployments | +| RBACAndPolicies | nonCritical | RBAC/network policies | +| PlatformOIDCAndCSI | nonCritical | Platform-specific (OIDC, CSI, SecretProviderClass) | +| MonitoringAndCLISecrets | nonCritical | Monitoring, CLI secrets, SRE config, trusted CAs | + +## Design Principles + +1. **Categorize, don't short-circuit** — Operations are classified as critical or non-critical. + Critical failures block downstream work; non-critical failures are collected and reported. +2. **Minimal structural change** — Extracted blocks into private methods on + `HostedClusterReconciler`. Individual operation semantics are preserved. +3. **Preserve legitimate gates** — Deletion handling, pause, and config validation gates still + halt the entire loop. These are intentional. +4. **Structured error reporting** — When critical failures exist, only critical errors are + surfaced with a list of blocked operations. Non-critical noise is suppressed. +5. **Preserve requeueAfter** — If any operation sets a requeueAfter, it is preserved. diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go index f7c2deac532f..34b124cfffe3 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go @@ -24,7 +24,6 @@ import ( "net/netip" "os" "reflect" - "sort" "strconv" "strings" "time" @@ -210,6 +209,8 @@ type HostedClusterReconciler struct { EnableEtcdRecovery bool + ReconcileLegacy bool + FeatureSet configv1.FeatureSet OpenShiftTrustedCAFilePath string @@ -394,6 +395,7 @@ func (r *HostedClusterReconciler) Reconcile(ctx context.Context, req ctrl.Reques //nolint:gocyclo func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Request, log logr.Logger, hcluster *hyperv1.HostedCluster) (ctrl.Result, error) { + // Phase 0: Initialization — get the HostedControlPlane and set up the control plane namespace. controlPlaneNamespace := manifests.HostedControlPlaneNamespaceObject(hcluster.Namespace, hcluster.Name) hcp := controlplaneoperator.HostedControlPlane(controlPlaneNamespace.Name, hcluster.Name) err := r.Client.Get(ctx, client.ObjectKeyFromObject(hcp), hcp) @@ -405,6 +407,9 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques } } + // Phase 1: Bubble up pre-deletion conditions from HCP. + // These conditions are set even during deletion so consumers have clear signals. + // Bubble up ValidIdentityProvider condition from the hostedControlPlane. // We set this condition even if the HC is being deleted. Otherwise, a hostedCluster with a conflicted identity provider // would fail to complete deletion forever with no clear signal for consumers. @@ -472,6 +477,7 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques } } + // Phase 2: Deletion handling — if the cluster is being deleted, clean up and return early. var hcDestroyGracePeriod time.Duration if gracePeriodString := hcluster.Annotations[hyperv1.HCDestroyGracePeriodAnnotation]; len(gracePeriodString) > 0 { @@ -481,7 +487,6 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques } } - // If deleted, clean up and return early. if !hcluster.DeletionTimestamp.IsZero() { // This new condition is necessary for OCM personnel to report any cloud dangling objects to the user. // The grace period is customizable using an annotation called HCDestroyGracePeriodAnnotation. It's a time.Duration annotation. @@ -601,7 +606,7 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, nil } - // Part zero: fix up conversion + // Phase 3: Fix up conversion and reconcile platform defaults. originalSpec := hcluster.Spec.DeepCopy() // Reconcile converted AWS roles. @@ -634,7 +639,7 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, r.Client.Update(ctx, hcluster) } - // Part one: update status + // Phase 4: Update status conditions and persist. // Set kubeconfig status { @@ -1290,7 +1295,7 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, fmt.Errorf("failed to update status: %w", err) } - // Part two: reconcile the state of the world + // Phase 5: Prerequisites — finalizers, pause check, validation gates, namespace, platform setup. // Ensure the cluster has a finalizer for cleanup and update right away. if !controllerutil.ContainsFinalizer(hcluster, HostedClusterFinalizer) { @@ -1324,10 +1329,6 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, err } - if err = r.reconcileCLISecrets(ctx, createOrUpdate, hcluster); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile the CLI secrets: %w", err) - } - // Set the infraID as Tag on all created AWS if err := r.reconcileAWSResourceTags(ctx, hcluster); err != nil { return ctrl.Result{}, err @@ -1437,510 +1438,781 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, err } - // Reconcile Platform specifics. - { - if err := p.ReconcileCredentials(ctx, r.Client, createOrUpdate, hcluster, controlPlaneNamespace.Name); err != nil { - meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{ - Type: string(hyperv1.PlatformCredentialsFound), - Status: metav1.ConditionFalse, - Reason: hyperv1.PlatformCredentialsNotFoundReason, - ObservedGeneration: hcluster.Generation, - Message: err.Error(), - }) - if statusErr := r.Client.Status().Update(ctx, hcluster); statusErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile platform credentials: %w, failed to update status: %w", err, statusErr) - } - return ctrl.Result{}, fmt.Errorf("failed to reconcile platform credentials: %w", err) - } - if !meta.IsStatusConditionTrue(hcluster.Status.Conditions, string(hyperv1.PlatformCredentialsFound)) { - meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{ - Type: string(hyperv1.PlatformCredentialsFound), - Status: metav1.ConditionTrue, - Reason: hyperv1.AsExpectedReason, - ObservedGeneration: hcluster.Generation, - Message: "Required platform credentials are found", - }) - if statusErr := r.Client.Status().Update(ctx, hcluster); statusErr != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile platform credentials: %w, failed to update status: %w", err, statusErr) - } - } + releaseImageVersion, err := semver.Parse(releaseImage.Version()) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to parse release image version: %w", err) } - // Set the HostedCluster restored from backup condition - { - if _, exists := hcluster.Annotations[hyperv1.HostedClusterRestoredFromBackupAnnotation]; exists { - freshCondition := &metav1.Condition{ - Type: string(hyperv1.HostedClusterRestoredFromBackup), - Reason: hyperv1.RecoveryFinishedReason, - Status: metav1.ConditionUnknown, - ObservedGeneration: hcluster.Generation, - } + // --- Categorized reconcile operations with structured error reporting --- + // + // Operations are classified as critical or nonCritical: + // - critical: failures block downstream component operations (Phase 8). + // - nonCritical: failures are collected but never block other work. + // + // Blocking rules: + // - Critical sync failures (Phase 6a) → Phase 8 components blocked. + // - Core HCP chain failure / nil HCP (Phase 7) → Phase 8 components blocked. + // - Non-critical sync (Phase 6b) → never blocked, always runs. - if hcp != nil { - hostedClusterRestoredFromBackupCondition := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.HostedClusterRestoredFromBackup)) - if hostedClusterRestoredFromBackupCondition != nil { - freshCondition = hostedClusterRestoredFromBackupCondition - } - } + report := &reconcileReport{legacy: r.ReconcileLegacy} - oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.HostedClusterRestoredFromBackup)) + // Phase 6a: Critical sync operations. + // These must succeed for downstream component deployments to function. + report.execute("PlatformCredentials", critical, func() error { + return r.reconcilePlatformCredentialsWithStatus(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name, p) + }) - // Preserve previous status if we can no longer determine the status - if oldCondition != nil && freshCondition.Status == metav1.ConditionUnknown { - freshCondition.Status = oldCondition.Status - } + report.execute("PullSecretSync", critical, func() error { + return r.reconcilePullSecretSync(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name) + }) - // If the condition is not set, or the status is different, set the condition - if oldCondition == nil || oldCondition.Status != freshCondition.Status { - freshCondition.ObservedGeneration = hcluster.Generation - } + report.execute("SecretEncryptionSync", critical, func() error { + return r.reconcileSecretEncryptionSync(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name, p) + }) - // If the condition is true, delete the hc annotation. It will be eventually bubbled down to the hcp. - if freshCondition.Status == metav1.ConditionTrue { - hclusterAnnotations := hcluster.GetAnnotations() - delete(hclusterAnnotations, hyperv1.HostedClusterRestoredFromBackupAnnotation) - hcluster.SetAnnotations(hclusterAnnotations) - if err := r.Client.Update(ctx, hcluster); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to remove annotations %v: %w", string(hyperv1.HostedClusterRestoredFromBackup), err) - } - } + // Phase 6b: Non-critical sync operations. + // These run independently and never block downstream work. + // TODO: move RestoredFromBackup to Phase 4 — it's status/annotation propagation, not a sync operation. + report.execute("RestoredFromBackup", nonCritical, func() error { + return r.reconcileRestoredFromBackupWithStatus(ctx, hcluster, hcp) + }) - // Persist status updates - meta.SetStatusCondition(&hcluster.Status.Conditions, *freshCondition) - if err := r.Client.Status().Update(ctx, hcluster); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update status %v: %w", string(hyperv1.HostedClusterRestoredFromBackup), err) - } - } + report.execute("AuditWebhookSync", nonCritical, func() error { + return r.reconcileAuditWebhookSync(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name) + }) + + report.execute("SSHKeySync", nonCritical, func() error { + return r.reconcileSSHKeySync(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name) + }) + + report.execute("AdditionalTrustBundle", nonCritical, func() error { + return r.reconcileAdditionalTrustBundle(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name) + }) + + if hcluster.Spec.ServiceAccountSigningKey != nil { + report.execute("ServiceAccountSigningKey", nonCritical, func() error { + return r.reconcileServiceAccountSigningKey(ctx, hcluster, controlPlaneNamespace.Name, createOrUpdate) + }) } - // Reconcile the HostedControlPlane pull secret by resolving the source secret - // reference from the HostedCluster and syncing the secret in the control plane namespace. - { - var src corev1.Secret - if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.PullSecret.Name}, &src); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get pull secret %s: %w", hcluster.Spec.PullSecret.Name, err) - } - if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) - } - dst := controlplaneoperator.PullSecret(controlPlaneNamespace.Name) - _, err = createOrUpdate(ctx, r.Client, dst, func() error { - srcData, srcHasData := src.Data[".dockerconfigjson"] - if !srcHasData { - return fmt.Errorf("hostedcluster pull secret %q must have a .dockerconfigjson key", src.Name) - } - dst.Type = corev1.SecretTypeDockerConfigJson - if dst.Data == nil { - dst.Data = map[string][]byte{} + report.execute("UnmanagedEtcdMTLS", nonCritical, func() error { + return r.reconcileUnmanagedEtcdMTLSSync(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name) + }) + + if r.EnableEtcdRecovery && + hcluster.Spec.Etcd.ManagementType == hyperv1.Managed && + hcluster.Spec.ControllerAvailabilityPolicy == hyperv1.HighlyAvailable { + report.execute("ETCDMemberRecovery", nonCritical, func() error { + requeueAfter, etcdErr := r.reconcileETCDMemberRecovery(ctx, hcluster, createOrUpdate) + if etcdErr != nil { + return fmt.Errorf("failed to perform etcd member recovery: %w", etcdErr) } - dst.Data[".dockerconfigjson"] = srcData + report.requeueAfter = requeueAfter return nil }) + } + + report.execute("GlobalConfigSync", nonCritical, func() error { + return r.reconcileGlobalConfigSync(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name) + }) + + // Phase 7: Core HCP chain — sequential (HCP → InfraCR → CAPI Cluster). + // Always runs regardless of Phase 6a failures (these are K8s resource operations). + report.execute("CoreHCPChain", critical, func() error { + var chainErr error + hcp, chainErr = r.reconcileCoreHCPChain(ctx, log, hcluster, hcp, createOrUpdate, + controlPlaneNamespace.Name, defaultToControlPlaneV2, p) + if chainErr != nil { + return chainErr + } + if hcp == nil { + return fmt.Errorf("HCP object is nil") + } + return nil + }) + + report.executeOrBlock("KubeconfigAndPasswordSync", func() error { + return r.reconcileKubeconfigAndPasswordSync(ctx, createOrUpdate, hcluster, hcp, cpoSupportsKASCustomKubeconfig) + }) + + report.executeOrBlock("OperatorDeployments", func() error { + return r.reconcileOperatorDeployments(ctx, createOrUpdate, hcluster, hcp, controlPlaneNamespace, p, + controlPlaneOperatorImage, utilitiesImage, + cpoHasUtilities, r.CertRotationScale, releaseImage, releaseImageVersion, releaseProvider) + }) + + report.executeOrBlock("RBACAndPolicies", func() error { + return r.reconcileRBACAndPolicies(ctx, log, createOrUpdate, hcluster, hcp, + controlPlanePKIOperatorSignsCSRs, controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel, + releaseImageVersion) + }) + + report.executeOrBlock("PlatformOIDCAndCSI", func() error { + return r.reconcilePlatformSpecific(ctx, log, hcluster, hcp, createOrUpdate) + }) + + report.executeOrBlock("MonitoringAndCLISecrets", func() error { + return r.reconcileAuxiliary(ctx, createOrUpdate, hcluster, hcp) + }) + + report.executeOrBlock("PublicEndpointExposed", func() error { + pubEndpointRequeue, err := r.reconcilePublicEndpointExposedCondition(ctx, hcluster) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile pull secret: %w", err) + return fmt.Errorf("failed to reconcile PublicEndpointExposed condition: %w", err) } - } + if pubEndpointRequeue != nil && (report.requeueAfter == nil || *pubEndpointRequeue < *report.requeueAfter) { + report.requeueAfter = pubEndpointRequeue + } + return nil + }) - // Reconcile the HostedControlPlane Secret Encryption Info - if hcluster.Spec.SecretEncryption != nil { - log.Info("Reconciling secret encryption configuration") - switch hcluster.Spec.SecretEncryption.Type { - case hyperv1.AESCBC: - if hcluster.Spec.SecretEncryption.AESCBC == nil || len(hcluster.Spec.SecretEncryption.AESCBC.ActiveKey.Name) == 0 { - log.Error(fmt.Errorf("aescbc metadata is nil"), "") - // don't return error here as reconciling won't fix input error - return ctrl.Result{}, nil - } - var src corev1.Secret - if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.SecretEncryption.AESCBC.ActiveKey.Name}, &src); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get active aescbc secret %s: %w", hcluster.Spec.SecretEncryption.AESCBC.ActiveKey.Name, err) - } - if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) - } - if _, ok := src.Data[hyperv1.AESCBCKeySecretKey]; !ok { - log.Error(fmt.Errorf("no key field %s specified for aescbc active key secret", hyperv1.AESCBCKeySecretKey), "") - // don't return error here as reconciling won't fix input error - return ctrl.Result{}, nil - } - hostedControlPlaneActiveKeySecret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: controlPlaneNamespace.Name, - Name: src.Name, - }, - } - _, err = createOrUpdate(ctx, r.Client, hostedControlPlaneActiveKeySecret, func() error { - if hostedControlPlaneActiveKeySecret.Data == nil { - hostedControlPlaneActiveKeySecret.Data = map[string][]byte{} - } - hostedControlPlaneActiveKeySecret.Data[hyperv1.AESCBCKeySecretKey] = src.Data[hyperv1.AESCBCKeySecretKey] - hostedControlPlaneActiveKeySecret.Type = corev1.SecretTypeOpaque - return nil - }) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling aescbc active key: %w", err) - } - if hcluster.Spec.SecretEncryption.AESCBC.BackupKey != nil && len(hcluster.Spec.SecretEncryption.AESCBC.BackupKey.Name) > 0 { - var src corev1.Secret - if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.SecretEncryption.AESCBC.BackupKey.Name}, &src); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get backup aescbc secret %s: %w", hcluster.Spec.SecretEncryption.AESCBC.BackupKey.Name, err) - } - if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) - } - if _, ok := src.Data[hyperv1.AESCBCKeySecretKey]; !ok { - log.Error(fmt.Errorf("no key field %s specified for aescbc backup key secret", hyperv1.AESCBCKeySecretKey), "") - // don't return error here as reconciling won't fix input error - return ctrl.Result{}, nil - } - hostedControlPlaneBackupKeySecret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: controlPlaneNamespace.Name, - Name: src.Name, - }, - } - _, err = createOrUpdate(ctx, r.Client, hostedControlPlaneBackupKeySecret, func() error { - if hostedControlPlaneBackupKeySecret.Data == nil { - hostedControlPlaneBackupKeySecret.Data = map[string][]byte{} - } - hostedControlPlaneBackupKeySecret.Data[hyperv1.AESCBCKeySecretKey] = src.Data[hyperv1.AESCBCKeySecretKey] - hostedControlPlaneBackupKeySecret.Type = corev1.SecretTypeOpaque - return nil - }) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling aescbc backup key: %w", err) - } - } - case hyperv1.KMS: - if hcluster.Spec.SecretEncryption.KMS == nil { - log.Error(fmt.Errorf("kms metadata nil"), "") - // don't return error here as reconciling won't fix input error - return ctrl.Result{}, nil - } - if err := p.ReconcileSecretEncryption(ctx, r.Client, createOrUpdate, - hcluster, - controlPlaneNamespace.Name); err != nil { - return ctrl.Result{}, err - } - default: - log.Error(fmt.Errorf("unsupported encryption type %s", hcluster.Spec.SecretEncryption.Type), "") - // don't return error here as reconciling won't fix input error - return ctrl.Result{}, nil + // Aggregate results and return. + result := ctrl.Result{} + if report.requeueAfter != nil { + result.RequeueAfter = *report.requeueAfter + } + if autoNodeProgressing { + autoNodeRequeue := 15 * time.Second + if result.RequeueAfter == 0 || autoNodeRequeue < result.RequeueAfter { + result.RequeueAfter = autoNodeRequeue } } - // Reconcile the HostedControlPlane audit webhook config if specified - // reference from the HostedCluster and syncing the secret in the control plane namespace. - { - if hcluster.Spec.AuditWebhook != nil && len(hcluster.Spec.AuditWebhook.Name) > 0 { - var src corev1.Secret - if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.AuditWebhook.Name}, &src); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get audit webhook config %s: %w", hcluster.Spec.AuditWebhook.Name, err) - } - if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) - } - configData, ok := src.Data[hyperv1.AuditWebhookKubeconfigKey] - if !ok { - return ctrl.Result{}, fmt.Errorf("audit webhook secret does not contain key %s", hyperv1.AuditWebhookKubeconfigKey) - } + if errs := report.allErrors(); len(errs) == 0 { + log.Info("successfully reconciled") + } else if msg := report.conditionMessage(); msg != "" { + log.Info("reconciliation completed with errors", "summary", msg) + } + return result, report.aggregate() +} - hostedControlPlaneAuditWebhookSecret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: controlPlaneNamespace.Name, - Name: src.Name, - }, - } - _, err = createOrUpdate(ctx, r.Client, hostedControlPlaneAuditWebhookSecret, func() error { - if hostedControlPlaneAuditWebhookSecret.Data == nil { - hostedControlPlaneAuditWebhookSecret.Data = map[string][]byte{} - } - hostedControlPlaneAuditWebhookSecret.Data[hyperv1.AuditWebhookKubeconfigKey] = configData - hostedControlPlaneAuditWebhookSecret.Type = corev1.SecretTypeOpaque - return nil - }) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling audit webhook secret: %w", err) - } +// reconcileCoreHCPChain reconciles the core HCP chain: reconcile the HCP object, +// CAPI Infra CR, AWS subnets, and CAPI Cluster. These operations are sequential +// because each depends on the previous. Returns the (potentially reassigned) HCP +// and any error. +func (r *HostedClusterReconciler) reconcileCoreHCPChain( + ctx context.Context, log logr.Logger, hcluster *hyperv1.HostedCluster, + hcp *hyperv1.HostedControlPlane, createOrUpdate upsert.CreateOrUpdateFN, + controlPlaneNamespace string, defaultToControlPlaneV2 bool, + p platform.Platform, +) (*hyperv1.HostedControlPlane, error) { + isAutoscalingNeeded, err := r.isAutoscalingNeeded(ctx, hcluster) + if err != nil { + return hcp, fmt.Errorf("failed to determine if autoscaler is needed: %w", err) + } + + isAWSNodeTerminationHandlerNeeded, err := r.isAWSNodeTerminationHandlerNeeded(ctx, hcluster) + if err != nil { + return hcp, fmt.Errorf("failed to determine if AWS node termination handler is needed: %w", err) + } + + hcp = controlplaneoperator.HostedControlPlane(controlPlaneNamespace, hcluster.Name) + _, err = createOrUpdate(ctx, r.Client, hcp, func() error { + return reconcileHostedControlPlane(hcp, hcluster, isAutoscalingNeeded, isAWSNodeTerminationHandlerNeeded, + annotationsForCertRenewal(log, + hcp, + shouldCheckForStaleCerts(hcluster, defaultToControlPlaneV2), + r.kasServingCertHashFromSecret(ctx, hcp), + r.kasServingCertHashFromEndpoint(ctx, kasHostAndPortFromHCP(hcp)))) + }) + if err != nil { + return hcp, fmt.Errorf("failed to reconcile hostedcontrolplane: %w", err) + } + + infraCR, err := p.ReconcileCAPIInfraCR(ctx, r.Client, createOrUpdate, + hcluster, + controlPlaneNamespace, + hcp.Status.ControlPlaneEndpoint) + if err != nil { + return hcp, fmt.Errorf("failed to reconcile CAPI infra CR: %w", err) + } + + // Reconcile the CAPI Cluster resource + // In the None platform case, there is no CAPI provider/resources so infraCR is nil + if infraCR != nil { + capiCluster := controlplaneoperator.CAPICluster(controlPlaneNamespace, hcluster.Spec.InfraID) + _, err = createOrUpdate(ctx, r.Client, capiCluster, func() error { + return reconcileCAPICluster(capiCluster, hcluster, hcp, infraCR) + }) + if err != nil { + return hcp, fmt.Errorf("failed to reconcile capi cluster: %w", err) } } - // Reconcile the HostedControlPlane SSH secret by resolving the source secret reference - // from the HostedCluster and syncing the secret in the control plane namespace. - if len(hcluster.Spec.SSHKey.Name) > 0 { - var src corev1.Secret - err = r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.Namespace, Name: hcluster.Spec.SSHKey.Name}, &src) + return hcp, nil +} + +// reconcileKubeconfigAndPasswordSync groups kubeconfig, custom kubeconfig, and kubeadmin +// password sync operations. Each runs independently and errors are collected. +func (r *HostedClusterReconciler) reconcileKubeconfigAndPasswordSync( + ctx context.Context, createOrUpdate upsert.CreateOrUpdateFN, + hcluster *hyperv1.HostedCluster, hcp *hyperv1.HostedControlPlane, + cpoSupportsKASCustomKubeconfig bool, +) error { + var errs []error + if err := r.reconcileKubeconfigSync(ctx, hcluster, hcp, createOrUpdate); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile kubeconfig: %w", err)) + } + if err := r.reconcileCustomKubeconfigSync(ctx, hcluster, hcp, createOrUpdate, cpoSupportsKASCustomKubeconfig); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile custom kubeconfig: %w", err)) + } + if err := r.reconcileKubeadminPasswordSync(ctx, hcluster, hcp, createOrUpdate); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile kubeadmin password: %w", err)) + } + return utilerrors.NewAggregate(errs) +} + +// reconcileOperatorDeployments groups CPO, CAPI manager/provider, and karpenter operator +// reconciliation. Each runs independently and errors are collected. +func (r *HostedClusterReconciler) reconcileOperatorDeployments(ctx context.Context, + createOrUpdate upsert.CreateOrUpdateFN, + hcluster *hyperv1.HostedCluster, hcp *hyperv1.HostedControlPlane, controlPlaneNamespace *corev1.Namespace, + p platform.Platform, + controlPlaneOperatorImage, utilitiesImage string, + cpoHasUtilities bool, certRotationScale time.Duration, + releaseImage *releaseinfo.ReleaseImage, + releaseImageVersion semver.Version, + releaseProvider releaseinfo.ProviderWithOpenShiftImageRegistryOverrides, +) error { + var errs []error + + securityContextUID := controlplanecomponent.DefaultSecurityContextUID + if r.SetDefaultSecurityContext { + var err error + securityContextUID, err = strconv.ParseInt(controlPlaneNamespace.Annotations[DefaultSecurityContextUIDAnnnotation], 10, 64) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get hostedcluster SSHKey secret %s: %w", hcluster.Spec.SSHKey.Name, err) + return fmt.Errorf("failed to parse SecurityContext UID: %w", err) } - if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + + imageProvider := imageprovider.New(releaseImage) + imageProvider.ComponentImages()["token-minter"] = utilitiesImage + imageProvider.ComponentImages()[podspec.AvailabilityProberImageName] = utilitiesImage + + cpContext := controlplanecomponent.ControlPlaneContext{ + Context: ctx, + Client: r.Client, + ApplyProvider: upsert.NewApplyProvider(r.EnableCIDebugOutput), + HCP: hcp, + SetDefaultSecurityContext: r.SetDefaultSecurityContext, + DefaultSecurityContextUID: securityContextUID, + EnableCIDebugOutput: r.EnableCIDebugOutput, + MetricsSet: r.MetricsSet, + ReleaseImageProvider: imageProvider, + OmitOwnerReference: true, + } + + if err := r.reconcileControlPlaneOperator(cpContext, createOrUpdate, hcluster, + controlPlaneOperatorImage, utilitiesImage, + cpoHasUtilities, certRotationScale, releaseImageVersion, releaseProvider); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile control plane operator: %w", err)) + } + if err := r.reconcileCAPIManager(cpContext, createOrUpdate, hcluster); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile CAPI manager: %w", err)) + } + if err := r.reconcileCAPIProvider(cpContext, hcluster, hcp, p); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile CAPI provider: %w", err)) + } + if err := r.reconcileKarpenterOperator(cpContext, hcluster, + r.HypershiftOperatorImage, controlPlaneOperatorImage); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile karpenter operator: %w", err)) + } + return utilerrors.NewAggregate(errs) +} + +// reconcileRBACAndPolicies groups prometheus RBAC, PKI RBAC, and network policies +// reconciliation. Each runs independently and errors are collected. +func (r *HostedClusterReconciler) reconcileRBACAndPolicies( + ctx context.Context, log logr.Logger, createOrUpdate upsert.CreateOrUpdateFN, + hcluster *hyperv1.HostedCluster, hcp *hyperv1.HostedControlPlane, + controlPlanePKIOperatorSignsCSRs, controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel bool, + releaseImageVersion semver.Version, +) error { + var errs []error + if r.EnableOCPClusterMonitoring { + if err := r.reconcileClusterPrometheusRBAC(ctx, createOrUpdate, hcp.Namespace); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile cluster prometheus RBAC: %w", err)) } - dest := controlplaneoperator.SSHKey(controlPlaneNamespace.Name) - _, err = createOrUpdate(ctx, r.Client, dest, func() error { - srcData, srcHasData := src.Data["id_rsa.pub"] - if !srcHasData { - return fmt.Errorf("hostedcluster SSHKey secret %q must have a id_rsa.pub key", src.Name) - } - dest.Type = corev1.SecretTypeOpaque - if dest.Data == nil { - dest.Data = map[string][]byte{} - } - dest.Data["id_rsa.pub"] = srcData - return nil + } + if _, pkiDisabled := hcp.Annotations[hyperv1.DisablePKIReconciliationAnnotation]; controlPlanePKIOperatorSignsCSRs && !pkiDisabled { + if err := r.reconcileControlPlanePKIOperatorRBAC(ctx, createOrUpdate, hcluster); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile PKI operator RBAC: %w", err)) + } + } + if err := r.reconcileNetworkPolicies(ctx, log, createOrUpdate, hcluster, hcp, + releaseImageVersion, controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile network policies: %w", err)) + } + return utilerrors.NewAggregate(errs) +} + +// reconcilePlatformCredentialsWithStatus reconciles platform credentials and updates +// the PlatformCredentialsFound status condition on the HostedCluster. +func (r *HostedClusterReconciler) reconcilePlatformCredentialsWithStatus( + ctx context.Context, hcluster *hyperv1.HostedCluster, createOrUpdate upsert.CreateOrUpdateFN, + controlPlaneNamespace string, p platform.Platform, +) error { + if err := p.ReconcileCredentials(ctx, r.Client, createOrUpdate, hcluster, controlPlaneNamespace); err != nil { + meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{ + Type: string(hyperv1.PlatformCredentialsFound), + Status: metav1.ConditionFalse, + Reason: hyperv1.PlatformCredentialsNotFoundReason, + ObservedGeneration: hcluster.Generation, + Message: err.Error(), }) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile controlplane SSHKey secret: %w", err) + if statusErr := r.Client.Status().Update(ctx, hcluster); statusErr != nil { + return fmt.Errorf("failed to reconcile platform credentials: %s, failed to update status: %w", err, statusErr) } + return fmt.Errorf("failed to reconcile platform credentials: %w", err) } + if !meta.IsStatusConditionTrue(hcluster.Status.Conditions, string(hyperv1.PlatformCredentialsFound)) { + meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{ + Type: string(hyperv1.PlatformCredentialsFound), + Status: metav1.ConditionTrue, + Reason: hyperv1.AsExpectedReason, + ObservedGeneration: hcluster.Generation, + Message: "Required platform credentials are found", + }) + if statusErr := r.Client.Status().Update(ctx, hcluster); statusErr != nil { + return fmt.Errorf("failed to update platform credentials status: %w", statusErr) + } + } + return nil +} - // Reconcile the HostedControlPlane AdditionalTrustBundle ConfigMap by resolving the source reference - // from the HostedCluster and syncing the CM in the control plane namespace. - if err := r.reconcileAdditionalTrustBundle(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name); err != nil { - return ctrl.Result{}, err +// reconcileRestoredFromBackupWithStatus reconciles the HostedClusterRestoredFromBackup +// condition by propagating it from the HCP and updating status. +func (r *HostedClusterReconciler) reconcileRestoredFromBackupWithStatus( + ctx context.Context, hcluster *hyperv1.HostedCluster, hcp *hyperv1.HostedControlPlane, +) error { + if _, exists := hcluster.Annotations[hyperv1.HostedClusterRestoredFromBackupAnnotation]; !exists { + return nil } - // Reconcile the service account signing key if set - if hcluster.Spec.ServiceAccountSigningKey != nil { - if err := r.reconcileServiceAccountSigningKey(ctx, hcluster, controlPlaneNamespace.Name, createOrUpdate); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile service account signing key: %w", err) + freshCondition := &metav1.Condition{ + Type: string(hyperv1.HostedClusterRestoredFromBackup), + Reason: hyperv1.RecoveryFinishedReason, + Status: metav1.ConditionUnknown, + ObservedGeneration: hcluster.Generation, + } + + if hcp != nil { + hostedClusterRestoredFromBackupCondition := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.HostedClusterRestoredFromBackup)) + if hostedClusterRestoredFromBackupCondition != nil { + freshCondition = hostedClusterRestoredFromBackupCondition } } - // Reconcile etcd client MTLS secret if the control plane is using an unmanaged etcd cluster - if hcluster.Spec.Etcd.ManagementType == hyperv1.Unmanaged { - unmanagedEtcdTLSClientSecret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: hcluster.GetNamespace(), - Name: hcluster.Spec.Etcd.Unmanaged.TLS.ClientSecret.Name, - }, + oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.HostedClusterRestoredFromBackup)) + + // Preserve previous status if we can no longer determine the status + if oldCondition != nil && freshCondition.Status == metav1.ConditionUnknown { + freshCondition.Status = oldCondition.Status + } + + // If the condition is not set, or the status is different, set the condition + if oldCondition == nil || oldCondition.Status != freshCondition.Status { + freshCondition.ObservedGeneration = hcluster.Generation + } + + // If the condition is true, delete the hc annotation. It will be eventually bubbled down to the hcp. + if freshCondition.Status == metav1.ConditionTrue { + hclusterAnnotations := hcluster.GetAnnotations() + delete(hclusterAnnotations, hyperv1.HostedClusterRestoredFromBackupAnnotation) + hcluster.SetAnnotations(hclusterAnnotations) + if err := r.Client.Update(ctx, hcluster); err != nil { + return fmt.Errorf("failed to remove annotations %v: %w", string(hyperv1.HostedClusterRestoredFromBackup), err) + } + } + + // Persist status updates + meta.SetStatusCondition(&hcluster.Status.Conditions, *freshCondition) + if err := r.Client.Status().Update(ctx, hcluster); err != nil { + return fmt.Errorf("failed to update status %v: %w", string(hyperv1.HostedClusterRestoredFromBackup), err) + } + return nil +} + +// reconcilePullSecretSync syncs the pull secret from the HostedCluster namespace +// to the control plane namespace. +func (r *HostedClusterReconciler) reconcilePullSecretSync( + ctx context.Context, hcluster *hyperv1.HostedCluster, createOrUpdate upsert.CreateOrUpdateFN, + controlPlaneNamespace string, +) error { + var src corev1.Secret + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.PullSecret.Name}, &src); err != nil { + return fmt.Errorf("failed to get pull secret %s: %w", hcluster.Spec.PullSecret.Name, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + dst := controlplaneoperator.PullSecret(controlPlaneNamespace) + _, err := createOrUpdate(ctx, r.Client, dst, func() error { + srcData, srcHasData := src.Data[".dockerconfigjson"] + if !srcHasData { + return fmt.Errorf("hostedcluster pull secret %q must have a .dockerconfigjson key", src.Name) + } + dst.Type = corev1.SecretTypeDockerConfigJson + if dst.Data == nil { + dst.Data = map[string][]byte{} + } + dst.Data[".dockerconfigjson"] = srcData + return nil + }) + return err +} + +// reconcileSecretEncryptionSync syncs secret encryption configuration from the +// HostedCluster to the control plane namespace. +func (r *HostedClusterReconciler) reconcileSecretEncryptionSync( + ctx context.Context, hcluster *hyperv1.HostedCluster, createOrUpdate upsert.CreateOrUpdateFN, + controlPlaneNamespace string, p platform.Platform, +) error { + if hcluster.Spec.SecretEncryption == nil { + return nil + } + log := ctrl.LoggerFrom(ctx) + log.Info("Reconciling secret encryption configuration") + switch hcluster.Spec.SecretEncryption.Type { + case hyperv1.AESCBC: + if hcluster.Spec.SecretEncryption.AESCBC == nil || len(hcluster.Spec.SecretEncryption.AESCBC.ActiveKey.Name) == 0 { + log.Error(fmt.Errorf("aescbc metadata is nil"), "") + // don't return error here as reconciling won't fix input error + return nil } - if err := r.Client.Get(ctx, client.ObjectKeyFromObject(unmanagedEtcdTLSClientSecret), unmanagedEtcdTLSClientSecret); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get unmanaged etcd tls secret: %w", err) + var src corev1.Secret + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.SecretEncryption.AESCBC.ActiveKey.Name}, &src); err != nil { + return fmt.Errorf("failed to get active aescbc secret %s: %w", hcluster.Spec.SecretEncryption.AESCBC.ActiveKey.Name, err) } - if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, unmanagedEtcdTLSClientSecret); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return fmt.Errorf("failed to set referenced resource annotation: %w", err) } - hostedControlPlaneEtcdClientSecret := &corev1.Secret{ + if _, ok := src.Data[hyperv1.AESCBCKeySecretKey]; !ok { + log.Error(fmt.Errorf("no key field %s specified for aescbc active key secret", hyperv1.AESCBCKeySecretKey), "") + // don't return error here as reconciling won't fix input error + return nil + } + hostedControlPlaneActiveKeySecret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ - Namespace: controlPlaneNamespace.Name, - Name: hcluster.Spec.Etcd.Unmanaged.TLS.ClientSecret.Name, + Namespace: controlPlaneNamespace, + Name: src.Name, }, } - if result, err := createOrUpdate(ctx, r.Client, hostedControlPlaneEtcdClientSecret, func() error { - if hostedControlPlaneEtcdClientSecret.Data == nil { - hostedControlPlaneEtcdClientSecret.Data = map[string][]byte{} + _, err := createOrUpdate(ctx, r.Client, hostedControlPlaneActiveKeySecret, func() error { + if hostedControlPlaneActiveKeySecret.Data == nil { + hostedControlPlaneActiveKeySecret.Data = map[string][]byte{} } - hostedControlPlaneEtcdClientSecret.Data = unmanagedEtcdTLSClientSecret.Data - hostedControlPlaneEtcdClientSecret.Type = corev1.SecretTypeOpaque + hostedControlPlaneActiveKeySecret.Data[hyperv1.AESCBCKeySecretKey] = src.Data[hyperv1.AESCBCKeySecretKey] + hostedControlPlaneActiveKeySecret.Type = corev1.SecretTypeOpaque return nil - }); err != nil { - return ctrl.Result{}, fmt.Errorf("failed reconciling etcd client secret: %w", err) - } else { - log.Info("reconciled etcd client mtls secret to control plane namespace", "result", result) - } - } - - // Reconcile the ETCD member recovery - var requeueAfter *time.Duration - if r.EnableEtcdRecovery && - hcluster.Spec.Etcd.ManagementType == hyperv1.Managed && - hcluster.Spec.ControllerAvailabilityPolicy == hyperv1.HighlyAvailable { - var err error - if requeueAfter, err = r.reconcileETCDMemberRecovery(ctx, hcluster, createOrUpdate); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to perform etcd member recovery: %w", err) + }) + if err != nil { + return fmt.Errorf("failed reconciling aescbc active key: %w", err) } - } - - // Reconcile global config related configmaps and secrets - { - if hcluster.Spec.Configuration != nil { - configMapRefs := configrefs.ConfigMapRefs(hcluster.Spec.Configuration) - for _, configMapRef := range configMapRefs { - sourceCM := &corev1.ConfigMap{} - if err := r.Get(ctx, client.ObjectKey{Namespace: hcluster.Namespace, Name: configMapRef}, sourceCM); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get referenced configmap %s/%s: %w", hcluster.Namespace, configMapRef, err) - } - if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, sourceCM); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) - } - destCM := &corev1.ConfigMap{} - destCM.Name = sourceCM.Name - destCM.Namespace = controlPlaneNamespace.Name - if _, err := createOrUpdate(ctx, r.Client, destCM, func() error { - destCM.Annotations = sourceCM.Annotations - destCM.Labels = sourceCM.Labels - destCM.Data = sourceCM.Data - destCM.BinaryData = sourceCM.BinaryData - destCM.Immutable = sourceCM.Immutable - return nil - }); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile referenced config map %s/%s: %w", destCM.Namespace, destCM.Name, err) - } + if hcluster.Spec.SecretEncryption.AESCBC.BackupKey != nil && len(hcluster.Spec.SecretEncryption.AESCBC.BackupKey.Name) > 0 { + var src corev1.Secret + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.SecretEncryption.AESCBC.BackupKey.Name}, &src); err != nil { + return fmt.Errorf("failed to get backup aescbc secret %s: %w", hcluster.Spec.SecretEncryption.AESCBC.BackupKey.Name, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + if _, ok := src.Data[hyperv1.AESCBCKeySecretKey]; !ok { + log.Error(fmt.Errorf("no key field %s specified for aescbc backup key secret", hyperv1.AESCBCKeySecretKey), "") + // don't return error here as reconciling won't fix input error + return nil + } + hostedControlPlaneBackupKeySecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace, + Name: src.Name, + }, } - secretRefs := configrefs.SecretRefs(hcluster.Spec.Configuration) - for _, secretRef := range secretRefs { - sourceSecret := &corev1.Secret{} - if err := r.Get(ctx, client.ObjectKey{Namespace: hcluster.Namespace, Name: secretRef}, sourceSecret); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get referenced secret %s/%s: %w", hcluster.Namespace, secretRef, err) - } - if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, sourceSecret); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) - } - if err := ensureHostedResourcesAreEmpty(ctx, r.Client, hcluster, sourceSecret); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to validate referenced secret %s/%s: %w", hcluster.Namespace, secretRef, err) - } - destSecret := &corev1.Secret{} - destSecret.Name = sourceSecret.Name - destSecret.Namespace = controlPlaneNamespace.Name - if _, err := createOrUpdate(ctx, r.Client, destSecret, func() error { - destSecret.Annotations = sourceSecret.Annotations - destSecret.Labels = sourceSecret.Labels - destSecret.Data = sourceSecret.Data - destSecret.Immutable = sourceSecret.Immutable - destSecret.Type = sourceSecret.Type - return nil - }); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile secret %s/%s: %w", destSecret.Namespace, destSecret.Name, err) + _, err = createOrUpdate(ctx, r.Client, hostedControlPlaneBackupKeySecret, func() error { + if hostedControlPlaneBackupKeySecret.Data == nil { + hostedControlPlaneBackupKeySecret.Data = map[string][]byte{} } + hostedControlPlaneBackupKeySecret.Data[hyperv1.AESCBCKeySecretKey] = src.Data[hyperv1.AESCBCKeySecretKey] + hostedControlPlaneBackupKeySecret.Type = corev1.SecretTypeOpaque + return nil + }) + if err != nil { + return fmt.Errorf("failed reconciling aescbc backup key: %w", err) } } + case hyperv1.KMS: + if hcluster.Spec.SecretEncryption.KMS == nil { + log.Error(fmt.Errorf("kms metadata nil"), "") + // don't return error here as reconciling won't fix input error + return nil + } + if err := p.ReconcileSecretEncryption(ctx, r.Client, createOrUpdate, + hcluster, + controlPlaneNamespace); err != nil { + return err + } + default: + log.Error(fmt.Errorf("unsupported encryption type %s", hcluster.Spec.SecretEncryption.Type), "") + // don't return error here as reconciling won't fix input error + return nil } + return nil +} - // Get release image version - var releaseImageVersion semver.Version - releaseImageVersion, err = semver.Parse(releaseImage.Version()) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to parse release image version: %w", err) +// reconcileAuditWebhookSync syncs the audit webhook secret from the HostedCluster +// namespace to the control plane namespace. +func (r *HostedClusterReconciler) reconcileAuditWebhookSync( + ctx context.Context, hcluster *hyperv1.HostedCluster, createOrUpdate upsert.CreateOrUpdateFN, + controlPlaneNamespace string, +) error { + if hcluster.Spec.AuditWebhook == nil || len(hcluster.Spec.AuditWebhook.Name) == 0 { + return nil } - - // Reconcile the HostedControlPlane - isAutoscalingNeeded, err := r.isAutoscalingNeeded(ctx, hcluster) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to determine if autoscaler is needed: %w", err) + var src corev1.Secret + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.AuditWebhook.Name}, &src); err != nil { + return fmt.Errorf("failed to get audit webhook config %s: %w", hcluster.Spec.AuditWebhook.Name, err) } - isAWSNodeTerminationHandlerNeeded, err := r.isAWSNodeTerminationHandlerNeeded(ctx, hcluster) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to determine if AWS node termination handler is needed: %w", err) + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return fmt.Errorf("failed to set referenced resource annotation: %w", err) } - hcp = controlplaneoperator.HostedControlPlane(controlPlaneNamespace.Name, hcluster.Name) - _, err = createOrUpdate(ctx, r.Client, hcp, func() error { - return reconcileHostedControlPlane(hcp, hcluster, isAutoscalingNeeded, isAWSNodeTerminationHandlerNeeded, - annotationsForCertRenewal(log, - hcp, - shouldCheckForStaleCerts(hcluster, defaultToControlPlaneV2), - r.kasServingCertHashFromSecret(ctx, hcp), - r.kasServingCertHashFromEndpoint(ctx, kasHostAndPortFromHCP(hcp)))) - }) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile hostedcontrolplane: %w", err) + configData, ok := src.Data[hyperv1.AuditWebhookKubeconfigKey] + if !ok { + return fmt.Errorf("audit webhook secret does not contain key %s", hyperv1.AuditWebhookKubeconfigKey) } - // Reconcile CAPI Infra CR. - infraCR, err := p.ReconcileCAPIInfraCR(ctx, r.Client, createOrUpdate, - hcluster, - controlPlaneNamespace.Name, - hcp.Status.ControlPlaneEndpoint) - if err != nil { - return ctrl.Result{}, err + hostedControlPlaneAuditWebhookSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace, + Name: src.Name, + }, } + _, err := createOrUpdate(ctx, r.Client, hostedControlPlaneAuditWebhookSecret, func() error { + if hostedControlPlaneAuditWebhookSecret.Data == nil { + hostedControlPlaneAuditWebhookSecret.Data = map[string][]byte{} + } + hostedControlPlaneAuditWebhookSecret.Data[hyperv1.AuditWebhookKubeconfigKey] = configData + hostedControlPlaneAuditWebhookSecret.Type = corev1.SecretTypeOpaque + return nil + }) + return err +} - if err := r.reconcileAWSSubnets(ctx, createOrUpdate, infraCR, req.Namespace, req.Name, controlPlaneNamespace.Name); err != nil { - return ctrl.Result{}, err +// reconcileSSHKeySync syncs the SSH key secret from the HostedCluster namespace +// to the control plane namespace. +func (r *HostedClusterReconciler) reconcileSSHKeySync( + ctx context.Context, hcluster *hyperv1.HostedCluster, createOrUpdate upsert.CreateOrUpdateFN, + controlPlaneNamespace string, +) error { + if len(hcluster.Spec.SSHKey.Name) == 0 { + return nil } - - // Reconcile cluster prometheus RBAC resources if enabled - if r.EnableOCPClusterMonitoring { - if err := r.reconcileClusterPrometheusRBAC(ctx, createOrUpdate, hcp.Namespace); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile RBAC for OCP cluster prometheus: %w", err) - } + var src corev1.Secret + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.Namespace, Name: hcluster.Spec.SSHKey.Name}, &src); err != nil { + return fmt.Errorf("failed to get hostedcluster SSHKey secret %s: %w", hcluster.Spec.SSHKey.Name, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return fmt.Errorf("failed to set referenced resource annotation: %w", err) } + dest := controlplaneoperator.SSHKey(controlPlaneNamespace) + _, err := createOrUpdate(ctx, r.Client, dest, func() error { + srcData, srcHasData := src.Data["id_rsa.pub"] + if !srcHasData { + return fmt.Errorf("hostedcluster SSHKey secret %q must have a id_rsa.pub key", src.Name) + } + dest.Type = corev1.SecretTypeOpaque + if dest.Data == nil { + dest.Data = map[string][]byte{} + } + dest.Data["id_rsa.pub"] = srcData + return nil + }) + return err +} - // Reconcile the CAPI Cluster resource - // In the None platform case, there is no CAPI provider/resources so infraCR is nil - if infraCR != nil { - capiCluster := controlplaneoperator.CAPICluster(controlPlaneNamespace.Name, hcluster.Spec.InfraID) - _, err = createOrUpdate(ctx, r.Client, capiCluster, func() error { - return reconcileCAPICluster(capiCluster, hcluster, hcp, infraCR) - }) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile capi cluster: %w", err) +// reconcileUnmanagedEtcdMTLSSync syncs the unmanaged etcd client MTLS secret +// from the HostedCluster namespace to the control plane namespace. +func (r *HostedClusterReconciler) reconcileUnmanagedEtcdMTLSSync( + ctx context.Context, hcluster *hyperv1.HostedCluster, createOrUpdate upsert.CreateOrUpdateFN, + controlPlaneNamespace string, +) error { + if hcluster.Spec.Etcd.ManagementType != hyperv1.Unmanaged { + return nil + } + log := ctrl.LoggerFrom(ctx) + unmanagedEtcdTLSClientSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hcluster.GetNamespace(), + Name: hcluster.Spec.Etcd.Unmanaged.TLS.ClientSecret.Name, + }, + } + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(unmanagedEtcdTLSClientSecret), unmanagedEtcdTLSClientSecret); err != nil { + return fmt.Errorf("failed to get unmanaged etcd tls secret: %w", err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, unmanagedEtcdTLSClientSecret); err != nil { + return fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + hostedControlPlaneEtcdClientSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace, + Name: hcluster.Spec.Etcd.Unmanaged.TLS.ClientSecret.Name, + }, + } + if result, err := createOrUpdate(ctx, r.Client, hostedControlPlaneEtcdClientSecret, func() error { + if hostedControlPlaneEtcdClientSecret.Data == nil { + hostedControlPlaneEtcdClientSecret.Data = map[string][]byte{} } + hostedControlPlaneEtcdClientSecret.Data = unmanagedEtcdTLSClientSecret.Data + hostedControlPlaneEtcdClientSecret.Type = corev1.SecretTypeOpaque + return nil + }); err != nil { + return fmt.Errorf("failed reconciling etcd client secret: %w", err) + } else { + log.Info("reconciled etcd client mtls secret to control plane namespace", "result", result) } + return nil +} - // Reconcile the monitoring dashboard if configured - if r.MonitoringDashboards { - if err := r.reconcileMonitoringDashboard(ctx, createOrUpdate, hcluster); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile monitoring dashboard: %w", err) +// reconcileGlobalConfigSync syncs global configuration configmaps and secrets +// from the HostedCluster namespace to the control plane namespace. +func (r *HostedClusterReconciler) reconcileGlobalConfigSync( + ctx context.Context, hcluster *hyperv1.HostedCluster, createOrUpdate upsert.CreateOrUpdateFN, + controlPlaneNamespace string, +) error { + if hcluster.Spec.Configuration == nil { + return nil + } + configMapRefs := configrefs.ConfigMapRefs(hcluster.Spec.Configuration) + for _, configMapRef := range configMapRefs { + sourceCM := &corev1.ConfigMap{} + if err := r.Get(ctx, client.ObjectKey{Namespace: hcluster.Namespace, Name: configMapRef}, sourceCM); err != nil { + return fmt.Errorf("failed to get referenced configmap %s/%s: %w", hcluster.Namespace, configMapRef, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, sourceCM); err != nil { + return fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + destCM := &corev1.ConfigMap{} + destCM.Name = sourceCM.Name + destCM.Namespace = controlPlaneNamespace + if _, err := createOrUpdate(ctx, r.Client, destCM, func() error { + destCM.Annotations = sourceCM.Annotations + destCM.Labels = sourceCM.Labels + destCM.Data = sourceCM.Data + destCM.BinaryData = sourceCM.BinaryData + destCM.Immutable = sourceCM.Immutable + return nil + }); err != nil { + return fmt.Errorf("failed to reconcile referenced config map %s/%s: %w", destCM.Namespace, destCM.Name, err) + } + } + secretRefs := configrefs.SecretRefs(hcluster.Spec.Configuration) + for _, secretRef := range secretRefs { + sourceSecret := &corev1.Secret{} + if err := r.Get(ctx, client.ObjectKey{Namespace: hcluster.Namespace, Name: secretRef}, sourceSecret); err != nil { + return fmt.Errorf("failed to get referenced secret %s/%s: %w", hcluster.Namespace, secretRef, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, sourceSecret); err != nil { + return fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + if err := ensureHostedResourcesAreEmpty(ctx, r.Client, hcluster, sourceSecret); err != nil { + return fmt.Errorf("failed to validate referenced secret %s/%s: %w", hcluster.Namespace, secretRef, err) + } + destSecret := &corev1.Secret{} + destSecret.Name = sourceSecret.Name + destSecret.Namespace = controlPlaneNamespace + if _, err := createOrUpdate(ctx, r.Client, destSecret, func() error { + destSecret.Annotations = sourceSecret.Annotations + destSecret.Labels = sourceSecret.Labels + destSecret.Data = sourceSecret.Data + destSecret.Immutable = sourceSecret.Immutable + destSecret.Type = sourceSecret.Type + return nil + }); err != nil { + return fmt.Errorf("failed to reconcile secret %s/%s: %w", destSecret.Namespace, destSecret.Name, err) } } + return nil +} - // Reconcile the HostedControlPlane kubeconfig if one is reported - if hcp.Status.KubeConfig != nil { - src := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: hcp.Namespace, - Name: hcp.Status.KubeConfig.Name, - }, +// reconcileKubeconfigSync syncs the kubeconfig secret from the HCP namespace +// to the HostedCluster namespace. +func (r *HostedClusterReconciler) reconcileKubeconfigSync( + ctx context.Context, hcluster *hyperv1.HostedCluster, hcp *hyperv1.HostedControlPlane, + createOrUpdate upsert.CreateOrUpdateFN, +) error { + if hcp.Status.KubeConfig == nil { + return nil + } + src := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hcp.Namespace, + Name: hcp.Status.KubeConfig.Name, + }, + } + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(src), src); err != nil { + return fmt.Errorf("failed to get controlplane kubeconfig secret %q: %w", client.ObjectKeyFromObject(src), err) + } + dest := manifests.KubeConfigSecret(hcluster.Namespace, hcluster.Name) + _, err := createOrUpdate(ctx, r.Client, dest, func() error { + key := hcp.Status.KubeConfig.Key + srcData, srcHasData := src.Data[key] + if !srcHasData { + return fmt.Errorf("controlplane kubeconfig secret %q must have a %q key", client.ObjectKeyFromObject(src), key) } - err := r.Client.Get(ctx, client.ObjectKeyFromObject(src), src) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get controlplane kubeconfig secret %q: %w", client.ObjectKeyFromObject(src), err) - } - dest := manifests.KubeConfigSecret(hcluster.Namespace, hcluster.Name) - _, err = createOrUpdate(ctx, r.Client, dest, func() error { - key := hcp.Status.KubeConfig.Key - srcData, srcHasData := src.Data[key] - if !srcHasData { - return fmt.Errorf("controlplane kubeconfig secret %q must have a %q key", client.ObjectKeyFromObject(src), key) - } - dest.Labels = hcluster.Labels - dest.Type = corev1.SecretTypeOpaque - if dest.Data == nil { - dest.Data = map[string][]byte{} - } - dest.Data["kubeconfig"] = srcData - dest.SetOwnerReferences([]metav1.OwnerReference{{ - APIVersion: hyperv1.GroupVersion.String(), - Kind: "HostedCluster", - Name: hcluster.Name, - UID: hcluster.UID, - }}) - return nil - }) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile hostedcluster kubeconfig secret: %w", err) + dest.Labels = hcluster.Labels + dest.Type = corev1.SecretTypeOpaque + if dest.Data == nil { + dest.Data = map[string][]byte{} } - } + dest.Data["kubeconfig"] = srcData + dest.SetOwnerReferences([]metav1.OwnerReference{{ + APIVersion: hyperv1.GroupVersion.String(), + Kind: "HostedCluster", + Name: hcluster.Name, + UID: hcluster.UID, + }}) + return nil + }) + return err +} - if cpoSupportsKASCustomKubeconfig { - // Reconcile the HostedControlPlane external kubeconfig if one is reported - if len(hcp.Spec.KubeAPIServerDNSName) > 0 { - requeue, err := r.reconcileCustomExternalKubeconfig(ctx, createOrUpdate, hcp, hcluster) - if err != nil { - return ctrl.Result{}, err - } - if requeue != nil { - requeueAfter = requeue +// reconcileCustomKubeconfigSync syncs the custom external kubeconfig secret +// from the HCP namespace to the HostedCluster namespace, or deletes it if not needed. +func (r *HostedClusterReconciler) reconcileCustomKubeconfigSync( + ctx context.Context, hcluster *hyperv1.HostedCluster, hcp *hyperv1.HostedControlPlane, + createOrUpdate upsert.CreateOrUpdateFN, cpoSupportsKASCustomKubeconfig bool, +) error { + if !cpoSupportsKASCustomKubeconfig { + return nil + } + if len(hcp.Spec.KubeAPIServerDNSName) > 0 { + _, err := r.reconcileCustomExternalKubeconfig(ctx, createOrUpdate, hcp, hcluster) + if err != nil { + return err + } + } else { + // Delete the custom external kubeconfig secret if it exists and the external name is not set + if hcluster.Status.CustomKubeconfig != nil { + customKubeconfig := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hcluster.Namespace, + Name: hcluster.Status.CustomKubeconfig.Name, + }, } - } else { - // Delete the custom external kubeconfig secret if it exists and the external name is not set - if hcluster.Status.CustomKubeconfig != nil { - customKubeconfig := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: hcluster.Namespace, - Name: hcluster.Status.CustomKubeconfig.Name, - }, - } - if _, err := k8sutil.DeleteIfNeeded(ctx, r.Client, customKubeconfig); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to delete custom external kubeconfig secret %q: %w", client.ObjectKeyFromObject(customKubeconfig), err) - } - hcluster.Status.CustomKubeconfig = nil + if _, err := k8sutil.DeleteIfNeeded(ctx, r.Client, customKubeconfig); err != nil { + return fmt.Errorf("failed to delete custom external kubeconfig secret %q: %w", client.ObjectKeyFromObject(customKubeconfig), err) } + hcluster.Status.CustomKubeconfig = nil } } + return nil +} - // Reconcile the HostedControlPlane kubeadminPassword +// reconcileKubeadminPasswordSync syncs the kubeadmin password secret from the HCP +// namespace to the HostedCluster namespace, or deletes it if not present in HCP. +func (r *HostedClusterReconciler) reconcileKubeadminPasswordSync( + ctx context.Context, hcluster *hyperv1.HostedCluster, hcp *hyperv1.HostedControlPlane, + createOrUpdate upsert.CreateOrUpdateFN, +) error { if hcp.Status.KubeadminPassword != nil { src := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ @@ -1948,12 +2220,11 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques Name: hcp.Status.KubeadminPassword.Name, }, } - err := r.Client.Get(ctx, client.ObjectKeyFromObject(src), src) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get controlplane kubeadmin password secret %q: %w", client.ObjectKeyFromObject(src), err) + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(src), src); err != nil { + return fmt.Errorf("failed to get controlplane kubeadmin password secret %q: %w", client.ObjectKeyFromObject(src), err) } dest := manifests.KubeadminPasswordSecret(hcluster.Namespace, hcluster.Name) - _, err = createOrUpdate(ctx, r.Client, dest, func() error { + _, err := createOrUpdate(ctx, r.Client, dest, func() error { dest.Type = corev1.SecretTypeOpaque dest.Data = map[string][]byte{} for k, v := range src.Data { @@ -1968,129 +2239,90 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques return nil }) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile hostedcluster kubeconfig secret: %w", err) + return fmt.Errorf("failed to reconcile hostedcluster kubeadmin password secret: %w", err) } } else { KubeadminPasswordSecret := manifests.KubeadminPasswordSecret(hcluster.Namespace, hcluster.Name) if err := r.Client.Get(ctx, client.ObjectKeyFromObject(KubeadminPasswordSecret), KubeadminPasswordSecret); err != nil { if !apierrors.IsNotFound(err) { - return ctrl.Result{}, fmt.Errorf("failed to get hostedcluster kubeadmin password secret %q: %w", client.ObjectKeyFromObject(KubeadminPasswordSecret), err) + return fmt.Errorf("failed to get hostedcluster kubeadmin password secret %q: %w", client.ObjectKeyFromObject(KubeadminPasswordSecret), err) } } else { if err := r.Client.Delete(ctx, KubeadminPasswordSecret); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to delete hostedcluster kubeadmin password secret %q: %w", client.ObjectKeyFromObject(KubeadminPasswordSecret), err) + return fmt.Errorf("failed to delete hostedcluster kubeadmin password secret %q: %w", client.ObjectKeyFromObject(KubeadminPasswordSecret), err) } } } + return nil +} - defaultIngressDomain, err := r.defaultIngressDomain(ctx) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to determine default ingress domain: %w", err) - } - - // Reconcile SRE metrics config - if err := r.reconcileSREMetricsConfig(ctx, createOrUpdate, hcp); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile SRE metrics config: %w", err) - } - - _, err = r.reconcileOpenShiftTrustedCAs(ctx, hcp) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile OpenShift trusted CAs: %w", err) +// reconcileAuxiliary groups monitoring, SRE metrics, trusted CAs, and CLI secrets +// reconciliation. Each runs independently and errors are collected. +func (r *HostedClusterReconciler) reconcileAuxiliary( + ctx context.Context, createOrUpdate upsert.CreateOrUpdateFN, + hcluster *hyperv1.HostedCluster, hcp *hyperv1.HostedControlPlane, +) error { + var errs []error + if err := r.reconcileCLISecrets(ctx, createOrUpdate, hcluster); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile CLI secrets: %w", err)) } - - imageProvider := imageprovider.New(releaseImage) - imageProvider.ComponentImages()["token-minter"] = utilitiesImage - imageProvider.ComponentImages()[podspec.AvailabilityProberImageName] = utilitiesImage - - securityContextUID := controlplanecomponent.DefaultSecurityContextUID - if r.SetDefaultSecurityContext { - securityContextUID, err = strconv.ParseInt(controlPlaneNamespace.Annotations[DefaultSecurityContextUIDAnnnotation], 10, 64) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to parse SecurityContext UID: %w", err) + if r.MonitoringDashboards { + if err := r.reconcileMonitoringDashboard(ctx, createOrUpdate, hcluster); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile monitoring dashboard: %w", err)) } } - cpContext := controlplanecomponent.ControlPlaneContext{ - Context: ctx, - Client: r.Client, - ApplyProvider: upsert.NewApplyProvider(r.EnableCIDebugOutput), - HCP: hcp, - SetDefaultSecurityContext: r.SetDefaultSecurityContext, - DefaultSecurityContextUID: securityContextUID, - EnableCIDebugOutput: r.EnableCIDebugOutput, - MetricsSet: r.MetricsSet, - ReleaseImageProvider: imageProvider, - OmitOwnerReference: true, - } - - // Reconcile the control plane operator - err = r.reconcileControlPlaneOperator(cpContext, createOrUpdate, hcluster, controlPlaneOperatorImage, utilitiesImage, defaultIngressDomain, cpoHasUtilities, r.CertRotationScale, releaseImageVersion, releaseProvider) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile control plane operator: %w", err) - } - - // Reconcile the CAPI manager components - err = r.reconcileCAPIManager(cpContext, createOrUpdate, hcluster) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile capi manager: %w", err) - } - - // Reconcile the CAPI provider components - if err = r.reconcileCAPIProvider(cpContext, hcluster, hcp, p); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile capi provider: %w", err) - } - - if _, pkiDisabled := hcp.Annotations[hyperv1.DisablePKIReconciliationAnnotation]; controlPlanePKIOperatorSignsCSRs && !pkiDisabled { - // Reconcile the control plane PKI operator RBAC - the CPO does not have rights to do this itself - err = r.reconcileControlPlanePKIOperatorRBAC(ctx, createOrUpdate, hcluster) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile control plane PKI operator RBAC: %w", err) - } + if err := r.reconcileSREMetricsConfig(ctx, createOrUpdate, hcp); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile SRE metrics config: %w", err)) } - - // Reconcile the network policies - if err = r.reconcileNetworkPolicies(ctx, log, createOrUpdate, hcluster, hcp, releaseImageVersion, controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile network policies: %w", err) + if _, err := r.reconcileOpenShiftTrustedCAs(ctx, hcp); err != nil { + errs = append(errs, fmt.Errorf("failed to reconcile OpenShift trusted CAs: %w", err)) } + return utilerrors.NewAggregate(errs) +} - // Reconcile platform specific items +// reconcilePlatformSpecific handles platform-specific reconciliation including +// KubeVirt CSI RBAC, AWS OIDC documents, and Azure SecretProviderClass resources. +func (r *HostedClusterReconciler) reconcilePlatformSpecific( + ctx context.Context, log logr.Logger, hcluster *hyperv1.HostedCluster, + hcp *hyperv1.HostedControlPlane, createOrUpdate upsert.CreateOrUpdateFN, +) error { switch hcluster.Spec.Platform.Type { case hyperv1.KubevirtPlatform: if hcluster.Spec.Platform.Kubevirt != nil && hcluster.Spec.Platform.Kubevirt.Credentials != nil { if err := r.Client.Status().Update(ctx, hcluster); err != nil { if apierrors.IsConflict(err) { - return ctrl.Result{Requeue: true}, nil + return fmt.Errorf("failed to update status after network policy RBAC check: %w", err) } - return ctrl.Result{}, fmt.Errorf("failed to update status after network policy RBAC check: %w", err) + return fmt.Errorf("failed to update status after network policy RBAC check: %w", err) } } - err = r.reconcileKubevirtCSIClusterRBAC(ctx, createOrUpdate, hcluster) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile kubevirt CSI cluster wide RBAC: %w", err) + if err := r.reconcileKubevirtCSIClusterRBAC(ctx, createOrUpdate, hcluster); err != nil { + return fmt.Errorf("failed to reconcile kubevirt CSI cluster wide RBAC: %w", err) } case hyperv1.AWSPlatform: if err := r.reconcileOIDCDocumentsWithStatus(ctx, hcluster, func() error { return r.reconcileAWSOIDCDocuments(ctx, log, hcluster, hcp) }); err != nil { - return ctrl.Result{}, err + return err } case hyperv1.AzurePlatform: if azureutil.IsAroHCPByHC(hcluster) { // Reconcile CPO SecretProviderClass CR cpoSecretProviderClass := cpomanifests.ManagedAzureSecretProviderClass(config.ManagedAzureCPOSecretProviderClassName, hcp.Namespace) - if _, err = createOrUpdate(ctx, r, cpoSecretProviderClass, func() error { + if _, err := createOrUpdate(ctx, r, cpoSecretProviderClass, func() error { secretproviderclass.ReconcileManagedAzureSecretProviderClass(cpoSecretProviderClass, hcp, hcp.Spec.Platform.Azure.AzureAuthenticationConfig.ManagedIdentities.ControlPlane.ControlPlaneOperator) return nil }); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile control plane operator secret provider class: %w", err) + return fmt.Errorf("failed to reconcile control plane operator secret provider class: %w", err) } // Reconcile CAPZ SecretProviderClass CR nodepoolMgmtSecretProviderClass := cpomanifests.ManagedAzureSecretProviderClass(config.ManagedAzureNodePoolMgmtSecretProviderClassName, hcp.Namespace) - if _, err = createOrUpdate(ctx, r, nodepoolMgmtSecretProviderClass, func() error { + if _, err := createOrUpdate(ctx, r, nodepoolMgmtSecretProviderClass, func() error { secretproviderclass.ReconcileManagedAzureSecretProviderClass(nodepoolMgmtSecretProviderClass, hcp, hcp.Spec.Platform.Azure.AzureAuthenticationConfig.ManagedIdentities.ControlPlane.NodePoolManagement) return nil }); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile nodepool management secret provider class: %w", err) + return fmt.Errorf("failed to reconcile nodepool management secret provider class: %w", err) } } @@ -2102,7 +2334,7 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques secretproviderclass.ReconcileManagedAzureSecretProviderClass(kmsSecretProviderClass, hcp, hcp.Spec.SecretEncryption.KMS.Azure.KMS) return nil }); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile KMS SecretProviderClass: %w", err) + return fmt.Errorf("failed to reconcile KMS SecretProviderClass: %w", err) } } } @@ -2110,32 +2342,10 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques if err := r.reconcileOIDCDocumentsWithStatus(ctx, hcluster, func() error { return r.reconcileGCPOIDCDocuments(ctx, log, hcluster, hcp) }); err != nil { - return ctrl.Result{}, err - } - } - - if err := r.reconcileKarpenterOperator(cpContext, hcluster, r.HypershiftOperatorImage, controlPlaneOperatorImage); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile karpenter operator: %w", err) - } - - if pubEndpointRequeue, err := r.reconcilePublicEndpointExposedCondition(ctx, hcluster); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile PublicEndpointExposed condition: %w", err) - } else if pubEndpointRequeue != nil && (requeueAfter == nil || *pubEndpointRequeue < *requeueAfter) { - requeueAfter = pubEndpointRequeue - } - - log.Info("successfully reconciled") - result := ctrl.Result{} - if requeueAfter != nil { - result.RequeueAfter = *requeueAfter - } - if autoNodeProgressing { - autoNodeRequeue := 15 * time.Second - if result.RequeueAfter == 0 || autoNodeRequeue < result.RequeueAfter { - result.RequeueAfter = autoNodeRequeue + return err } } - return result, nil + return nil } // reconcileCustomExternalKubeconfig copies the custom kubeconfig secret from the @@ -2641,13 +2851,18 @@ func (r *HostedClusterReconciler) reconcileCAPIProvider(cpContext controlplaneco // reconcileControlPlaneOperator orchestrates reconciliation of the control plane // operator components. -func (r *HostedClusterReconciler) reconcileControlPlaneOperator(cpContext controlplanecomponent.ControlPlaneContext, createOrUpdate upsert.CreateOrUpdateFN, hcluster *hyperv1.HostedCluster, controlPlaneOperatorImage, utilitiesImage, defaultIngressDomain string, cpoHasUtilities bool, certRotationScale time.Duration, releaseVersion semver.Version, releaseProvider releaseinfo.ProviderWithOpenShiftImageRegistryOverrides) error { +func (r *HostedClusterReconciler) reconcileControlPlaneOperator(cpContext controlplanecomponent.ControlPlaneContext, createOrUpdate upsert.CreateOrUpdateFN, hcluster *hyperv1.HostedCluster, controlPlaneOperatorImage, utilitiesImage string, cpoHasUtilities bool, certRotationScale time.Duration, releaseVersion semver.Version, releaseProvider releaseinfo.ProviderWithOpenShiftImageRegistryOverrides) error { controlPlaneNamespace := manifests.HostedControlPlaneNamespaceObject(hcluster.Namespace, hcluster.Name) err := r.Client.Get(cpContext, client.ObjectKeyFromObject(controlPlaneNamespace), controlPlaneNamespace) if err != nil { return fmt.Errorf("failed to get control plane namespace: %w", err) } + defaultIngressDomain, err := r.defaultIngressDomain(cpContext) + if err != nil { + return fmt.Errorf("failed to determine default ingress domain: %w", err) + } + // TODO: Remove this block after initial merge of this feature. It is not needed for latest CPO version if r.ManagementClusterCapabilities.Has(capabilities.CapabilityRoute) && releaseVersion.Major == 4 && releaseVersion.Minor <= 14 { controlPlaneOperatorServiceAccount := controlplaneoperator.OperatorServiceAccount(controlPlaneNamespace.Name) @@ -4646,25 +4861,6 @@ func (r *HostedClusterReconciler) reconcileAWSResourceTags(ctx context.Context, return nil } -func (r *HostedClusterReconciler) reconcileAWSSubnets(ctx context.Context, _ upsert.CreateOrUpdateFN, - _ client.Object, namespace, clusterName, _ string, -) error { - nodePools, err := listNodePools(ctx, r.Client, namespace, clusterName) - if err != nil { - return fmt.Errorf("failed to get nodePools by cluster name for cluster %q: %w", clusterName, err) - } - subnetIDs := []string{} - for _, nodePool := range nodePools { - if nodePool.Spec.Platform.AWS != nil && - nodePool.Spec.Platform.AWS.Subnet.ID != nil { - subnetIDs = append(subnetIDs, *nodePool.Spec.Platform.AWS.Subnet.ID) - } - } - // Sort for stable update detection (is this needed?) - sort.Strings(subnetIDs) - return nil -} - func (r *HostedClusterReconciler) lookupReleaseImage(ctx context.Context, hcluster *hyperv1.HostedCluster, releaseProvider releaseinfo.ProviderWithOpenShiftImageRegistryOverrides) (*releaseinfo.ReleaseImage, error) { pullSecretBytes, err := hyperutil.GetPullSecretBytes(ctx, r.Client, hcluster) if err != nil { diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go index 4554f0cbc5b5..803e1ccf4d3b 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go @@ -53,6 +53,7 @@ import ( appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" "k8s.io/apimachinery/pkg/api/equality" errors2 "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" @@ -7657,3 +7658,394 @@ func TestReconcileETCDMemberRecovery(t *testing.T) { g.Expect(err.Error()).To(ContainSubstring("failed to get etcd statefulset")) }) } + +func TestReconcileKubeconfigAndPasswordSync_WhenKubeconfigFails_ItShouldStillSyncKubeadminPassword(t *testing.T) { + const ( + hcName = "test-hc" + hcNamespace = "test-ns" + hcpNamespace = "test-hcp-ns" + ) + + // HCP references a kubeconfig secret that does NOT exist (will cause kubeconfig sync to fail) + // but has a kubeadmin password secret that DOES exist (should succeed despite kubeconfig failure) + kubeadminSrc := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hcpNamespace, + Name: "kubeadmin-password", + }, + Data: map[string][]byte{ + "password": []byte("test-password"), + }, + } + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: hcName, + Namespace: hcNamespace, + }, + } + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: hcName, + Namespace: hcpNamespace, + }, + Status: hyperv1.HostedControlPlaneStatus{ + KubeConfig: &hyperv1.KubeconfigSecretRef{ + Name: "nonexistent-kubeconfig", // this secret does NOT exist + Key: "kubeconfig", + }, + KubeadminPassword: &corev1.LocalObjectReference{ + Name: "kubeadmin-password", // this secret exists + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(api.Scheme). + WithObjects(hcluster, kubeadminSrc). + Build() + + r := &HostedClusterReconciler{ + Client: fakeClient, + } + createOrUpdate := upsert.New(false).CreateOrUpdate + + err := r.reconcileKubeconfigAndPasswordSync(t.Context(), createOrUpdate, hcluster, hcp, false) + + // Kubeconfig sync should have failed + if err == nil || !strings.Contains(err.Error(), "kubeconfig") { + t.Errorf("expected kubeconfig sync to fail, but got: %v", err) + } + + // Kubeadmin password sync should have succeeded — verify the destination secret was created + destSecret := &corev1.Secret{} + destKey := crclient.ObjectKey{ + Namespace: hcNamespace, + Name: fmt.Sprintf("%s-kubeadmin-password", hcName), + } + if err := fakeClient.Get(t.Context(), destKey, destSecret); err != nil { + t.Fatalf("kubeadmin password secret should have been synced despite kubeconfig failure, but got: %v", err) + } + if string(destSecret.Data["password"]) != "test-password" { + t.Errorf("expected kubeadmin password data to be synced, got: %v", destSecret.Data) + } +} + +func TestReconcileRBACAndPolicies_WhenPKIRBACFails_ItShouldStillCreatePrometheusRBAC(t *testing.T) { + const ( + hcName = "test-hc" + hcNamespace = "test-ns" + hcpNamespace = "test-hcp-ns" + ) + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: hcName, + Namespace: hcNamespace, + }, + Spec: hyperv1.HostedClusterSpec{ + Platform: hyperv1.PlatformSpec{ + Type: hyperv1.NonePlatform, + }, + InfraID: "test-infra", + }, + } + + hcp := &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: hcName, + Namespace: hcpNamespace, + }, + Spec: hyperv1.HostedControlPlaneSpec{ + Platform: hyperv1.PlatformSpec{ + Type: hyperv1.NonePlatform, + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(api.Scheme). + WithObjects(hcluster, hcp). + Build() + + r := &HostedClusterReconciler{ + Client: fakeClient, + EnableOCPClusterMonitoring: true, + ManagementClusterCapabilities: &fakecapabilities.FakeSupportAllCapabilities{}, + } + createOrUpdate := upsert.New(false).CreateOrUpdate + log := zap.New(zap.UseDevMode(true), zap.Level(zapcore.InfoLevel)) + releaseVersion := semver.MustParse("4.16.0") + + _ = r.reconcileRBACAndPolicies(t.Context(), log, createOrUpdate, hcluster, hcp, + true, // controlPlanePKIOperatorSignsCSRs — enables PKI RBAC + false, releaseVersion) + + // Regardless of PKI RBAC or network policies outcome, prometheus RBAC should have run. + // Verify the prometheus Role was created in the HCP namespace. + role := &rbacv1.Role{} + roleKey := crclient.ObjectKey{Namespace: hcpNamespace, Name: "openshift-prometheus"} + if err := fakeClient.Get(t.Context(), roleKey, role); err != nil { + t.Fatalf("prometheus RBAC Role should have been created despite other failures, but got: %v", err) + } + if len(role.Rules) == 0 { + t.Errorf("expected prometheus Role to have rules, got empty") + } + + binding := &rbacv1.RoleBinding{} + bindingKey := crclient.ObjectKey{Namespace: hcpNamespace, Name: "openshift-prometheus"} + if err := fakeClient.Get(t.Context(), bindingKey, binding); err != nil { + t.Fatalf("prometheus RBAC RoleBinding should have been created, but got: %v", err) + } +} + +// TestReconcileNonBlockingBehavior verifies that the error-collecting reconciliation +// pattern works correctly: a failure in one phase or operation should not block +// independent operations in other phases from running. +func TestReconcileNonBlockingBehavior(t *testing.T) { + tests := []struct { + name string + // mutateHC applies the fault injection to the HostedCluster before reconcile runs. + mutateHC func(t *testing.T, hc *hyperv1.HostedCluster, client crclient.Client) + // createOrUpdateOverride optionally overrides the createOrUpdate function to inject failures + // for specific resource types. If nil, the default upsert.New(false).CreateOrUpdate is used. + createOrUpdateOverride func(reconcile.Request) upsert.CreateOrUpdateFN + // expectedErrSubstring is checked against the aggregated error returned by reconcile. + expectedErrSubstring string + // verifyResources asserts that specific resources were still created despite the failure. + verifyResources func(t *testing.T, client crclient.Client, hcpNamespace, hcName string) + }{ + { + name: "When Phase 6 SSH key fails it should still create HCP and CPO deployment", + mutateHC: func(t *testing.T, hc *hyperv1.HostedCluster, client crclient.Client) { + hc.Spec.SSHKey = corev1.LocalObjectReference{Name: "nonexistent-ssh-key"} + }, + expectedErrSubstring: "SSHKey", + verifyResources: func(t *testing.T, client crclient.Client, hcpNamespace, hcName string) { + // Phase 7: HCP object + hcpObj := &hyperv1.HostedControlPlane{} + if err := client.Get(t.Context(), crclient.ObjectKey{Namespace: hcpNamespace, Name: hcName}, hcpObj); err != nil { + t.Fatalf("HCP object should have been created in Phase 7, but got: %v", err) + } + // Phase 8: CPO Deployment + cpoDeployment := &appsv1.Deployment{} + if err := client.Get(t.Context(), crclient.ObjectKey{Namespace: hcpNamespace, Name: "control-plane-operator"}, cpoDeployment); err != nil { + t.Fatalf("CPO Deployment should have been created in Phase 8, but got: %v", err) + } + // Phase 8: Prometheus RBAC + role := &rbacv1.Role{} + if err := client.Get(t.Context(), crclient.ObjectKey{Namespace: hcpNamespace, Name: "openshift-prometheus"}, role); err != nil { + t.Fatalf("prometheus RBAC Role should have been created in Phase 8, but got: %v", err) + } + }, + }, + { + name: "When Phase 6a secret encryption fails (critical) it should still create HCP but block Phase 8", + mutateHC: func(t *testing.T, hc *hyperv1.HostedCluster, client crclient.Client) { + hc.Spec.SecretEncryption = &hyperv1.SecretEncryptionSpec{ + Type: hyperv1.AESCBC, + AESCBC: &hyperv1.AESCBCSpec{ + ActiveKey: corev1.LocalObjectReference{Name: "nonexistent-aescbc-key"}, + }, + } + }, + expectedErrSubstring: "aescbc", + verifyResources: func(t *testing.T, client crclient.Client, hcpNamespace, hcName string) { + // Phase 7: HCP object should still be created (not blocked by Phase 6a) + hcpObj := &hyperv1.HostedControlPlane{} + if err := client.Get(t.Context(), crclient.ObjectKey{Namespace: hcpNamespace, Name: hcName}, hcpObj); err != nil { + t.Fatalf("HCP object should have been created in Phase 7, but got: %v", err) + } + // Phase 8: CPO Deployment should NOT exist (blocked by critical failure) + cpoDeployment := &appsv1.Deployment{} + err := client.Get(t.Context(), crclient.ObjectKey{Namespace: hcpNamespace, Name: "control-plane-operator"}, cpoDeployment) + if err == nil { + t.Fatal("CPO Deployment should NOT have been created (Phase 8 blocked by critical SecretEncryptionSync failure)") + } + }, + }, + { + name: "When Phase 6 audit webhook fails it should still sync SSH key and create HCP", + mutateHC: func(t *testing.T, hc *hyperv1.HostedCluster, client crclient.Client) { + // Provide a valid SSH key + hc.Spec.SSHKey = corev1.LocalObjectReference{Name: "test-ssh-key"} + sshKey := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hc.Namespace, + Name: "test-ssh-key", + }, + Data: map[string][]byte{ + "id_rsa.pub": []byte("ssh-rsa AAAA..."), + }, + } + if err := client.Create(t.Context(), sshKey); err != nil { + t.Fatalf("failed to create SSH key: %v", err) + } + // Reference a nonexistent audit webhook to trigger Phase 6 failure + hc.Spec.AuditWebhook = &corev1.LocalObjectReference{Name: "nonexistent-audit-webhook"} + }, + expectedErrSubstring: "audit webhook", + verifyResources: func(t *testing.T, client crclient.Client, hcpNamespace, hcName string) { + // Phase 6: SSH key should have been synced independently + sshKeyDst := controlplaneoperator.SSHKey(hcpNamespace) + if err := client.Get(t.Context(), crclient.ObjectKeyFromObject(sshKeyDst), sshKeyDst); err != nil { + t.Fatalf("SSH key should have been synced to HCP namespace, but got: %v", err) + } + // Phase 7: HCP object + hcpObj := &hyperv1.HostedControlPlane{} + if err := client.Get(t.Context(), crclient.ObjectKey{Namespace: hcpNamespace, Name: hcName}, hcpObj); err != nil { + t.Fatalf("HCP object should have been created in Phase 7, but got: %v", err) + } + // Phase 8: CPO Deployment + cpoDeployment := &appsv1.Deployment{} + if err := client.Get(t.Context(), crclient.ObjectKey{Namespace: hcpNamespace, Name: "control-plane-operator"}, cpoDeployment); err != nil { + t.Fatalf("CPO Deployment should have been created in Phase 8, but got: %v", err) + } + }, + }, + { + name: "When Phase 7 HCP creation fails (critical) it should block Phase 8", + mutateHC: func(t *testing.T, hc *hyperv1.HostedCluster, client crclient.Client) {}, + createOrUpdateOverride: func(req reconcile.Request) upsert.CreateOrUpdateFN { + delegate := upsert.New(false).CreateOrUpdate + return func(ctx context.Context, c crclient.Client, obj crclient.Object, f controllerutil.MutateFn) (controllerutil.OperationResult, error) { + if _, ok := obj.(*hyperv1.HostedControlPlane); ok { + return controllerutil.OperationResultNone, fmt.Errorf("injected HCP creation failure") + } + return delegate(ctx, c, obj, f) + } + }, + expectedErrSubstring: "injected HCP creation failure", + verifyResources: func(t *testing.T, client crclient.Client, hcpNamespace, hcName string) { + // Phase 8: CPO Deployment should NOT exist (blocked by critical CoreHCPChain failure) + cpoDeployment := &appsv1.Deployment{} + err := client.Get(t.Context(), crclient.ObjectKey{Namespace: hcpNamespace, Name: "control-plane-operator"}, cpoDeployment) + if err == nil { + t.Fatal("CPO Deployment should NOT have been created (Phase 8 blocked by critical CoreHCPChain failure)") + } + // Phase 8: Prometheus RBAC should NOT exist (blocked) + role := &rbacv1.Role{} + err = client.Get(t.Context(), crclient.ObjectKey{Namespace: hcpNamespace, Name: "openshift-prometheus"}, role) + if err == nil { + t.Fatal("prometheus RBAC Role should NOT have been created (Phase 8 blocked by critical CoreHCPChain failure)") + } + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + const ( + hcName = "test-hc" + hcNamespace = "test-ns" + ) + hcpNamespace := hcpmanifests.HostedControlPlaneNamespace(hcNamespace, hcName) + + mockCtrl := gomock.NewController(t) + mockReleaseProvider := releaseinfo.NewMockProviderWithOpenShiftImageRegistryOverrides(mockCtrl) + mockReleaseProvider.EXPECT(). + Lookup(gomock.Any(), gomock.Any(), gomock.Any()). + Return(testutils.InitReleaseImageOrDie("4.16.0"), nil).AnyTimes() + mockReleaseProvider.EXPECT().GetRegistryOverrides().Return(nil).AnyTimes() + mockReleaseProvider.EXPECT().GetOpenShiftImageRegistryOverrides().Return(nil).AnyTimes() + mockReleaseProvider.EXPECT().GetMirroredReleaseImage().Return("").AnyTimes() + + fakeMetadata := fakeimagemetadataprovider.FakeRegistryClientImageMetadataProvider{ + Result: &dockerv1client.DockerImageConfig{Architecture: "amd64"}, + } + + pullSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Namespace: hcNamespace, Name: "pull-secret"}, + Data: map[string][]byte{".dockerconfigjson": []byte(`{"auths":{}}`)}, + } + ingress := &configv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{Name: "cluster"}, + Spec: configv1.IngressSpec{Domain: "apps.test.example.com"}, + } + + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: hcName, + Namespace: hcNamespace, + Annotations: map[string]string{ + hyperv1.ControlPlaneOperatorImageAnnotation: "test-cpo-image", + hyperv1.ControlPlaneOperatorImageLabelsAnnotation: "fake-label=true", + hyperv1.SkipReleaseImageValidation: "true", + }, + }, + Spec: hyperv1.HostedClusterSpec{ + Platform: hyperv1.PlatformSpec{Type: hyperv1.NonePlatform}, + Release: hyperv1.Release{Image: "quay.io/openshift-release-dev/ocp-release:4.16.0-x86_64"}, + PullSecret: corev1.LocalObjectReference{Name: "pull-secret"}, + Etcd: hyperv1.EtcdSpec{ManagementType: hyperv1.Managed}, + InfraID: "test-infra", + Services: []hyperv1.ServicePublishingStrategyMapping{ + { + Service: hyperv1.APIServer, + ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{ + Type: hyperv1.Route, + Route: &hyperv1.RoutePublishingStrategy{Hostname: "api.test.example.com"}, + }, + }, + {Service: hyperv1.Ignition, ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{Type: hyperv1.Route}}, + {Service: hyperv1.Konnectivity, ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{Type: hyperv1.Route}}, + {Service: hyperv1.OAuthServer, ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{Type: hyperv1.Route}}, + }, + Networking: hyperv1.ClusterNetworking{ + ClusterNetwork: []hyperv1.ClusterNetworkEntry{{CIDR: *ipnet.MustParseCIDR("10.132.0.0/14")}}, + ServiceNetwork: []hyperv1.ServiceNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.31.0.0/16")}}, + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(api.Scheme). + WithObjects(hcluster, pullSecret, ingress). + WithStatusSubresource(hcluster). + Build() + + // Apply test-specific mutations (fault injection) + tc.mutateHC(t, hcluster, fakeClient) + if err := fakeClient.Update(t.Context(), hcluster); err != nil { + t.Fatalf("failed to update hcluster: %v", err) + } + + createOrUpdateFn := func(req reconcile.Request) upsert.CreateOrUpdateFN { + return upsert.New(false).CreateOrUpdate + } + if tc.createOrUpdateOverride != nil { + createOrUpdateFn = tc.createOrUpdateOverride + } + + r := &HostedClusterReconciler{ + Client: fakeClient, + Clock: clock.RealClock{}, + ManagementClusterCapabilities: &fakecapabilities.FakeSupportAllCapabilities{}, + EnableOCPClusterMonitoring: true, + CertRotationScale: 24 * time.Hour, + HypershiftOperatorImage: "test-hso-image", + RegistryProvider: fakeReleaseProvider{ + releaseProvider: mockReleaseProvider, + metadataProvider: fakeMetadata, + }, + createOrUpdate: createOrUpdateFn, + now: metav1.Now, + } + + req := ctrl.Request{NamespacedName: crclient.ObjectKeyFromObject(hcluster)} + log := zap.New(zap.UseDevMode(true), zap.Level(zapcore.InfoLevel)) + + _, err := r.reconcile(t.Context(), req, log, hcluster) + + if err == nil { + t.Fatalf("expected reconcile to return an error containing %q, but got nil", tc.expectedErrSubstring) + } + if !strings.Contains(err.Error(), tc.expectedErrSubstring) { + t.Errorf("expected aggregated error to contain %q, got: %v", tc.expectedErrSubstring, err) + } + + tc.verifyResources(t, fakeClient, hcpNamespace, hcName) + }) + } +} diff --git a/hypershift-operator/controllers/hostedcluster/internal/platform/aws/aws.go b/hypershift-operator/controllers/hostedcluster/internal/platform/aws/aws.go index ac588320f15e..f42affcf8081 100644 --- a/hypershift-operator/controllers/hostedcluster/internal/platform/aws/aws.go +++ b/hypershift-operator/controllers/hostedcluster/internal/platform/aws/aws.go @@ -100,7 +100,7 @@ func (p AWS) ReconcileCAPIInfraCR(ctx context.Context, c client.Client, createOr return awsCluster, nil } -func (p AWS) CAPIProviderDeploymentSpec(hcluster *hyperv1.HostedCluster, hcp *hyperv1.HostedControlPlane) (*appsv1.DeploymentSpec, error) { +func (p AWS) CAPIProviderDeploymentSpec(hcluster *hyperv1.HostedCluster, _ *hyperv1.HostedControlPlane) (*appsv1.DeploymentSpec, error) { providerImage := p.capiProviderImage if envImage := os.Getenv(images.AWSCAPIProviderEnvVar); len(envImage) > 0 { // Only override CAPA image with env var if payload version < 4.12 diff --git a/hypershift-operator/controllers/hostedcluster/reconcile_report.go b/hypershift-operator/controllers/hostedcluster/reconcile_report.go new file mode 100644 index 000000000000..5cf1d99730bb --- /dev/null +++ b/hypershift-operator/controllers/hostedcluster/reconcile_report.go @@ -0,0 +1,169 @@ +package hostedcluster + +import ( + "fmt" + "strings" + "time" + + utilerrors "k8s.io/apimachinery/pkg/util/errors" +) + +// operationCategory classifies a reconcile operation for error handling and blocking. +type operationCategory int + +const ( + // critical operations block all downstream component operations when they fail. + critical operationCategory = iota + // nonCritical operations collect errors but never block other work. + nonCritical +) + +// operationResult records the outcome of a single reconcile operation. +type operationResult struct { + name string + category operationCategory + err error + blocked bool +} + +// reconcileReport collects results from all reconcile operations and produces +// the final error aggregate, condition messages, and requeue signals. +type reconcileReport struct { + results []operationResult + requeueAfter *time.Duration + legacy bool +} + +// execute runs fn and records its result. +// In legacy mode, any prior failure blocks execution. +func (r *reconcileReport) execute(name string, cat operationCategory, fn func() error) { + if r.legacy && r.hasAnyFailure() { + r.results = append(r.results, operationResult{name: name, category: cat, blocked: true}) + return + } + r.results = append(r.results, operationResult{name: name, category: cat, err: fn()}) +} + +// executeOrBlock runs fn and records its result as nonCritical, or records +// a blocked entry if a prior operation should block downstream work. +// In fail-fast mode, any prior failure blocks; in error-collecting mode, +// only critical failures block. +func (r *reconcileReport) executeOrBlock(name string, fn func() error) { + if r.shouldBlock() { + r.results = append(r.results, operationResult{name: name, category: nonCritical, blocked: true}) + return + } + r.results = append(r.results, operationResult{name: name, category: nonCritical, err: fn()}) +} + +// hasCriticalFailure returns true if any critical operation has actually failed (not blocked). +func (r *reconcileReport) hasCriticalFailure() bool { + for _, res := range r.results { + if res.category == critical && res.err != nil && !res.blocked { + return true + } + } + return false +} + +// hasAnyFailure returns true if any operation has actually failed (not blocked). +func (r *reconcileReport) hasAnyFailure() bool { + for _, res := range r.results { + if res.err != nil && !res.blocked { + return true + } + } + return false +} + +// shouldBlock returns true if subsequent dependent operations should be skipped. +// In legacy mode, any failure blocks. Otherwise only critical failures block. +func (r *reconcileReport) shouldBlock() bool { + if r.legacy { + return r.hasAnyFailure() + } + return r.hasCriticalFailure() +} + +// criticalFailureNames returns the deduplicated names of critical operations that failed. +func (r *reconcileReport) criticalFailureNames() []string { + var names []string + seen := make(map[string]bool) + for _, res := range r.results { + if res.category == critical && res.err != nil && !res.blocked && !seen[res.name] { + names = append(names, res.name) + seen[res.name] = true + } + } + return names +} + +// blockedNames returns the deduplicated names of operations that were blocked. +func (r *reconcileReport) blockedNames() []string { + var names []string + seen := make(map[string]bool) + for _, res := range r.results { + if res.blocked && !seen[res.name] { + names = append(names, res.name) + seen[res.name] = true + } + } + return names +} + +// allErrors returns all non-nil errors from operations that actually ran. +// Blocked operations are excluded — they carry no diagnostic value beyond +// what conditionMessage() already reports. +func (r *reconcileReport) allErrors() []error { + var errs []error + for _, res := range r.results { + if res.err != nil && !res.blocked { + errs = append(errs, res.err) + } + } + return errs +} + +// aggregate returns the final error for the reconcile return value. +// When critical failures exist, only critical errors are returned with a +// summary of blocked operations — non-critical errors are suppressed since +// the user should fix the critical issue first. +// When no critical failures exist, all errors are returned as-is. +func (r *reconcileReport) aggregate() error { + if !r.hasCriticalFailure() { + return utilerrors.NewAggregate(r.allErrors()) + } + critErr := utilerrors.NewAggregate(r.criticalErrors()) + if blocked := r.blockedNames(); len(blocked) > 0 { + return fmt.Errorf("critical error: %w; blocked operations: [%s]", + critErr, strings.Join(blocked, ", ")) + } + return fmt.Errorf("critical error: %w", critErr) +} + +// criticalErrors returns errors only from critical operations that actually failed. +func (r *reconcileReport) criticalErrors() []error { + var errs []error + for _, res := range r.results { + if res.category == critical && res.err != nil && !res.blocked { + errs = append(errs, res.err) + } + } + return errs +} + +// conditionMessage builds a short summary for structured logging. +func (r *reconcileReport) conditionMessage() string { + criticalNames := r.criticalFailureNames() + if len(criticalNames) == 0 { + return "" + } + blockedOps := r.blockedNames() + if len(blockedOps) == 0 { + return fmt.Sprintf("critical failures: [%s]", + strings.Join(criticalNames, ", ")) + } + return fmt.Sprintf("critical failures: [%s]; blocked operations: [%s]", + strings.Join(criticalNames, ", "), + strings.Join(blockedOps, ", ")) +} diff --git a/hypershift-operator/controllers/hostedcluster/reconcile_report_test.go b/hypershift-operator/controllers/hostedcluster/reconcile_report_test.go new file mode 100644 index 000000000000..729deb29c224 --- /dev/null +++ b/hypershift-operator/controllers/hostedcluster/reconcile_report_test.go @@ -0,0 +1,355 @@ +package hostedcluster + +import ( + "fmt" + "testing" + "time" + + . "github.com/onsi/gomega" +) + +func TestReconcileReport(t *testing.T) { + t.Run("When no operations are recorded it should report no errors", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + + g.Expect(report.hasCriticalFailure()).To(BeFalse()) + g.Expect(report.allErrors()).To(BeEmpty()) + g.Expect(report.aggregate()).To(BeNil()) + g.Expect(report.conditionMessage()).To(BeEmpty()) + g.Expect(report.criticalFailureNames()).To(BeEmpty()) + g.Expect(report.blockedNames()).To(BeEmpty()) + }) + + t.Run("When all operations succeed it should report no errors", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { return nil }) + report.execute("SSHKeySync", nonCritical, func() error { return nil }) + + g.Expect(report.hasCriticalFailure()).To(BeFalse()) + g.Expect(report.allErrors()).To(BeEmpty()) + g.Expect(report.aggregate()).To(BeNil()) + g.Expect(report.conditionMessage()).To(BeEmpty()) + }) + + t.Run("When a critical operation fails it should report critical failure", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { + return fmt.Errorf("secret not found") + }) + report.execute("SSHKeySync", nonCritical, func() error { return nil }) + + g.Expect(report.hasCriticalFailure()).To(BeTrue()) + g.Expect(report.criticalFailureNames()).To(ConsistOf("PullSecretSync")) + g.Expect(report.allErrors()).To(HaveLen(1)) + g.Expect(report.aggregate()).To(HaveOccurred()) + }) + + t.Run("When a non-critical operation fails it should not report critical failure", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { return nil }) + report.execute("SSHKeySync", nonCritical, func() error { + return fmt.Errorf("key not found") + }) + + g.Expect(report.hasCriticalFailure()).To(BeFalse()) + g.Expect(report.criticalFailureNames()).To(BeEmpty()) + g.Expect(report.allErrors()).To(HaveLen(1)) + g.Expect(report.aggregate()).To(HaveOccurred()) + }) + + t.Run("When operations are blocked it should track blocked names", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { + return fmt.Errorf("secret not found") + }) + report.executeOrBlock("OperatorDeployments", func() error { return nil }) + report.executeOrBlock("RBACAndPolicies", func() error { return nil }) + + g.Expect(report.hasCriticalFailure()).To(BeTrue()) + g.Expect(report.blockedNames()).To(ConsistOf("OperatorDeployments", "RBACAndPolicies")) + g.Expect(report.allErrors()).To(HaveLen(1)) + g.Expect(report.aggregate()).To(HaveOccurred()) + g.Expect(report.aggregate().Error()).To(ContainSubstring("secret not found")) + g.Expect(report.aggregate().Error()).To(ContainSubstring("blocked operations: [OperatorDeployments, RBACAndPolicies]")) + }) + + t.Run("When blocked operations are recorded it should not count as critical failure", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { return nil }) + report.executeOrBlock("CoreHCPChain", func() error { return nil }) + + g.Expect(report.hasCriticalFailure()).To(BeFalse()) + g.Expect(report.blockedNames()).To(BeEmpty()) + }) + + t.Run("When multiple critical operations fail it should deduplicate names", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { + return fmt.Errorf("error 1") + }) + report.execute("PlatformCredentials", critical, func() error { + return fmt.Errorf("error 2") + }) + + g.Expect(report.criticalFailureNames()).To(ConsistOf("PullSecretSync", "PlatformCredentials")) + }) + + t.Run("When requeueAfter is set it should be preserved", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + d := 5 * time.Minute + report.requeueAfter = &d + + g.Expect(report.requeueAfter).ToNot(BeNil()) + g.Expect(*report.requeueAfter).To(Equal(5 * time.Minute)) + }) +} + +func TestLegacyMode(t *testing.T) { + t.Run("When a non-critical operation fails in legacy mode it should block subsequent execute calls", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{legacy: true} + report.execute("SSHKeySync", nonCritical, func() error { + return fmt.Errorf("key not found") + }) + executed := false + report.execute("GlobalConfigSync", nonCritical, func() error { + executed = true + return nil + }) + + g.Expect(executed).To(BeFalse()) + g.Expect(report.blockedNames()).To(ConsistOf("GlobalConfigSync")) + g.Expect(report.allErrors()).To(HaveLen(1)) + }) + + t.Run("When a non-critical operation fails in legacy mode it should block executeOrBlock calls", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{legacy: true} + report.execute("SSHKeySync", nonCritical, func() error { + return fmt.Errorf("key not found") + }) + executed := false + report.executeOrBlock("OperatorDeployments", func() error { + executed = true + return nil + }) + + g.Expect(executed).To(BeFalse()) + g.Expect(report.blockedNames()).To(ConsistOf("OperatorDeployments")) + }) + + t.Run("When in default mode it should allow non-critical errors without blocking execute", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("SSHKeySync", nonCritical, func() error { + return fmt.Errorf("key not found") + }) + executed := false + report.execute("GlobalConfigSync", nonCritical, func() error { + executed = true + return nil + }) + + g.Expect(executed).To(BeTrue()) + g.Expect(report.blockedNames()).To(BeEmpty()) + g.Expect(report.allErrors()).To(HaveLen(1)) + }) + + t.Run("When in default mode it should allow non-critical errors without blocking executeOrBlock", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("SSHKeySync", nonCritical, func() error { + return fmt.Errorf("key not found") + }) + executed := false + report.executeOrBlock("OperatorDeployments", func() error { + executed = true + return nil + }) + + g.Expect(executed).To(BeTrue()) + g.Expect(report.blockedNames()).To(BeEmpty()) + }) + + t.Run("When in default mode it should still block on critical failure", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { + return fmt.Errorf("secret not found") + }) + executed := false + report.executeOrBlock("OperatorDeployments", func() error { + executed = true + return nil + }) + + g.Expect(executed).To(BeFalse()) + g.Expect(report.blockedNames()).To(ConsistOf("OperatorDeployments")) + }) +} + +func TestExecuteOrBlock(t *testing.T) { + t.Run("When no critical failure exists it should execute the function", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + executed := false + report.executeOrBlock("Op", func() error { + executed = true + return nil + }) + + g.Expect(executed).To(BeTrue()) + g.Expect(report.blockedNames()).To(BeEmpty()) + }) + + t.Run("When a critical failure exists it should block and not execute the function", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("CritOp", critical, func() error { + return fmt.Errorf("critical failure") + }) + executed := false + report.executeOrBlock("Op", func() error { + executed = true + return nil + }) + + g.Expect(executed).To(BeFalse()) + g.Expect(report.blockedNames()).To(ConsistOf("Op")) + }) + + t.Run("When no critical failure exists it should record the error", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.executeOrBlock("Op", func() error { + return fmt.Errorf("err1") + }) + + g.Expect(report.allErrors()).To(HaveLen(1)) + g.Expect(report.blockedNames()).To(BeEmpty()) + }) + + t.Run("When a critical failure exists it should block and not execute", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("CritOp", critical, func() error { + return fmt.Errorf("critical failure") + }) + report.executeOrBlock("Op", func() error { + t.Fatal("should not be called") + return nil + }) + + g.Expect(report.blockedNames()).To(ConsistOf("Op")) + }) +} + +func TestConditionMessage(t *testing.T) { + t.Run("When only critical failures exist it should format critical failures", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { + return fmt.Errorf("not found") + }) + report.execute("PlatformCredentials", critical, func() error { + return fmt.Errorf("invalid") + }) + + msg := report.conditionMessage() + g.Expect(msg).To(Equal("critical failures: [PullSecretSync, PlatformCredentials]")) + }) + + t.Run("When only blocked operations exist (no critical failure) it should return empty", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.executeOrBlock("OperatorDeployments", func() error { return nil }) + report.executeOrBlock("Auxiliary", func() error { return nil }) + + g.Expect(report.conditionMessage()).To(BeEmpty()) + }) + + t.Run("When both critical failures and blocked operations exist it should format both", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { + return fmt.Errorf("not found") + }) + report.executeOrBlock("OperatorDeployments", func() error { return nil }) + report.executeOrBlock("Auxiliary", func() error { return nil }) + + msg := report.conditionMessage() + g.Expect(msg).To(Equal("critical failures: [PullSecretSync]; blocked operations: [OperatorDeployments, Auxiliary]")) + }) + + t.Run("When no failures or blocked operations exist it should return empty string", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { return nil }) + report.execute("SSHKeySync", nonCritical, func() error { return nil }) + + g.Expect(report.conditionMessage()).To(BeEmpty()) + }) + + t.Run("When only non-critical failures exist it should return empty string", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("SSHKeySync", nonCritical, func() error { + return fmt.Errorf("key not found") + }) + + g.Expect(report.conditionMessage()).To(BeEmpty()) + }) +} + +func TestAggregate(t *testing.T) { + t.Run("When critical failure exists it should return critical errors and blocked list", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { + return fmt.Errorf("pull secret not found") + }) + report.execute("SSHKeySync", nonCritical, func() error { + return fmt.Errorf("ssh key not found") + }) + report.executeOrBlock("OperatorDeployments", func() error { return nil }) + + err := report.aggregate() + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("pull secret not found")) + g.Expect(err.Error()).To(ContainSubstring("blocked operations: [OperatorDeployments]")) + g.Expect(err.Error()).NotTo(ContainSubstring("ssh key")) + }) + + t.Run("When no critical failure exists it should return all errors", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("SSHKeySync", nonCritical, func() error { + return fmt.Errorf("ssh key not found") + }) + report.execute("AuditWebhook", nonCritical, func() error { + return fmt.Errorf("webhook not found") + }) + + err := report.aggregate() + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("ssh key not found")) + g.Expect(err.Error()).To(ContainSubstring("webhook not found")) + }) + + t.Run("When no errors exist it should return nil", func(t *testing.T) { + g := NewWithT(t) + report := &reconcileReport{} + report.execute("PullSecretSync", critical, func() error { return nil }) + report.execute("SSHKeySync", nonCritical, func() error { return nil }) + + g.Expect(report.aggregate()).To(BeNil()) + }) +} diff --git a/hypershift-operator/main.go b/hypershift-operator/main.go index a663242bf0f8..a4ac68bbedc7 100644 --- a/hypershift-operator/main.go +++ b/hypershift-operator/main.go @@ -504,6 +504,7 @@ func setupHostedClusterController(ctx context.Context, mgr ctrl.Manager, opts *S monitoringDashboards := (os.Getenv("MONITORING_DASHBOARDS") == "1") enableCVOManagementClusterMetricsAccess := (os.Getenv(config.EnableCVOManagementClusterMetricsAccessEnvVar) == "1") enableEtcdRecovery := os.Getenv(config.EnableEtcdRecoveryEnvVar) == "1" + reconcileLegacy := os.Getenv(config.ReconcileLegacyEnvVar) == "1" certRotationScale, err := pkiconfig.GetCertRotationScale() if err != nil { @@ -526,6 +527,7 @@ func setupHostedClusterController(ctx context.Context, mgr ctrl.Manager, opts *S CertRotationScale: certRotationScale, EnableCVOManagementClusterMetricsAccess: enableCVOManagementClusterMetricsAccess, EnableEtcdRecovery: enableEtcdRecovery, + ReconcileLegacy: reconcileLegacy, FeatureSet: featuregate.FeatureSet(), OpenShiftTrustedCAFilePath: "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", } diff --git a/support/config/constants.go b/support/config/constants.go index 3eac77aee494..b5b257ffb15c 100644 --- a/support/config/constants.go +++ b/support/config/constants.go @@ -49,6 +49,7 @@ const ( CVOPrometheusURLEnvVar = "CVO_PROMETHEUS_URL" EnableEtcdRecoveryEnvVar = "ENABLE_ETCD_RECOVERY" + ReconcileLegacyEnvVar = "HYPERSHIFT_RECONCILE_LEGACY" AuditWebhookService = "audit-webhook" From c3900c4efeeec225338b7dc1a2937c291c646df5 Mon Sep 17 00:00:00 2001 From: Mulham Raee Date: Tue, 9 Jun 2026 18:31:28 +0200 Subject: [PATCH 2/3] refactor(hostedcluster): integrate pull-secret-unavailable fix into error-collection framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the ad-hoc early-return pull-secret recovery path (PR #8352) with the error-collection framework. Instead of inline HCP reconciliation when GetPullSecretBytes fails, the reconciliation now flows through the framework where PullSecretSync captures the error as critical and CoreHCPChain reconciles the HCP with full cert resolution. Key changes: - Move GetPullSecretBytes, CPO image/label resolution, and namespace reconciliation into a single report.execute("CPOImageAndNamespace") block. This prevents namespace PSA label downgrades when CPO labels are unavailable. - Make DetermineHostedClusterPayloadArch and lookupReleaseImage non-fatal so reconciliation continues to the framework. - Make cpoSupportsKASCustomKubeconfig status check unconditional — all supported CPO versions expose custom kubeconfig. - Wrap releaseImageVersion parsing in report.execute(critical) to block OperatorDeployments and RBACAndPolicies on failure instead of hard-returning. - Extract reconcileControlPlaneNamespace into its own method. - Update pull-secret-missing tests with valid fixtures (NonePlatform, Route, valid UUID) so reconciliation reaches the framework. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../hostedcluster/hostedcluster_controller.go | 351 +++++++++--------- .../hostedcluster_controller_test.go | 200 +++++----- support/util/util.go | 18 +- 3 files changed, 300 insertions(+), 269 deletions(-) diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go index 34b124cfffe3..a15b34580bd5 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go @@ -662,56 +662,16 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques releaseProvider := r.RegistryProvider.GetReleaseProvider() registryClientImageMetadataProvider := r.RegistryProvider.GetMetadataProvider() - pullSecretBytes, err := hyperutil.GetPullSecretBytes(ctx, r.Client, hcluster) - if err != nil { - log.Error(err, "failed to get pull secret") - // Ensure even with a bad pull secret we process pod placement decisions - // before returning the pull secret error for requeue. - hcp = controlplaneoperator.HostedControlPlane(controlPlaneNamespace.Name, hcluster.Name) - isAutoscalingNeeded, autoscaleErr := r.isAutoscalingNeeded(ctx, hcluster) - if autoscaleErr != nil { - log.Error(autoscaleErr, "failed to determine if autoscaler is needed during pull secret recovery, defaulting to true") - isAutoscalingNeeded = true - } - isAWSNodeTerminationHandlerNeeded, nthErr := r.isAWSNodeTerminationHandlerNeeded(ctx, hcluster) - if nthErr != nil { - log.Error(nthErr, "failed to determine if AWS node termination handler is needed during pull secret recovery, defaulting to true") - isAWSNodeTerminationHandlerNeeded = true - } - _, hcpErr := createOrUpdate(ctx, r.Client, hcp, func() error { - // Skip cert annotation resolution during pull secret recovery — it requires - // the pull secret to resolve the CPO image, which is unavailable here. - return reconcileHostedControlPlane(hcp, hcluster, isAutoscalingNeeded, isAWSNodeTerminationHandlerNeeded, - func() (map[string]string, error) { return nil, nil }) - }) - if hcpErr != nil { - log.Error(hcpErr, "failed to reconcile hostedcontrolplane during pull secret recovery") - } - return ctrl.Result{}, fmt.Errorf("failed to get pull secret: %w", err) - } - - controlPlaneOperatorImage, err := hyperutil.GetControlPlaneOperatorImage(ctx, hcluster, releaseProvider, r.HypershiftOperatorImage, pullSecretBytes) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get controlPlaneOperatorImage: %w", err) - } - controlPlaneOperatorImageLabels, err := hyperutil.GetControlPlaneOperatorImageLabels(ctx, hcluster, controlPlaneOperatorImage, pullSecretBytes, registryClientImageMetadataProvider) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get controlPlaneOperatorImageLabels: %w", err) - } - - _, cpoSupportsKASCustomKubeconfig := controlPlaneOperatorImageLabels[controlPlaneOperatorSupportsKASCustomKubeconfigLabel] - - if cpoSupportsKASCustomKubeconfig { - if len(hcluster.Spec.KubeAPIServerDNSName) > 0 { - CustomKubeconfigSecret := manifests.KubeConfigExternalSecret(hcluster.Namespace, hcluster.Name) - err := r.Client.Get(ctx, client.ObjectKeyFromObject(CustomKubeconfigSecret), CustomKubeconfigSecret) - if err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, fmt.Errorf("failed to reconcile external kubeconfig secret: %w", err) - } - } else { - hcluster.Status.CustomKubeconfig = &corev1.LocalObjectReference{Name: CustomKubeconfigSecret.Name} + // Set kubeconfig status unconditionally — all supported CPO versions expose the custom kubeconfig. + if len(hcluster.Spec.KubeAPIServerDNSName) > 0 { + CustomKubeconfigSecret := manifests.KubeConfigExternalSecret(hcluster.Namespace, hcluster.Name) + err := r.Client.Get(ctx, client.ObjectKeyFromObject(CustomKubeconfigSecret), CustomKubeconfigSecret) + if err != nil { + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed to reconcile external kubeconfig secret: %w", err) } + } else { + hcluster.Status.CustomKubeconfig = &corev1.LocalObjectReference{Name: CustomKubeconfigSecret.Name} } } @@ -1216,7 +1176,7 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques condition.Status = metav1.ConditionFalse condition.Message = err.Error() - if apierrors.IsNotFound(err) { + if errors.Is(err, hyperutil.ErrPullSecretUnavailable) { condition.Reason = hyperv1.SecretNotFoundReason } else { condition.Reason = hyperv1.InvalidImageReason @@ -1233,24 +1193,24 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques // Set HostedCluster payload arch payloadArch, err := hyperutil.DetermineHostedClusterPayloadArch(ctx, r.Client, hcluster, registryClientImageMetadataProvider) if err != nil { - condition := metav1.Condition{ + log.Error(err, "failed to determine payload arch") + reason := hyperv1.PayloadArchNotFoundReason + if errors.Is(err, hyperutil.ErrPullSecretUnavailable) { + reason = hyperv1.SecretNotFoundReason + } + meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{ Type: string(hyperv1.ValidReleaseImage), + Status: metav1.ConditionFalse, ObservedGeneration: hcluster.Generation, - } - condition.Status = metav1.ConditionFalse - condition.Message = err.Error() - condition.Reason = hyperv1.PayloadArchNotFoundReason - meta.SetStatusCondition(&hcluster.Status.Conditions, condition) - - return ctrl.Result{}, err + Message: err.Error(), + Reason: reason, + }) + } else { + hcluster.Status.PayloadArch = payloadArch } - hcluster.Status.PayloadArch = payloadArch + var releaseImage *releaseinfo.ReleaseImage - releaseImage, err := r.lookupReleaseImage(ctx, hcluster, releaseProvider) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to lookup release image: %w", err) - } // Set Progressing condition { condition := metav1.Condition{ @@ -1261,24 +1221,36 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques Reason: hyperv1.AsExpectedReason, } refWithDigest := func() (string, error) { - _, ref, err := registryClientImageMetadataProvider.GetDigest(ctx, hcluster.Spec.Release.Image, pullSecretBytes) + ps, psErr := hyperutil.GetPullSecretBytes(ctx, r.Client, hcluster) + if psErr != nil { + return "", psErr + } + _, ref, err := registryClientImageMetadataProvider.GetDigest(ctx, hcluster.Spec.Release.Image, ps) if err != nil { return "", err } return ref.String(), nil } - progressing, err := isProgressing(hcluster, releaseImage, refWithDigest) + releaseImage, err = r.lookupReleaseImage(ctx, hcluster, releaseProvider) if err != nil { condition.Status = metav1.ConditionFalse condition.Message = err.Error() condition.Reason = hyperv1.BlockedReason + } else { + progressing, err := isProgressing(hcluster, releaseImage, refWithDigest) + if err != nil { + condition.Status = metav1.ConditionFalse + condition.Message = err.Error() + condition.Reason = hyperv1.BlockedReason + } + if progressing { + condition.Status = metav1.ConditionTrue + condition.Message = "HostedCluster is deploying, upgrading, or reconfiguring" + condition.Reason = "Progressing" + } } - if progressing { - condition.Status = metav1.ConditionTrue - condition.Message = "HostedCluster is deploying, upgrading, or reconfiguring" - condition.Reason = "Progressing" - } + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) } @@ -1349,98 +1321,26 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques validReleaseImage := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.ValidReleaseImage)) if validReleaseImage != nil && validReleaseImage.Status == metav1.ConditionFalse { if validReleaseImage.Reason == hyperv1.SecretNotFoundReason { - return ctrl.Result{}, fmt.Errorf("%s", validReleaseImage.Message) - } - log.Error(fmt.Errorf("release image is invalid"), "reconciliation is blocked", "message", validReleaseImage.Message) - return ctrl.Result{}, nil - } - upgrading, msg, err := isUpgrading(hcluster, releaseImage) - if upgrading { - if err != nil { - log.Error(err, "reconciliation is blocked", "message", validReleaseImage.Message) + // Continue with degraded reconciliation — PullSecretSync will capture + // the error as critical and CoreHCPChain will still reconcile the HCP. + log.Error(fmt.Errorf("%s", validReleaseImage.Message), "pull secret unavailable, continuing with degraded reconciliation") + } else { + log.Error(fmt.Errorf("release image is invalid"), "reconciliation is blocked", "message", validReleaseImage.Message) return ctrl.Result{}, nil } - if msg != "" { - log.Info(msg) - } } - } - - cpoHasUtilities := false - if _, hasLabel := controlPlaneOperatorImageLabels[controlPlaneOperatorSubcommandsLabel]; hasLabel { - cpoHasUtilities = true - } - utilitiesImage := controlPlaneOperatorImage - if !cpoHasUtilities { - utilitiesImage = r.HypershiftOperatorImage - } - - _, controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel := controlPlaneOperatorImageLabels[controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel] - _, controlPlanePKIOperatorSignsCSRs := controlPlaneOperatorImageLabels[controlPlanePKIOperatorSignsCSRsLabel] - _, useRestrictedPSA := controlPlaneOperatorImageLabels[useRestrictedPodSecurityLabel] - _, defaultToControlPlaneV2 := controlPlaneOperatorImageLabels[defaultToControlPlaneV2Label] - - // Reconcile the hosted cluster namespace - _, err = createOrUpdate(ctx, r.Client, controlPlaneNamespace, func() error { - if controlPlaneNamespace.Labels == nil { - controlPlaneNamespace.Labels = make(map[string]string) - } - controlPlaneNamespace.Labels[ControlPlaneNamespaceLabelKey] = "true" - - // Set pod security labels on HCP namespace - psaOverride := hcluster.Annotations[hyperv1.PodSecurityAdmissionLabelOverrideAnnotation] - if psaOverride != "" { - controlPlaneNamespace.Labels["pod-security.kubernetes.io/enforce"] = psaOverride - controlPlaneNamespace.Labels["pod-security.kubernetes.io/audit"] = psaOverride - controlPlaneNamespace.Labels["pod-security.kubernetes.io/warn"] = psaOverride - } else if useRestrictedPSA { - controlPlaneNamespace.Labels["pod-security.kubernetes.io/enforce"] = "restricted" - controlPlaneNamespace.Labels["pod-security.kubernetes.io/audit"] = "restricted" - controlPlaneNamespace.Labels["pod-security.kubernetes.io/warn"] = "restricted" - } else { - controlPlaneNamespace.Labels["pod-security.kubernetes.io/enforce"] = "privileged" - controlPlaneNamespace.Labels["pod-security.kubernetes.io/audit"] = "privileged" - controlPlaneNamespace.Labels["pod-security.kubernetes.io/warn"] = "privileged" - } - controlPlaneNamespace.Labels["security.openshift.io/scc.podSecurityLabelSync"] = "false" - - // Enable monitoring for hosted control plane namespaces - if r.EnableOCPClusterMonitoring { - controlPlaneNamespace.Labels["openshift.io/cluster-monitoring"] = "true" - } - - if r.SetDefaultSecurityContext { - // Only set the SecurtyContext UID annotation if it's not already set. - _, ok := controlPlaneNamespace.Annotations[DefaultSecurityContextUIDAnnnotation] - if !ok { - uid, err := getNextAvailableSecurityContextUID(ctx, r.Client) + if releaseImage != nil { + upgrading, msg, err := isUpgrading(hcluster, releaseImage) + if upgrading { if err != nil { - return fmt.Errorf("failed to get next available SecurityContext UID: %w", err) + log.Error(err, "reconciliation is blocked", "message", validReleaseImage.Message) + return ctrl.Result{}, nil } - if controlPlaneNamespace.Annotations == nil { - controlPlaneNamespace.Annotations = make(map[string]string) + if msg != "" { + log.Info(msg) } - controlPlaneNamespace.Annotations[DefaultSecurityContextUIDAnnnotation] = strconv.FormatInt(uid, 10) } } - - // Enable observability operator monitoring - metrics.EnableOBOMonitoring(controlPlaneNamespace) - - return nil - }) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to reconcile namespace: %w", err) - } - - p, err := platform.GetPlatform(ctx, hcluster, releaseProvider, utilitiesImage, pullSecretBytes) - if err != nil { - return ctrl.Result{}, err - } - - releaseImageVersion, err := semver.Parse(releaseImage.Version()) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to parse release image version: %w", err) } // --- Categorized reconcile operations with structured error reporting --- @@ -1450,12 +1350,53 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques // - nonCritical: failures are collected but never block other work. // // Blocking rules: - // - Critical sync failures (Phase 6a) → Phase 8 components blocked. - // - Core HCP chain failure / nil HCP (Phase 7) → Phase 8 components blocked. - // - Non-critical sync (Phase 6b) → never blocked, always runs. + // - Critical sync failures → Phase 8 components blocked. + // - Core HCP chain failure / nil HCP → Phase 8 components blocked. + // - Non-critical sync → never blocked, always runs. report := &reconcileReport{legacy: r.ReconcileLegacy} + // Phase 5: Pull secret, CPO image resolution, and namespace setup. + // These are grouped because namespace PSA labels depend on CPO image labels — + // reconciling the namespace without them could downgrade security. When this + // block fails (e.g. pull secret unavailable), the namespace retains its + // existing labels from a previous successful reconcile, and CoreHCPChain + // still runs to propagate HCP spec fields. + var pullSecretBytes []byte + controlPlaneOperatorImage := r.HypershiftOperatorImage + var controlPlaneOperatorImageLabels map[string]string + + report.execute("CPOImageAndNamespace", critical, func() error { + var err error + pullSecretBytes, err = hyperutil.GetPullSecretBytes(ctx, r.Client, hcluster) + if err != nil { + return fmt.Errorf("failed to get pull secret: %w", err) + } + controlPlaneOperatorImage, err = hyperutil.GetControlPlaneOperatorImage(ctx, hcluster, releaseProvider, r.HypershiftOperatorImage, pullSecretBytes) + if err != nil { + return fmt.Errorf("failed to get controlPlaneOperatorImage: %w", err) + } + controlPlaneOperatorImageLabels, err = hyperutil.GetControlPlaneOperatorImageLabels(ctx, hcluster, controlPlaneOperatorImage, pullSecretBytes, registryClientImageMetadataProvider) + if err != nil { + return fmt.Errorf("failed to get controlPlaneOperatorImageLabels: %w", err) + } + + return r.reconcileControlPlaneNamespace(ctx, createOrUpdate, hcluster, controlPlaneNamespace, controlPlaneOperatorImageLabels) + }) + + cpoHasUtilities := false + utilitiesImage := r.HypershiftOperatorImage + if _, hasLabel := controlPlaneOperatorImageLabels[controlPlaneOperatorSubcommandsLabel]; hasLabel { + cpoHasUtilities = true + utilitiesImage = controlPlaneOperatorImage + } + // Platform is a hard prerequisite — all framework operations use the platform + // interface and would panic on nil. GetPlatform handles nil pullSecretBytes. + p, err := platform.GetPlatform(ctx, hcluster, releaseProvider, utilitiesImage, pullSecretBytes) + if err != nil { + return ctrl.Result{}, err + } + // Phase 6a: Critical sync operations. // These must succeed for downstream component deployments to function. report.execute("PlatformCredentials", critical, func() error { @@ -1516,6 +1457,11 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques return r.reconcileGlobalConfigSync(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name) }) + _, cpoSupportsKASCustomKubeconfig := controlPlaneOperatorImageLabels[controlPlaneOperatorSupportsKASCustomKubeconfigLabel] + _, controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel := controlPlaneOperatorImageLabels[controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel] + _, controlPlanePKIOperatorSignsCSRs := controlPlaneOperatorImageLabels[controlPlanePKIOperatorSignsCSRsLabel] + _, defaultToControlPlaneV2 := controlPlaneOperatorImageLabels[defaultToControlPlaneV2Label] + // Phase 7: Core HCP chain — sequential (HCP → InfraCR → CAPI Cluster). // Always runs regardless of Phase 6a failures (these are K8s resource operations). report.execute("CoreHCPChain", critical, func() error { @@ -1535,6 +1481,30 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques return r.reconcileKubeconfigAndPasswordSync(ctx, createOrUpdate, hcluster, hcp, cpoSupportsKASCustomKubeconfig) }) + report.executeOrBlock("PlatformOIDCAndCSI", func() error { + return r.reconcilePlatformSpecific(ctx, log, hcluster, hcp, createOrUpdate) + }) + + report.executeOrBlock("MonitoringAndCLISecrets", func() error { + return r.reconcileAuxiliary(ctx, createOrUpdate, hcluster, hcp) + }) + + // Release image version is critical because OperatorDeployments and + // RBACAndPolicies depend on it — a zero-value version would produce + // wrong deployments. Failure here blocks those downstream operations. + var releaseImageVersion semver.Version + report.execute("ReleaseImageVersion", critical, func() error { + if releaseImage == nil { + return fmt.Errorf("release image is not available") + } + var err error + releaseImageVersion, err = semver.Parse(releaseImage.Version()) + if err != nil { + return fmt.Errorf("failed to parse release image version: %w", err) + } + return nil + }) + report.executeOrBlock("OperatorDeployments", func() error { return r.reconcileOperatorDeployments(ctx, createOrUpdate, hcluster, hcp, controlPlaneNamespace, p, controlPlaneOperatorImage, utilitiesImage, @@ -1547,14 +1517,6 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques releaseImageVersion) }) - report.executeOrBlock("PlatformOIDCAndCSI", func() error { - return r.reconcilePlatformSpecific(ctx, log, hcluster, hcp, createOrUpdate) - }) - - report.executeOrBlock("MonitoringAndCLISecrets", func() error { - return r.reconcileAuxiliary(ctx, createOrUpdate, hcluster, hcp) - }) - report.executeOrBlock("PublicEndpointExposed", func() error { pubEndpointRequeue, err := r.reconcilePublicEndpointExposedCondition(ctx, hcluster) if err != nil { @@ -1761,7 +1723,7 @@ func (r *HostedClusterReconciler) reconcilePlatformCredentialsWithStatus( Message: err.Error(), }) if statusErr := r.Client.Status().Update(ctx, hcluster); statusErr != nil { - return fmt.Errorf("failed to reconcile platform credentials: %s, failed to update status: %w", err, statusErr) + return fmt.Errorf("failed to reconcile platform credentials: %w, failed to update status: %w", err, statusErr) } return fmt.Errorf("failed to reconcile platform credentials: %w", err) } @@ -1835,6 +1797,63 @@ func (r *HostedClusterReconciler) reconcileRestoredFromBackupWithStatus( // reconcilePullSecretSync syncs the pull secret from the HostedCluster namespace // to the control plane namespace. +func (r *HostedClusterReconciler) reconcileControlPlaneNamespace( + ctx context.Context, createOrUpdate upsert.CreateOrUpdateFN, + hcluster *hyperv1.HostedCluster, controlPlaneNamespace *corev1.Namespace, + controlPlaneOperatorImageLabels map[string]string, +) error { + _, useRestrictedPSA := controlPlaneOperatorImageLabels[useRestrictedPodSecurityLabel] + + _, err := createOrUpdate(ctx, r.Client, controlPlaneNamespace, func() error { + if controlPlaneNamespace.Labels == nil { + controlPlaneNamespace.Labels = make(map[string]string) + } + controlPlaneNamespace.Labels[ControlPlaneNamespaceLabelKey] = "true" + + psaOverride := hcluster.Annotations[hyperv1.PodSecurityAdmissionLabelOverrideAnnotation] + if psaOverride != "" { + controlPlaneNamespace.Labels["pod-security.kubernetes.io/enforce"] = psaOverride + controlPlaneNamespace.Labels["pod-security.kubernetes.io/audit"] = psaOverride + controlPlaneNamespace.Labels["pod-security.kubernetes.io/warn"] = psaOverride + } else if useRestrictedPSA { + controlPlaneNamespace.Labels["pod-security.kubernetes.io/enforce"] = "restricted" + controlPlaneNamespace.Labels["pod-security.kubernetes.io/audit"] = "restricted" + controlPlaneNamespace.Labels["pod-security.kubernetes.io/warn"] = "restricted" + } else { + controlPlaneNamespace.Labels["pod-security.kubernetes.io/enforce"] = "privileged" + controlPlaneNamespace.Labels["pod-security.kubernetes.io/audit"] = "privileged" + controlPlaneNamespace.Labels["pod-security.kubernetes.io/warn"] = "privileged" + } + controlPlaneNamespace.Labels["security.openshift.io/scc.podSecurityLabelSync"] = "false" + + if r.EnableOCPClusterMonitoring { + controlPlaneNamespace.Labels["openshift.io/cluster-monitoring"] = "true" + } + + if r.SetDefaultSecurityContext { + _, ok := controlPlaneNamespace.Annotations[DefaultSecurityContextUIDAnnnotation] + if !ok { + uid, err := getNextAvailableSecurityContextUID(ctx, r.Client) + if err != nil { + return fmt.Errorf("failed to get next available SecurityContext UID: %w", err) + } + if controlPlaneNamespace.Annotations == nil { + controlPlaneNamespace.Annotations = make(map[string]string) + } + controlPlaneNamespace.Annotations[DefaultSecurityContextUIDAnnnotation] = strconv.FormatInt(uid, 10) + } + } + + metrics.EnableOBOMonitoring(controlPlaneNamespace) + + return nil + }) + if err != nil { + return fmt.Errorf("failed to reconcile namespace: %w", err) + } + return nil +} + func (r *HostedClusterReconciler) reconcilePullSecretSync( ctx context.Context, hcluster *hyperv1.HostedCluster, createOrUpdate upsert.CreateOrUpdateFN, controlPlaneNamespace string, diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go index 803e1ccf4d3b..1777bfccec3e 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go @@ -260,131 +260,143 @@ func TestHasBeenAvailable(t *testing.T) { } } -func TestReconcileWhenPullSecretMissing(t *testing.T) { +func TestReconcileWhenPullSecretUnavailable(t *testing.T) { t.Parallel() - g := NewWithT(t) - hcluster := &hyperv1.HostedCluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-cluster", - Namespace: "any", - Annotations: map[string]string{ - hyperv1.RequestServingNodeAdditionalSelectorAnnotation: `{"node-role.kubernetes.io/worker": ""}`, - }, + tests := []struct { + name string + extraObjects []crclient.Object + }{ + { + name: "When the pull secret is missing it should still reconcile the HCP", }, - Spec: hyperv1.HostedClusterSpec{ - ClusterID: "id", - InfraID: "infra-id", - Networking: hyperv1.ClusterNetworking{ - ClusterNetwork: []hyperv1.ClusterNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.16.1.0/24")}}, - }, - Services: []hyperv1.ServicePublishingStrategyMapping{ - { - Service: hyperv1.Ignition, - ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{ - Type: hyperv1.LoadBalancer, - }, + { + name: "When the pull secret is corrupted it should still reconcile the HCP", + extraObjects: []crclient.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "pull-secret", Namespace: "any"}, + Data: map[string][]byte{}, }, }, - PullSecret: corev1.LocalObjectReference{ - Name: "pull-secret", - }, - Release: hyperv1.Release{Image: "quay.io/openshift-release-dev/ocp-release:4.15.0"}, - Etcd: hyperv1.EtcdSpec{ManagementType: hyperv1.Managed}, - Platform: hyperv1.PlatformSpec{ - Type: hyperv1.AWSPlatform, - }, - NodeSelector: map[string]string{ - "node-role.kubernetes.io/worker": "", - }, }, } - hcpNs := hcpmanifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) - hcp := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) - - // Create the namespace so createOrUpdate can find it - ns := &corev1.Namespace{ - ObjectMeta: metav1.ObjectMeta{Name: hcpNs}, - } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + g := NewWithT(t) - // No pull secret in the fake client — this is the scenario we're testing - objects := []crclient.Object{ - hcp, - hcluster, - ns, - } + hcluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "any", + Annotations: map[string]string{ + hyperv1.RequestServingNodeAdditionalSelectorAnnotation: `{"node-role.kubernetes.io/worker": ""}`, + }, + }, + Spec: hyperv1.HostedClusterSpec{ + ClusterID: "12345678-1234-1234-1234-123456789abc", + InfraID: "infra-id", + Networking: hyperv1.ClusterNetworking{ + ClusterNetwork: []hyperv1.ClusterNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.16.1.0/24")}}, + ServiceNetwork: []hyperv1.ServiceNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.31.0.0/16")}}, + }, + Services: []hyperv1.ServicePublishingStrategyMapping{ + { + Service: hyperv1.Ignition, + ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{ + Type: hyperv1.Route, + }, + }, + }, + PullSecret: corev1.LocalObjectReference{Name: "pull-secret"}, + Release: hyperv1.Release{Image: "quay.io/openshift-release-dev/ocp-release:4.15.0"}, + Etcd: hyperv1.EtcdSpec{ManagementType: hyperv1.Managed}, + Platform: hyperv1.PlatformSpec{Type: hyperv1.NonePlatform}, + NodeSelector: map[string]string{ + "node-role.kubernetes.io/worker": "", + }, + }, + } - client := fake.NewClientBuilder().WithScheme(api.Scheme).WithObjects(objects...).WithStatusSubresource(hcluster).Build() + hcpNs := hcpmanifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) + hcp := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: hcpNs}} - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() - mockedProvider := releaseinfo.NewMockProviderWithOpenShiftImageRegistryOverrides(mockCtrl) + objects := append([]crclient.Object{hcp, hcluster, ns}, tc.extraObjects...) + client := fake.NewClientBuilder().WithScheme(api.Scheme). + WithObjects(objects...).WithStatusSubresource(hcluster).Build() - r := &HostedClusterReconciler{ - Client: client, - Clock: clocktesting.NewFakeClock(time.Now()), - CertRotationScale: 24 * time.Hour, - createOrUpdate: func(reconcile.Request) upsert.CreateOrUpdateFN { return ctrl.CreateOrUpdate }, - ManagementClusterCapabilities: &fakecapabilities.FakeSupportNoCapabilities{}, - RegistryProvider: fakeReleaseProvider{ - releaseProvider: mockedProvider, - metadataProvider: fakeimagemetadataprovider.FakeRegistryClientImageMetadataProvider{ - Result: &dockerv1client.DockerImageConfig{}, - }, - }, - now: func() metav1.Time { return metav1.NewTime(time.Now()) }, - } + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + mockedProvider := releaseinfo.NewMockProviderWithOpenShiftImageRegistryOverrides(mockCtrl) - ctx := t.Context() - _, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: crclient.ObjectKeyFromObject(hcluster)}) + r := &HostedClusterReconciler{ + Client: client, + Clock: clocktesting.NewFakeClock(time.Now()), + CertRotationScale: 24 * time.Hour, + createOrUpdate: func(reconcile.Request) upsert.CreateOrUpdateFN { return ctrl.CreateOrUpdate }, + ManagementClusterCapabilities: &fakecapabilities.FakeSupportAllCapabilities{}, + RegistryProvider: fakeReleaseProvider{ + releaseProvider: mockedProvider, + metadataProvider: fakeimagemetadataprovider.FakeRegistryClientImageMetadataProvider{ + Result: &dockerv1client.DockerImageConfig{}, + }, + }, + now: func() metav1.Time { return metav1.NewTime(time.Now()) }, + } - // When the pull secret is missing, reconciliation should return an error for requeue - g.Expect(err).To(HaveOccurred(), "When pull secret is missing it should return an error") - g.Expect(err.Error()).To(ContainSubstring("failed to get pull secret"), "When pull secret is missing the error should mention pull secret") + ctx := t.Context() + _, err := r.Reconcile(ctx, ctrl.Request{NamespacedName: crclient.ObjectKeyFromObject(hcluster)}) - // Verify reconcileHostedControlPlane still ran — the HCP should have the HostedCluster's spec fields - updatedHCP := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) - err = client.Get(ctx, crclient.ObjectKeyFromObject(updatedHCP), updatedHCP) - g.Expect(err).ToNot(HaveOccurred(), "When pull secret is missing it should still be able to get the HCP") - - g.Expect(updatedHCP.Spec.NodeSelector).To(Equal(hcluster.Spec.NodeSelector), - "When pull secret is missing it should still propagate NodeSelector to HCP") - g.Expect(updatedHCP.Annotations).To(HaveKeyWithValue( - hyperv1.RequestServingNodeAdditionalSelectorAnnotation, - hcluster.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation]), - "When pull secret is missing it should still propagate RequestServingNodeAdditionalSelector to HCP") - g.Expect(updatedHCP.Spec.ReleaseImage).To(Equal(hcluster.Spec.Release.Image), - "When pull secret is missing it should still propagate ReleaseImage to HCP") - - // With no NodePools in the fake client, autoscaling is not needed so the - // autoscaler should be disabled. This validates the defaulting logic in the - // pull secret recovery path. - g.Expect(updatedHCP.Annotations).To(HaveKeyWithValue( - hyperv1.DisableClusterAutoscalerAnnotation, "true"), - "When pull secret is missing and no NodePools exist it should disable the cluster autoscaler") + // Reconciliation should return an error for requeue via the error-collection + // framework (PullSecretSync and CPOImageAndNamespace record critical errors). + g.Expect(err).To(HaveOccurred(), "it should return an error") + g.Expect(err.Error()).To(ContainSubstring("pull secret"), "the error should mention pull secret") + + // CoreHCPChain should still run — HCP spec fields must be propagated + // even during a pull secret outage. + updatedHCP := controlplaneoperator.HostedControlPlane(hcpNs, hcluster.Name) + err = client.Get(ctx, crclient.ObjectKeyFromObject(updatedHCP), updatedHCP) + g.Expect(err).ToNot(HaveOccurred(), "it should still be able to get the HCP") + + g.Expect(updatedHCP.Spec.NodeSelector).To(Equal(hcluster.Spec.NodeSelector), + "it should still propagate NodeSelector to HCP") + g.Expect(updatedHCP.Annotations).To(HaveKeyWithValue( + hyperv1.RequestServingNodeAdditionalSelectorAnnotation, + hcluster.Annotations[hyperv1.RequestServingNodeAdditionalSelectorAnnotation]), + "it should still propagate RequestServingNodeAdditionalSelector to HCP") + g.Expect(updatedHCP.Spec.ReleaseImage).To(Equal(hcluster.Spec.Release.Image), + "it should still propagate ReleaseImage to HCP") + g.Expect(updatedHCP.Annotations).To(HaveKeyWithValue( + hyperv1.DisableClusterAutoscalerAnnotation, "true"), + "it should disable the cluster autoscaler when no NodePools exist") + }) + } } func TestReconcileWhenPullSecretMissingAndNodePoolListFails(t *testing.T) { t.Parallel() g := NewWithT(t) + clusterID := "12345678-1234-1234-1234-123456789abc" hcluster := &hyperv1.HostedCluster{ ObjectMeta: metav1.ObjectMeta{ Name: "test-cluster", Namespace: "any", }, Spec: hyperv1.HostedClusterSpec{ - ClusterID: "id", + ClusterID: clusterID, InfraID: "infra-id", Networking: hyperv1.ClusterNetworking{ ClusterNetwork: []hyperv1.ClusterNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.16.1.0/24")}}, + ServiceNetwork: []hyperv1.ServiceNetworkEntry{{CIDR: *ipnet.MustParseCIDR("172.31.0.0/16")}}, }, Services: []hyperv1.ServicePublishingStrategyMapping{ { Service: hyperv1.Ignition, ServicePublishingStrategy: hyperv1.ServicePublishingStrategy{ - Type: hyperv1.LoadBalancer, + Type: hyperv1.Route, }, }, }, @@ -394,7 +406,7 @@ func TestReconcileWhenPullSecretMissingAndNodePoolListFails(t *testing.T) { Release: hyperv1.Release{Image: "quay.io/openshift-release-dev/ocp-release:4.15.0"}, Etcd: hyperv1.EtcdSpec{ManagementType: hyperv1.Managed}, Platform: hyperv1.PlatformSpec{ - Type: hyperv1.AWSPlatform, + Type: hyperv1.NonePlatform, }, }, } @@ -424,7 +436,7 @@ func TestReconcileWhenPullSecretMissingAndNodePoolListFails(t *testing.T) { Clock: clocktesting.NewFakeClock(time.Now()), CertRotationScale: 24 * time.Hour, createOrUpdate: func(reconcile.Request) upsert.CreateOrUpdateFN { return ctrl.CreateOrUpdate }, - ManagementClusterCapabilities: &fakecapabilities.FakeSupportNoCapabilities{}, + ManagementClusterCapabilities: &fakecapabilities.FakeSupportAllCapabilities{}, RegistryProvider: fakeReleaseProvider{ releaseProvider: mockedProvider, metadataProvider: fakeimagemetadataprovider.FakeRegistryClientImageMetadataProvider{ @@ -2557,7 +2569,7 @@ func TestValidateReleaseImage(t *testing.T) { releaseImageLoookup: func(_ context.Context, _ string, _ []byte) (*releaseinfo.ReleaseImage, error) { return testutils.InitReleaseImageOrDie("4.15.0"), nil }, - expectedResult: errors.New("failed to get pull secret: secrets \"pull-secret\" not found"), + expectedResult: errors.New("pull secret unavailable: secrets \"pull-secret\" not found"), expectedNotFoundError: true, }, { @@ -2581,7 +2593,7 @@ func TestValidateReleaseImage(t *testing.T) { releaseImageLoookup: func(_ context.Context, _ string, _ []byte) (*releaseinfo.ReleaseImage, error) { return testutils.InitReleaseImageOrDie("4.15.0"), nil }, - expectedResult: errors.New("expected .dockerconfigjson key in pull secret"), + expectedResult: errors.New("pull secret unavailable: expected .dockerconfigjson key in secret \"pull-secret\""), }, { name: "unable to pull release image, error", diff --git a/support/util/util.go b/support/util/util.go index 4b335251c026..7bb76f764457 100644 --- a/support/util/util.go +++ b/support/util/util.go @@ -7,6 +7,7 @@ import ( "crypto/tls" "encoding/base64" "encoding/json" + "errors" "fmt" "hash/fnv" "io" @@ -34,6 +35,9 @@ import ( ignitionapi "github.com/coreos/ignition/v2/config/v3_2/types" ) +// ErrPullSecretUnavailable indicates the pull secret is missing or malformed. +var ErrPullSecretUnavailable = errors.New("pull secret unavailable") + type JSONMapper func(jsonData []byte) []byte // NewOmitFieldIfEmptyJSONMapper is a JSONMapper that omits the given field @@ -328,13 +332,9 @@ func GetMgmtClusterCPUArch(kc kubeclient.Interface) (string, error) { // DetermineHostedClusterPayloadArch returns the HostedCluster payload's CPU architecture type func DetermineHostedClusterPayloadArch(ctx context.Context, c client.Client, hc *hyperv1.HostedCluster, imageMetadataProvider ImageMetadataProvider) (hyperv1.PayloadArchType, error) { - var pullSecret corev1.Secret - if err := c.Get(ctx, types.NamespacedName{Namespace: hc.Namespace, Name: hc.Spec.PullSecret.Name}, &pullSecret); err != nil { - return "", fmt.Errorf("failed to get pull secret: %w", err) - } - pullSecretBytes, ok := pullSecret.Data[corev1.DockerConfigJsonKey] - if !ok { - return "", fmt.Errorf("expected %s key in pull secret", corev1.DockerConfigJsonKey) + pullSecretBytes, err := GetPullSecretBytes(ctx, c, hc) + if err != nil { + return "", err } isMultiArchReleaseImage, err := registryclient.IsMultiArchManifestList(ctx, hc.Spec.Release.Image, pullSecretBytes, imageMetadataProvider) @@ -449,12 +449,12 @@ func SanitizeIgnitionPayload(payload []byte) error { func GetPullSecretBytes(ctx context.Context, c client.Client, hc *hyperv1.HostedCluster) ([]byte, error) { pullSecret := corev1.Secret{} if err := c.Get(ctx, types.NamespacedName{Namespace: hc.Namespace, Name: hc.Spec.PullSecret.Name}, &pullSecret); err != nil { - return nil, fmt.Errorf("failed to get pull secret: %w", err) + return nil, fmt.Errorf("%w: %w", ErrPullSecretUnavailable, err) } pullSecretBytes, ok := pullSecret.Data[corev1.DockerConfigJsonKey] if !ok { - return nil, fmt.Errorf("expected %s key in pull secret", corev1.DockerConfigJsonKey) + return nil, fmt.Errorf("%w: expected %s key in secret %q", ErrPullSecretUnavailable, corev1.DockerConfigJsonKey, hc.Spec.PullSecret.Name) } return pullSecretBytes, nil From ba3485ae8622c1ec89f26a052e134419301e2b2f Mon Sep 17 00:00:00 2001 From: Mulham Raee Date: Wed, 17 Jun 2026 18:02:39 +0200 Subject: [PATCH 3/3] refactor(hostedcluster): extract legacy reconciler for rollback safety MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the original reconcile method into reconcile_legacy.go as reconcileLegacy, activated by HYPERSHIFT_RECONCILE_LEGACY=1. This replaces the legacy flag in reconcileReport — the changes to the v2 reconciler are too deep for a flag to faithfully reproduce the old behavior. - Extract pre-refactor reconcile into reconcile_legacy.go - Remove legacy field from reconcileReport and simplify shouldBlock - Dispatch via ReconcileLegacy flag in the Reconcile entry point - Fix reconcileOpenShiftTrustedCAs unused bool return (unparam lint) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../hostedcluster/hostedcluster_controller.go | 20 +- .../hostedcluster/reconcile_legacy.go | 1790 +++++++++++++++++ .../hostedcluster/reconcile_report.go | 25 +- .../hostedcluster/reconcile_report_test.go | 41 +- 4 files changed, 1807 insertions(+), 69 deletions(-) create mode 100644 hypershift-operator/controllers/hostedcluster/reconcile_legacy.go diff --git a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go index a15b34580bd5..092ba6bda18b 100644 --- a/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go +++ b/hypershift-operator/controllers/hostedcluster/hostedcluster_controller.go @@ -365,6 +365,8 @@ func (r *HostedClusterReconciler) Reconcile(ctx context.Context, req ctrl.Reques var res reconcile.Result if r.overwriteReconcile != nil { res, err = r.overwriteReconcile(ctx, req, log, hcluster) + } else if r.ReconcileLegacy { + res, err = r.reconcileLegacy(ctx, req, log, hcluster) } else { res, err = r.reconcile(ctx, req, log, hcluster) } @@ -1354,7 +1356,7 @@ func (r *HostedClusterReconciler) reconcile(ctx context.Context, req ctrl.Reques // - Core HCP chain failure / nil HCP → Phase 8 components blocked. // - Non-critical sync → never blocked, always runs. - report := &reconcileReport{legacy: r.ReconcileLegacy} + report := &reconcileReport{} // Phase 5: Pull secret, CPO image resolution, and namespace setup. // These are grouped because namespace PSA labels depend on CPO image labels — @@ -2293,7 +2295,7 @@ func (r *HostedClusterReconciler) reconcileAuxiliary( if err := r.reconcileSREMetricsConfig(ctx, createOrUpdate, hcp); err != nil { errs = append(errs, fmt.Errorf("failed to reconcile SRE metrics config: %w", err)) } - if _, err := r.reconcileOpenShiftTrustedCAs(ctx, hcp); err != nil { + if err := r.reconcileOpenShiftTrustedCAs(ctx, hcp); err != nil { errs = append(errs, fmt.Errorf("failed to reconcile OpenShift trusted CAs: %w", err)) } return utilerrors.NewAggregate(errs) @@ -3047,7 +3049,7 @@ func (r *HostedClusterReconciler) reconcileKubevirtCSIClusterRBAC(ctx context.Co // reconcileOpenShiftTrustedCAs checks for the existence of /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem, if it exists, // creates a new ConfigMap to be mounted in the CPO deployment utilizing the file -func (r *HostedClusterReconciler) reconcileOpenShiftTrustedCAs(ctx context.Context, hostedControlPlane *hyperv1.HostedControlPlane) (bool, error) { +func (r *HostedClusterReconciler) reconcileOpenShiftTrustedCAs(ctx context.Context, hostedControlPlane *hyperv1.HostedControlPlane) error { trustedCABundle := new(bytes.Buffer) var trustCABundleFile []byte @@ -3055,29 +3057,29 @@ func (r *HostedClusterReconciler) reconcileOpenShiftTrustedCAs(ctx context.Conte if err == nil { trustCABundleFile, err = os.ReadFile(r.OpenShiftTrustedCAFilePath) if err != nil { - return false, fmt.Errorf("unable to read trust bundle file: %w", err) + return fmt.Errorf("unable to read trust bundle file: %w", err) } } if err != nil { if errors.Is(err, os.ErrNotExist) { - return false, nil + return nil } - return false, err + return err } if _, err = trustedCABundle.Write(trustCABundleFile); err != nil { - return false, fmt.Errorf("unable to write trust bundle to buffer: %w", err) + return fmt.Errorf("unable to write trust bundle to buffer: %w", err) } // Next, save the contents to a new ConfigMap in the hosted control plane's namespace openShiftTrustedCABundleConfigMapForCPO := manifests.OpenShiftTrustedCABundleForNamespace(hostedControlPlane.Namespace) openShiftTrustedCABundleConfigMapForCPO.Data["ca-bundle.crt"] = trustedCABundle.String() if _, err = controllerutil.CreateOrUpdate(ctx, r.Client, openShiftTrustedCABundleConfigMapForCPO, NoopReconcile); err != nil { - return false, fmt.Errorf("failed to create openshift-config-managed-trusted-ca-bundle for CPO deployment %T: %w", trustedCABundle.String(), err) + return fmt.Errorf("failed to create openshift-config-managed-trusted-ca-bundle for CPO deployment %T: %w", trustedCABundle.String(), err) } - return true, nil + return nil } func servicePublishingStrategyByType(hcp *hyperv1.HostedCluster, svcType hyperv1.ServiceType) *hyperv1.ServicePublishingStrategy { diff --git a/hypershift-operator/controllers/hostedcluster/reconcile_legacy.go b/hypershift-operator/controllers/hostedcluster/reconcile_legacy.go new file mode 100644 index 000000000000..84f3b13412eb --- /dev/null +++ b/hypershift-operator/controllers/hostedcluster/reconcile_legacy.go @@ -0,0 +1,1790 @@ +package hostedcluster + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/api/util/configrefs" + "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/imageprovider" + cpomanifests "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests" + "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/internal/platform" + hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics" + "github.com/openshift/hypershift/hypershift-operator/controllers/manifests" + "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator" + controlplanepkioperatormanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplanepkioperator" + "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/ignitionserver" + "github.com/openshift/hypershift/support/azureutil" + "github.com/openshift/hypershift/support/config" + controlplanecomponent "github.com/openshift/hypershift/support/controlplane-component" + "github.com/openshift/hypershift/support/k8sutil" + "github.com/openshift/hypershift/support/metrics" + "github.com/openshift/hypershift/support/netutil" + "github.com/openshift/hypershift/support/podspec" + "github.com/openshift/hypershift/support/secretproviderclass" + "github.com/openshift/hypershift/support/upsert" + hyperutil "github.com/openshift/hypershift/support/util" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + "github.com/blang/semver" + "github.com/go-logr/logr" +) + +// reconcileLegacy is the original reconcile implementation preserved for +// rollback safety. Activated by setting HYPERSHIFT_RECONCILE_LEGACY=1. +// +//nolint:gocyclo +func (r *HostedClusterReconciler) reconcileLegacy(ctx context.Context, req ctrl.Request, log logr.Logger, hcluster *hyperv1.HostedCluster) (ctrl.Result, error) { + controlPlaneNamespace := manifests.HostedControlPlaneNamespaceObject(hcluster.Namespace, hcluster.Name) + hcp := controlplaneoperator.HostedControlPlane(controlPlaneNamespace.Name, hcluster.Name) + err := r.Client.Get(ctx, client.ObjectKeyFromObject(hcp), hcp) + if err != nil { + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed to get hostedcontrolplane: %w", err) + } else { + hcp = nil + } + } + + // Bubble up ValidIdentityProvider condition from the hostedControlPlane. + // We set this condition even if the HC is being deleted. Otherwise, a hostedCluster with a conflicted identity provider + // would fail to complete deletion forever with no clear signal for consumers. + if hcluster.Spec.Platform.Type == hyperv1.AWSPlatform { + updated := false + var validIdentityProviderCondition *metav1.Condition + if hcp != nil { + validIdentityProviderCondition = meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ValidAWSIdentityProvider)) + } + + // condition not found in HCP or HCP has been deleted + if validIdentityProviderCondition == nil { + updated = meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{ + Type: string(hyperv1.ValidAWSIdentityProvider), + Status: metav1.ConditionUnknown, + Reason: hyperv1.StatusUnknownReason, + ObservedGeneration: hcluster.Generation, + }) + } else { + validIdentityProviderCondition.ObservedGeneration = hcluster.Generation + updated = meta.SetStatusCondition(&hcluster.Status.Conditions, *validIdentityProviderCondition) + } + + if updated { + // Persist status updates + if err := r.Client.Status().Update(ctx, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update status: %w", err) + } + } + } + + // Bubble up AWSDefaultSecurityGroupDeleted condition from the hostedControlPlane to report blocking objects on deletion. + if condition, changed := computeAWSDefaultSGDeletedCondition(hcluster, hcp); changed { + meta.SetStatusCondition(&hcluster.Status.Conditions, *condition) + if err := r.Client.Status().Update(ctx, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update status: %w", err) + } + } + + // Bubble up CloudResourcesDestroyed condition from the hostedControlPlane. + // We set this condition even if the HC is being deleted, so we can construct SLIs for deletion times. + { + if hcp != nil && !hcp.DeletionTimestamp.IsZero() { + freshCondition := &metav1.Condition{ + Type: string(hyperv1.CloudResourcesDestroyed), + Status: metav1.ConditionUnknown, + Reason: hyperv1.StatusUnknownReason, + ObservedGeneration: hcluster.Generation, + } + + cloudResourcesDestroyedCondition := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.CloudResourcesDestroyed)) + if cloudResourcesDestroyedCondition != nil { + freshCondition = cloudResourcesDestroyedCondition + } + + oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.CloudResourcesDestroyed)) + if oldCondition == nil || oldCondition.Message != freshCondition.Message { + freshCondition.ObservedGeneration = hcluster.Generation + meta.SetStatusCondition(&hcluster.Status.Conditions, *freshCondition) + // Persist status updates + if err := r.Client.Status().Update(ctx, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update status: %w", err) + } + } + } + } + + var hcDestroyGracePeriod time.Duration + + if gracePeriodString := hcluster.Annotations[hyperv1.HCDestroyGracePeriodAnnotation]; len(gracePeriodString) > 0 { + hcDestroyGracePeriod, err = time.ParseDuration(gracePeriodString) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to parse %s annotation: %w", hyperv1.HCDestroyGracePeriodAnnotation, err) + } + } + + // If deleted, clean up and return early. + if !hcluster.DeletionTimestamp.IsZero() { + // This new condition is necessary for OCM personnel to report any cloud dangling objects to the user. + // The grace period is customizable using an annotation called HCDestroyGracePeriodAnnotation. It's a time.Duration annotation. + // This annotation will create a new condition called HostedClusterDestroyed which in conjunction with CloudResourcesDestroyed + // a SRE could determine if there are dangling objects once the HostedCluster is deleted. These cloud dangling objects will remain + // in AWS, and SRE will report them to the final user. + hostedClusterDestroyedCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.HostedClusterDestroyed)) + if hostedClusterDestroyedCondition == nil || hostedClusterDestroyedCondition.Status != metav1.ConditionTrue { + // Keep trying to delete until we know it's safe to finalize. + completed, err := r.delete(ctx, hcluster) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete hostedcluster: %w", err) + } + if !completed { + log.Info("hostedcluster is still deleting", "name", req.NamespacedName) + return ctrl.Result{RequeueAfter: clusterDeletionRequeueDuration}, nil + } + } + + // Once the deletion has occurred, we need to clean up cluster-wide resources + selector := client.MatchingLabelsSelector{Selector: labels.SelectorFromSet(labels.Set{ + controlplanepkioperatormanifests.OwningHostedClusterNamespaceLabel: hcluster.Namespace, + controlplanepkioperatormanifests.OwningHostedClusterNameLabel: hcluster.Name, + })} + var crs rbacv1.ClusterRoleList + if err := r.List(ctx, &crs, selector); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to list cluster roles: %w", err) + } + if len(crs.Items) > 0 { + if err := r.DeleteAllOf(ctx, &rbacv1.ClusterRole{}, selector); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete cluster roles: %w", err) + } + } + var crbs rbacv1.ClusterRoleBindingList + if err := r.List(ctx, &crbs, selector); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to list cluster role bindings: %w", err) + } + if len(crbs.Items) > 0 { + if err := r.DeleteAllOf(ctx, &rbacv1.ClusterRoleBinding{}, selector); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete cluster role bindings: %w", err) + } + } + + // Remove any referenced resource annotations for this hosted cluster from secrets and configmaps + deleteReferencedResourceAnnotation := func(obj client.Object) error { + annotations := obj.GetAnnotations() + if annotations == nil { + return nil + } + key := referencedResourceAnnotationPrefix + hcluster.Name + if _, ok := annotations[key]; !ok { + return nil + } + delete(annotations, key) + obj.SetAnnotations(annotations) + if err := r.Update(ctx, obj); err != nil { + return err + } + return nil + } + + var secretList corev1.SecretList + if err := r.List(ctx, &secretList, client.InNamespace(hcluster.Namespace)); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to list secrets: %w", err) + } + for _, secret := range secretList.Items { + if err := deleteReferencedResourceAnnotation(&secret); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete referenced resource annotation on secret: %w", err) + } + } + + var configmapList corev1.ConfigMapList + if err := r.List(ctx, &configmapList, client.InNamespace(hcluster.Namespace)); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to list configmaps: %w", err) + } + for _, configmap := range configmapList.Items { + if err := deleteReferencedResourceAnnotation(&configmap); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete referenced resource annotation on configmap: %w", err) + } + } + + if hcDestroyGracePeriod > 0 { + if hostedClusterDestroyedCondition == nil { + hostedClusterDestroyedCondition = &metav1.Condition{ + Type: string(hyperv1.HostedClusterDestroyed), + Status: metav1.ConditionTrue, + Message: fmt.Sprintf("Grace period set: %v", hcDestroyGracePeriod), + Reason: hyperv1.WaitingForGracePeriodReason, + LastTransitionTime: metav1.NewTime(time.Now()), + ObservedGeneration: hcluster.Generation, + } + + meta.SetStatusCondition(&hcluster.Status.Conditions, *hostedClusterDestroyedCondition) + if err := r.Client.Status().Update(ctx, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update status: %w", err) + } + log.Info("Waiting for grace period", "gracePeriod", hcDestroyGracePeriod) + return ctrl.Result{RequeueAfter: hcDestroyGracePeriod}, nil + } + + if time.Since(hostedClusterDestroyedCondition.LastTransitionTime.Time) < hcDestroyGracePeriod { + log.Info("Waiting for grace period", "gracePeriod", hcDestroyGracePeriod) + return ctrl.Result{RequeueAfter: hcDestroyGracePeriod - time.Since(hostedClusterDestroyedCondition.LastTransitionTime.Time)}, nil + } + log.Info("grace period finished", "gracePeriod", hcDestroyGracePeriod) + } + + // Now we can remove the finalizer. + if controllerutil.ContainsFinalizer(hcluster, HostedClusterFinalizer) { + controllerutil.RemoveFinalizer(hcluster, HostedClusterFinalizer) + if err := r.Update(ctx, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to remove finalizer from hostedcluster: %w", err) + } + } + + log.Info("Deleted hostedcluster", "name", req.NamespacedName) + return ctrl.Result{}, nil + } + + // Part zero: fix up conversion + originalSpec := hcluster.Spec.DeepCopy() + + // Reconcile converted AWS roles. + if hcluster.Spec.Platform.AWS != nil { + if err := r.dereferenceAWSRoles(ctx, hcluster.Name, &hcluster.Spec.Platform.AWS.RolesRef, hcluster.Namespace); err != nil { + return ctrl.Result{}, err + } + } + if hcluster.Spec.SecretEncryption != nil && hcluster.Spec.SecretEncryption.KMS != nil && hcluster.Spec.SecretEncryption.KMS.AWS != nil { + if strings.HasPrefix(hcluster.Spec.SecretEncryption.KMS.AWS.Auth.AWSKMSRoleARN, "arn-from-secret::") { + secretName := strings.TrimPrefix(hcluster.Spec.SecretEncryption.KMS.AWS.Auth.AWSKMSRoleARN, "arn-from-secret::") + arn, err := r.getARNFromSecret(ctx, hcluster.Name, secretName, hcluster.Namespace) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get ARN from secret %s/%s: %w", hcluster.Namespace, secretName, err) + } + hcluster.Spec.SecretEncryption.KMS.AWS.Auth.AWSKMSRoleARN = arn + } + } + + createOrUpdate := r.createOrUpdate(req) + + // Reconcile platform defaults + if err := r.reconcilePlatformDefaultSettings(ctx, hcluster, createOrUpdate, log); err != nil { + return ctrl.Result{}, err + } + + // Update fields if required. + if !equality.Semantic.DeepEqual(&hcluster.Spec, originalSpec) { + log.Info("Updating deprecated fields for hosted cluster") + return ctrl.Result{}, r.Client.Update(ctx, hcluster) + } + + // Part one: update status + + // Set kubeconfig status + { + kubeConfigSecret := manifests.KubeConfigSecret(hcluster.Namespace, hcluster.Name) + err := r.Client.Get(ctx, client.ObjectKeyFromObject(kubeConfigSecret), kubeConfigSecret) + if err != nil { + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed to reconcile kubeconfig secret: %w", err) + } + } else { + hcluster.Status.KubeConfig = &corev1.LocalObjectReference{Name: kubeConfigSecret.Name} + } + } + + // Reconcile the ICSP/IDMS from the management cluster + err = r.RegistryProvider.Reconcile(ctx, r.Client) + if err != nil { + return ctrl.Result{}, err + } + releaseProvider := r.RegistryProvider.GetReleaseProvider() + registryClientImageMetadataProvider := r.RegistryProvider.GetMetadataProvider() + + pullSecretBytes, err := hyperutil.GetPullSecretBytes(ctx, r.Client, hcluster) + if err != nil { + log.Error(err, "failed to get pull secret") + // Ensure even with a bad pull secret we process pod placement decisions + // before returning the pull secret error for requeue. + hcp = controlplaneoperator.HostedControlPlane(controlPlaneNamespace.Name, hcluster.Name) + isAutoscalingNeeded, autoscaleErr := r.isAutoscalingNeeded(ctx, hcluster) + if autoscaleErr != nil { + log.Error(autoscaleErr, "failed to determine if autoscaler is needed during pull secret recovery, defaulting to true") + isAutoscalingNeeded = true + } + isAWSNodeTerminationHandlerNeeded, nthErr := r.isAWSNodeTerminationHandlerNeeded(ctx, hcluster) + if nthErr != nil { + log.Error(nthErr, "failed to determine if AWS node termination handler is needed during pull secret recovery, defaulting to true") + isAWSNodeTerminationHandlerNeeded = true + } + _, hcpErr := createOrUpdate(ctx, r.Client, hcp, func() error { + // Skip cert annotation resolution during pull secret recovery — it requires + // the pull secret to resolve the CPO image, which is unavailable here. + return reconcileHostedControlPlane(hcp, hcluster, isAutoscalingNeeded, isAWSNodeTerminationHandlerNeeded, + func() (map[string]string, error) { return nil, nil }) + }) + if hcpErr != nil { + log.Error(hcpErr, "failed to reconcile hostedcontrolplane during pull secret recovery") + } + return ctrl.Result{}, fmt.Errorf("failed to get pull secret: %w", err) + } + + controlPlaneOperatorImage, err := hyperutil.GetControlPlaneOperatorImage(ctx, hcluster, releaseProvider, r.HypershiftOperatorImage, pullSecretBytes) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get controlPlaneOperatorImage: %w", err) + } + controlPlaneOperatorImageLabels, err := hyperutil.GetControlPlaneOperatorImageLabels(ctx, hcluster, controlPlaneOperatorImage, pullSecretBytes, registryClientImageMetadataProvider) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get controlPlaneOperatorImageLabels: %w", err) + } + + _, cpoSupportsKASCustomKubeconfig := controlPlaneOperatorImageLabels[controlPlaneOperatorSupportsKASCustomKubeconfigLabel] + + if cpoSupportsKASCustomKubeconfig { + if len(hcluster.Spec.KubeAPIServerDNSName) > 0 { + CustomKubeconfigSecret := manifests.KubeConfigExternalSecret(hcluster.Namespace, hcluster.Name) + err := r.Client.Get(ctx, client.ObjectKeyFromObject(CustomKubeconfigSecret), CustomKubeconfigSecret) + if err != nil { + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed to reconcile external kubeconfig secret: %w", err) + } + } else { + hcluster.Status.CustomKubeconfig = &corev1.LocalObjectReference{Name: CustomKubeconfigSecret.Name} + } + } + } + + // Set kubeadminPassword status + { + explicitOauthConfig := hcluster.Spec.Configuration != nil && hcluster.Spec.Configuration.OAuth != nil + if explicitOauthConfig { + hcluster.Status.KubeadminPassword = nil + } else { + kubeadminPasswordSecret := manifests.KubeadminPasswordSecret(hcluster.Namespace, hcluster.Name) + err := r.Client.Get(ctx, client.ObjectKeyFromObject(kubeadminPasswordSecret), kubeadminPasswordSecret) + if err != nil { + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed to reconcile kubeadmin password secret: %w", err) + } + } else { + hcluster.Status.KubeadminPassword = &corev1.LocalObjectReference{Name: kubeadminPasswordSecret.Name} + } + } + } + + // Set version status + hcluster.Status.Version = computeClusterVersionStatus(r.Clock, hcluster, hcp) + + // Copy the CVO conditions from the HCP. + hcpCVOConditions := map[hyperv1.ConditionType]*metav1.Condition{ + hyperv1.ClusterVersionSucceeding: nil, + hyperv1.ClusterVersionProgressing: nil, + hyperv1.ClusterVersionReleaseAccepted: nil, + hyperv1.ClusterVersionRetrievedUpdates: nil, + hyperv1.ClusterVersionUpgradeable: nil, + hyperv1.ClusterVersionAvailable: nil, + } + if hcp != nil { + hcpCVOConditions = map[hyperv1.ConditionType]*metav1.Condition{ + hyperv1.ClusterVersionSucceeding: meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ClusterVersionFailing)), + hyperv1.ClusterVersionProgressing: meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ClusterVersionProgressing)), + hyperv1.ClusterVersionReleaseAccepted: meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ClusterVersionReleaseAccepted)), + hyperv1.ClusterVersionRetrievedUpdates: meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ClusterVersionRetrievedUpdates)), + hyperv1.ClusterVersionUpgradeable: meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ClusterVersionUpgradeable)), + hyperv1.ClusterVersionAvailable: meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ClusterVersionAvailable)), + } + } + + for conditionType := range hcpCVOConditions { + var hcCVOCondition *metav1.Condition + // Set unknown status. + var unknownStatusMessage string + if hcpCVOConditions[conditionType] == nil { + unknownStatusMessage = "Condition not found in the CVO." + } + + hcCVOCondition = &metav1.Condition{ + Type: string(conditionType), + Status: metav1.ConditionUnknown, + Reason: hyperv1.StatusUnknownReason, + Message: unknownStatusMessage, + ObservedGeneration: hcluster.Generation, + } + + if hcp != nil && hcpCVOConditions[conditionType] != nil { + // Bubble up info from HCP. + hcCVOCondition = hcpCVOConditions[conditionType] + hcCVOCondition.ObservedGeneration = hcluster.Generation + + // Inverse ClusterVersionFailing condition into ClusterVersionSucceeding + // So consumers e.g. UI can categorize as good (True) / bad (False). + if conditionType == hyperv1.ClusterVersionSucceeding { + hcCVOCondition.Type = string(hyperv1.ClusterVersionSucceeding) + var status metav1.ConditionStatus + switch hcpCVOConditions[conditionType].Status { + case metav1.ConditionTrue: + status = metav1.ConditionFalse + case metav1.ConditionFalse: + status = metav1.ConditionTrue + } + hcCVOCondition.Status = status + } + } + + if hcCVOCondition.Type == string(hyperv1.ClusterVersionRetrievedUpdates) && hcCVOCondition.Reason == hyperv1.StatusUnknownReason { + // until all HostedControlPlane controllers understand how to propagate this condition, avoid bothering folks with unknown status in HostedCluster conditions. + meta.RemoveStatusCondition(&hcluster.Status.Conditions, string(hyperv1.ClusterVersionRetrievedUpdates)) + continue + } + + meta.SetStatusCondition(&hcluster.Status.Conditions, *hcCVOCondition) + } + + // Copy the Degraded condition on the hostedcontrolplane + { + condition := &metav1.Condition{ + Type: string(hyperv1.HostedClusterDegraded), + Status: metav1.ConditionUnknown, + Reason: hyperv1.StatusUnknownReason, + Message: "The hosted control plane is not found", + ObservedGeneration: hcluster.Generation, + } + if hcp != nil { + degradedCondition := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.HostedControlPlaneDegraded)) + if degradedCondition != nil { + condition = degradedCondition + condition.Type = string(hyperv1.HostedClusterDegraded) + if condition.Status == metav1.ConditionFalse { + condition.Message = "The hosted cluster is not degraded" + } + } + } + condition.ObservedGeneration = hcluster.Generation + meta.SetStatusCondition(&hcluster.Status.Conditions, *condition) + } + + // Copy the ValidKubeVirtInfraNetworkMTU condition from the HostedControlPlane + if hcluster.Spec.Platform.Type == hyperv1.KubevirtPlatform { + if hcp != nil { + validMtuCondCreated := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ValidKubeVirtInfraNetworkMTU)) + if validMtuCondCreated != nil { + validMtuCondCreated.ObservedGeneration = hcluster.Generation + meta.SetStatusCondition(&hcluster.Status.Conditions, *validMtuCondCreated) + } + } + if err := r.syncKVLiveMigratableCondition(ctx, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update condition: %w", err) + } + } + + // Copy conditions from hostedcontrolplane + { + hcpConditions := []hyperv1.ConditionType{ + hyperv1.EtcdAvailable, + hyperv1.KubeAPIServerAvailable, + hyperv1.InfrastructureReady, + hyperv1.ExternalDNSReachable, + hyperv1.ValidHostedControlPlaneConfiguration, + hyperv1.ValidReleaseInfo, + hyperv1.ValidIDPConfiguration, + hyperv1.HostedClusterRestoredFromBackup, + hyperv1.DataPlaneConnectionAvailable, + hyperv1.ControlPlaneConnectionAvailable, + hyperv1.EtcdBackupSucceeded, + } + + for _, conditionType := range hcpConditions { + condition := &metav1.Condition{ + Type: string(conditionType), + Status: metav1.ConditionUnknown, + Reason: hyperv1.StatusUnknownReason, + Message: "The hosted control plane is not found", + ObservedGeneration: hcluster.Generation, + } + if hcp != nil { + hcpCondition := meta.FindStatusCondition(hcp.Status.Conditions, string(conditionType)) + if hcpCondition != nil { + condition = hcpCondition + } else { + condition.Message = "Condition not found in the HCP" + } + } + condition.ObservedGeneration = hcluster.Generation + meta.SetStatusCondition(&hcluster.Status.Conditions, *condition) + } + } + + // Copy the platform status from the hostedcontrolplane + if hcp != nil { + hcluster.Status.Platform = hcp.Status.Platform + hcluster.Status.AutoNode = hcp.Status.AutoNode + } + + // Copy the control plane version status from the hostedcontrolplane + propagateControlPlaneVersion(hcluster, hcp) + + // Set the AutoNodeEnabled condition reflecting both spec intent and actual component rollout progress. + autoNodeCondition, autoNodeProgressing := r.reconcileAutoNodeEnabledCondition(ctx, hcluster, controlPlaneNamespace.Name) + meta.SetStatusCondition(&hcluster.Status.Conditions, autoNodeCondition) + + // Copy the AWSDefaultSecurityGroupCreated condition from the hostedcontrolplane + if hcluster.Spec.Platform.Type == hyperv1.AWSPlatform { + if hcp != nil { + sgCreated := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.AWSDefaultSecurityGroupCreated)) + if sgCreated != nil { + sgCreated.ObservedGeneration = hcluster.Generation + meta.SetStatusCondition(&hcluster.Status.Conditions, *sgCreated) + } + + validKMSConfig := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ValidAWSKMSConfig)) + if validKMSConfig != nil { + validKMSConfig.ObservedGeneration = hcluster.Generation + meta.SetStatusCondition(&hcluster.Status.Conditions, *validKMSConfig) + } + } + } + + if hcluster.Spec.Platform.Type == hyperv1.AzurePlatform { + if hcp != nil { + validKMSConfig := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.ValidAzureKMSConfig)) + if validKMSConfig != nil { + validKMSConfig.ObservedGeneration = hcluster.Generation + meta.SetStatusCondition(&hcluster.Status.Conditions, *validKMSConfig) + } + } + } + + // Reconcile unmanaged etcd client tls secret validation error status. Note only update status on validation error case to + // provide clear status to the user on the resource without having to look at operator logs. + { + if hcluster.Spec.Etcd.ManagementType == hyperv1.Unmanaged { + unmanagedEtcdTLSClientSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hcluster.GetNamespace(), + Name: hcluster.Spec.Etcd.Unmanaged.TLS.ClientSecret.Name, + }, + } + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(unmanagedEtcdTLSClientSecret), unmanagedEtcdTLSClientSecret); err != nil { + if apierrors.IsNotFound(err) { + unmanagedEtcdTLSClientSecret = nil + } else { + return ctrl.Result{}, fmt.Errorf("failed to get unmanaged etcd tls secret: %w", err) + } + } + meta.SetStatusCondition(&hcluster.Status.Conditions, computeUnmanagedEtcdAvailability(hcluster, unmanagedEtcdTLSClientSecret)) + } + } + + // Set the Available condition + // TODO: This is really setting something that could be more granular like + // HostedControlPlaneAvailable, and then the HostedCluster high-level Available + // condition could be computed as a function of the granular ThingAvailable + // conditions (so that it could incorporate e.g. HostedControlPlane and IgnitionServer + // availability in the ultimate HostedCluster Available condition) + { + availableCondition := computeHostedClusterAvailability(hcluster, hcp) + _, isHasBeenAvailableAnnotationSet := hcluster.Annotations[hcmetrics.HasBeenAvailableAnnotation] + + meta.SetStatusCondition(&hcluster.Status.Conditions, availableCondition) + + if availableCondition.Status == metav1.ConditionTrue && !isHasBeenAvailableAnnotationSet { + original := hcluster.DeepCopy() + + if hcluster.Annotations == nil { + hcluster.Annotations = make(map[string]string) + } + + hcluster.Annotations[hcmetrics.HasBeenAvailableAnnotation] = "true" + + if err := r.Patch(ctx, hcluster, client.MergeFromWithOptions(original)); err != nil { + return ctrl.Result{}, fmt.Errorf("cannot patch hosted cluster with has been available annotation: %w", err) + } + } + } + + // Copy AWSEndpointAvailable and AWSEndpointServiceAvailable conditions from the AWSEndpointServices. + if hcluster.Spec.Platform.Type == hyperv1.AWSPlatform { + hcpNamespace := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) + var awsEndpointServiceList hyperv1.AWSEndpointServiceList + if err := r.List(ctx, &awsEndpointServiceList, &client.ListOptions{Namespace: hcpNamespace}); err != nil { + condition := metav1.Condition{ + Type: string(hyperv1.AWSEndpointAvailable), + Status: metav1.ConditionUnknown, + Reason: hyperv1.NotFoundReason, + Message: fmt.Sprintf("error listing awsendpointservices in namespace %s: %v", hcpNamespace, err), + } + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) + } else { + meta.SetStatusCondition(&hcluster.Status.Conditions, computeAWSEndpointServiceCondition(awsEndpointServiceList, hyperv1.AWSEndpointAvailable)) + meta.SetStatusCondition(&hcluster.Status.Conditions, computeAWSEndpointServiceCondition(awsEndpointServiceList, hyperv1.AWSEndpointServiceAvailable)) + } + } + + // Copy GCPEndpointAvailable and GCPServiceAttachmentAvailable conditions from the GCPPrivateServiceConnect resources. + if hcluster.Spec.Platform.Type == hyperv1.GCPPlatform { + hcpNamespace := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) + var gcpPSCList hyperv1.GCPPrivateServiceConnectList + if err := r.List(ctx, &gcpPSCList, &client.ListOptions{Namespace: hcpNamespace}); err != nil { + condition := metav1.Condition{ + Type: string(hyperv1.GCPEndpointAvailable), + Status: metav1.ConditionUnknown, + Reason: hyperv1.NotFoundReason, + Message: fmt.Sprintf("error listing GCPPrivateServiceConnect in namespace %s: %v", hcpNamespace, err), + } + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) + } else { + meta.SetStatusCondition(&hcluster.Status.Conditions, computeGCPPSCCondition(gcpPSCList, hyperv1.GCPEndpointAvailable)) + meta.SetStatusCondition(&hcluster.Status.Conditions, computeGCPPSCCondition(gcpPSCList, hyperv1.GCPServiceAttachmentAvailable)) + } + } + + // Copy Azure Private Link conditions from the AzurePrivateLinkService resources. + // ARO HCP uses Swift networking, not Private Link Services. + if hcluster.Spec.Platform.Type == hyperv1.AzurePlatform && !netutil.UseSwiftNetworkingHC(hcluster) { + hcpNamespace := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name) + var azPLSList hyperv1.AzurePrivateLinkServiceList + if err := r.List(ctx, &azPLSList, &client.ListOptions{Namespace: hcpNamespace}); err != nil { + condition := metav1.Condition{ + Type: string(hyperv1.AzurePrivateLinkServiceAvailable), + Status: metav1.ConditionUnknown, + Reason: hyperv1.NotFoundReason, + Message: fmt.Sprintf("error listing AzurePrivateLinkService in namespace %s: %v", hcpNamespace, err), + } + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) + } else if len(azPLSList.Items) > 0 { + meta.SetStatusCondition(&hcluster.Status.Conditions, computeAzurePLSCondition(azPLSList, hyperv1.AzurePrivateLinkServiceAvailable)) + meta.SetStatusCondition(&hcluster.Status.Conditions, computeAzurePLSCondition(azPLSList, hyperv1.AzurePLSCreated)) + meta.SetStatusCondition(&hcluster.Status.Conditions, computeAzurePLSCondition(azPLSList, hyperv1.AzureInternalLoadBalancerAvailable)) + meta.SetStatusCondition(&hcluster.Status.Conditions, computeAzurePLSCondition(azPLSList, hyperv1.AzurePrivateEndpointAvailable)) + meta.SetStatusCondition(&hcluster.Status.Conditions, computeAzurePLSCondition(azPLSList, hyperv1.AzurePrivateDNSAvailable)) + } + } + + // Set ValidConfiguration condition + { + condition := metav1.Condition{ + Type: string(hyperv1.ValidHostedClusterConfiguration), + ObservedGeneration: hcluster.Generation, + } + if err := r.validateConfigAndClusterCapabilities(ctx, hcluster); err != nil { + condition.Status = metav1.ConditionFalse + condition.Message = err.Error() + condition.Reason = hyperv1.InvalidConfigurationReason + } else { + condition.Status = metav1.ConditionTrue + condition.Message = "Configuration passes validation" + condition.Reason = hyperv1.AsExpectedReason + } + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) + } + + // Set SupportedHostedCluster condition + { + condition := metav1.Condition{ + Type: string(hyperv1.SupportedHostedCluster), + ObservedGeneration: hcluster.Generation, + } + if err := r.validateHostedClusterSupport(hcluster); err != nil { + condition.Status = metav1.ConditionFalse + condition.Message = err.Error() + condition.Reason = hyperv1.UnsupportedHostedClusterReason + } else { + condition.Status = metav1.ConditionTrue + condition.Message = "HostedCluster is supported by operator configuration" + condition.Reason = hyperv1.AsExpectedReason + } + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) + } + + // Set ValidProxyConfiguration condition + { + condition := metav1.Condition{ + Type: string(hyperv1.ValidProxyConfiguration), + ObservedGeneration: hcluster.Generation, + } + if hcluster.Spec.Configuration != nil && hcluster.Spec.Configuration.Proxy != nil && hcluster.Spec.Configuration.Proxy.TrustedCA.Name != "" { + if err := r.validateProxyConfiguration(ctx, hcluster); err != nil { + condition.Status = metav1.ConditionFalse + condition.Message = fmt.Sprintf("Proxy CA bundle is invalid for HostedCluster %s/%s: %v", hcluster.Namespace, hcluster.Name, err) + condition.Reason = hyperv1.ProxyCABundleInvalidReason + log.Info("Proxy CA bundle validation failed", "namespace", hcluster.Namespace, "name", hcluster.Name, "error", err) + } else { + condition.Status = metav1.ConditionTrue + condition.Message = "Proxy CA bundle is valid" + condition.Reason = hyperv1.AsExpectedReason + } + } else { + // No proxy configured or no CA bundle + condition.Status = metav1.ConditionTrue + condition.Message = "No proxy CA bundle configured" + condition.Reason = hyperv1.AsExpectedReason + } + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) + } + + // Set Ignition Server endpoint + { + serviceStrategy := servicePublishingStrategyByType(hcluster, hyperv1.Ignition) + if serviceStrategy == nil { + // We don't return the error here as reconciling won't solve the input problem. + // An update event will trigger reconciliation. + log.Error(fmt.Errorf("ignition server service strategy not specified"), "") + return ctrl.Result{}, nil + } + switch serviceStrategy.Type { + case hyperv1.Route: + if serviceStrategy.Route != nil && serviceStrategy.Route.Hostname != "" { + hcluster.Status.IgnitionEndpoint = serviceStrategy.Route.Hostname + } else { + ignitionServerRoute := ignitionserver.Route(controlPlaneNamespace.GetName()) + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(ignitionServerRoute), ignitionServerRoute); err != nil { + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed to get ignitionServerRoute: %w", err) + } + } + if ignitionServerRoute.Spec.Host != "" { + hcluster.Status.IgnitionEndpoint = ignitionServerRoute.Spec.Host + } + } + case hyperv1.NodePort: + if serviceStrategy.NodePort == nil { + // We don't return the error here as reconciling won't solve the input problem. + // An update event will trigger reconciliation. + log.Error(fmt.Errorf("nodeport metadata not specified for ignition service"), "") + return ctrl.Result{}, nil + } + ignitionService := ignitionserver.ProxyService(controlPlaneNamespace.GetName()) + if err = r.Client.Get(ctx, client.ObjectKeyFromObject(ignitionService), ignitionService); err != nil { + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed to get ignition proxy service: %w", err) + } else { + // ignition-server-proxy service not found, possible IBM platform or older CPO that doesn't create the service + ignitionService = ignitionserver.Service(controlPlaneNamespace.GetName()) + if err = r.Client.Get(ctx, client.ObjectKeyFromObject(ignitionService), ignitionService); err != nil { + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed to get ignition service: %w", err) + } + } + } + } + if err == nil && serviceFirstNodePortAvailable(ignitionService) { + hcluster.Status.IgnitionEndpoint = fmt.Sprintf("%s:%d", serviceStrategy.NodePort.Address, ignitionService.Spec.Ports[0].NodePort) + } + default: + // We don't return the error here as reconciling won't solve the input problem. + // An update event will trigger reconciliation. + log.Error(fmt.Errorf("unknown service strategy type for ignition service: %s", serviceStrategy.Type), "") + return ctrl.Result{}, nil + } + } + + // Set the Control Plane and OAuth endpoints URL + { + if hcp != nil { + hcluster.Status.ControlPlaneEndpoint = hcp.Status.ControlPlaneEndpoint + + // TODO: (cewong) Remove this hack when we no longer need to support HostedControlPlanes that report + // the wrong port for the route strategy. + if isAPIServerRoute(hcluster) { + hcluster.Status.ControlPlaneEndpoint.Port = 443 + } + hcluster.Status.OAuthCallbackURLTemplate = hcp.Status.OAuthCallbackURLTemplate + } + } + + // Set the ignition server availability condition by checking its deployment. + { + // Assume the server is unavailable unless proven otherwise. + newCondition := metav1.Condition{ + Type: string(hyperv1.IgnitionEndpointAvailable), + Status: metav1.ConditionUnknown, + Reason: hyperv1.StatusUnknownReason, + } + // Check to ensure the deployment exists and is available. + deployment := ignitionserver.Deployment(controlPlaneNamespace.Name) + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(deployment), deployment); err != nil { + if apierrors.IsNotFound(err) { + newCondition = metav1.Condition{ + Type: string(hyperv1.IgnitionEndpointAvailable), + Status: metav1.ConditionFalse, + Reason: hyperv1.NotFoundReason, + Message: "Ignition server deployment not found", + } + } else { + return ctrl.Result{}, fmt.Errorf("failed to get ignition server deployment: %w", err) + } + } else { + // Assume the deployment is unavailable until proven otherwise. + newCondition = metav1.Condition{ + Type: string(hyperv1.IgnitionEndpointAvailable), + Status: metav1.ConditionFalse, + Reason: hyperv1.WaitingForAvailableReason, + Message: "Ignition server deployment is not yet available", + } + for _, cond := range deployment.Status.Conditions { + if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue { + newCondition = metav1.Condition{ + Type: string(hyperv1.IgnitionEndpointAvailable), + Status: metav1.ConditionTrue, + Reason: hyperv1.AsExpectedReason, + Message: "Ignition server deployment is available", + } + break + } + } + } + newCondition.ObservedGeneration = hcluster.Generation + meta.SetStatusCondition(&hcluster.Status.Conditions, newCondition) + } + meta.SetStatusCondition(&hcluster.Status.Conditions, hyperutil.GenerateReconciliationActiveCondition(hcluster.Spec.PausedUntil, hcluster.Generation)) + + // Set ValidReleaseImage condition + { + condition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.ValidReleaseImage)) + + // This check can be expensive looking up release image versions + // (hopefully they are cached). Skip if we have already observed for + // this generation. + if condition == nil || condition.ObservedGeneration != hcluster.Generation || condition.Status != metav1.ConditionTrue { + condition := metav1.Condition{ + Type: string(hyperv1.ValidReleaseImage), + ObservedGeneration: hcluster.Generation, + } + err := r.validateReleaseImage(ctx, hcluster, releaseProvider) + if err != nil { + condition.Status = metav1.ConditionFalse + condition.Message = err.Error() + + if apierrors.IsNotFound(err) { + condition.Reason = hyperv1.SecretNotFoundReason + } else { + condition.Reason = hyperv1.InvalidImageReason + } + } else { + condition.Status = metav1.ConditionTrue + condition.Message = "Release image is valid" + condition.Reason = hyperv1.AsExpectedReason + } + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) + } + } + + // Set HostedCluster payload arch + payloadArch, err := hyperutil.DetermineHostedClusterPayloadArch(ctx, r.Client, hcluster, registryClientImageMetadataProvider) + if err != nil { + condition := metav1.Condition{ + Type: string(hyperv1.ValidReleaseImage), + ObservedGeneration: hcluster.Generation, + } + condition.Status = metav1.ConditionFalse + condition.Message = err.Error() + condition.Reason = hyperv1.PayloadArchNotFoundReason + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) + + return ctrl.Result{}, err + } + + hcluster.Status.PayloadArch = payloadArch + + releaseImage, err := r.lookupReleaseImage(ctx, hcluster, releaseProvider) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to lookup release image: %w", err) + } + // Set Progressing condition + { + condition := metav1.Condition{ + Type: string(hyperv1.HostedClusterProgressing), + ObservedGeneration: hcluster.Generation, + Status: metav1.ConditionFalse, + Message: "HostedCluster is at expected version", + Reason: hyperv1.AsExpectedReason, + } + refWithDigest := func() (string, error) { + _, ref, err := registryClientImageMetadataProvider.GetDigest(ctx, hcluster.Spec.Release.Image, pullSecretBytes) + if err != nil { + return "", err + } + return ref.String(), nil + } + + progressing, err := isProgressing(hcluster, releaseImage, refWithDigest) + if err != nil { + condition.Status = metav1.ConditionFalse + condition.Message = err.Error() + condition.Reason = hyperv1.BlockedReason + } + if progressing { + condition.Status = metav1.ConditionTrue + condition.Message = "HostedCluster is deploying, upgrading, or reconfiguring" + condition.Reason = "Progressing" + } + meta.SetStatusCondition(&hcluster.Status.Conditions, condition) + } + + // Copy the configuration status from the hostedcontrolplane + if hcp != nil { + hcluster.Status.Configuration = hcp.Status.Configuration + } + + // Persist status updates + if err := r.Client.Status().Update(ctx, hcluster); err != nil { + if apierrors.IsConflict(err) { + return ctrl.Result{Requeue: true}, nil + } + return ctrl.Result{}, fmt.Errorf("failed to update status: %w", err) + } + + // Part two: reconcile the state of the world + + // Ensure the cluster has a finalizer for cleanup and update right away. + if !controllerutil.ContainsFinalizer(hcluster, HostedClusterFinalizer) { + controllerutil.AddFinalizer(hcluster, HostedClusterFinalizer) + if err := r.Update(ctx, hcluster); err != nil { + if apierrors.IsConflict(err) { + return ctrl.Result{Requeue: true}, nil + } + return ctrl.Result{}, fmt.Errorf("failed to add finalizer to cluster: %w", err) + } + } + + // Ensure the CAPI deployments have finalizers + if err := r.reconcileCAPIFinalizers(ctx, hcluster, false); err != nil { + return ctrl.Result{}, fmt.Errorf("error adding finalizers to CAPI deployments: %w", err) + } + + // if paused: ensure associated HostedControlPlane (if it exists) is also paused and stop reconciliation + if isPaused, duration := hyperutil.IsReconciliationPaused(log, hcluster.Spec.PausedUntil); isPaused { + if err := pauseHostedControlPlane(ctx, r.Client, hcp, hcluster.Spec.PausedUntil); err != nil { + return ctrl.Result{}, err + } + if err := pauseCAPICluster(ctx, r.Client, hcluster, true); err != nil { + return ctrl.Result{}, err + } + log.Info("Reconciliation paused", "name", req.NamespacedName, "pausedUntil", *hcluster.Spec.PausedUntil) + return ctrl.Result{RequeueAfter: duration}, nil + } + + if err := r.defaultClusterIDsIfNeeded(ctx, hcluster); err != nil { + return ctrl.Result{}, err + } + + if err = r.reconcileCLISecrets(ctx, createOrUpdate, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile the CLI secrets: %w", err) + } + + // Set the infraID as Tag on all created AWS + if err := r.reconcileAWSResourceTags(ctx, hcluster); err != nil { + return ctrl.Result{}, err + } + + // Block here if the cluster configuration does not pass validation + { + validConfig := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.ValidHostedClusterConfiguration)) + if validConfig != nil && validConfig.Status == metav1.ConditionFalse { + // an error should be returned here because the ValidHostedClusterConfiguration status may be transient + return ctrl.Result{}, fmt.Errorf("configuration is invalid: %s", validConfig.Message) + } + supportedHostedCluster := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.SupportedHostedCluster)) + if supportedHostedCluster != nil && supportedHostedCluster.Status == metav1.ConditionFalse { + log.Error(fmt.Errorf("not supported by operator configuration"), "reconciliation is blocked", "message", supportedHostedCluster.Message) + return ctrl.Result{}, nil + } + validReleaseImage := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.ValidReleaseImage)) + if validReleaseImage != nil && validReleaseImage.Status == metav1.ConditionFalse { + if validReleaseImage.Reason == hyperv1.SecretNotFoundReason { + return ctrl.Result{}, fmt.Errorf("%s", validReleaseImage.Message) + } + log.Error(fmt.Errorf("release image is invalid"), "reconciliation is blocked", "message", validReleaseImage.Message) + return ctrl.Result{}, nil + } + upgrading, msg, err := isUpgrading(hcluster, releaseImage) + if upgrading { + if err != nil { + log.Error(err, "reconciliation is blocked", "message", validReleaseImage.Message) + return ctrl.Result{}, nil + } + if msg != "" { + log.Info(msg) + } + } + } + + cpoHasUtilities := false + if _, hasLabel := controlPlaneOperatorImageLabels[controlPlaneOperatorSubcommandsLabel]; hasLabel { + cpoHasUtilities = true + } + utilitiesImage := controlPlaneOperatorImage + if !cpoHasUtilities { + utilitiesImage = r.HypershiftOperatorImage + } + + _, controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel := controlPlaneOperatorImageLabels[controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel] + _, controlPlanePKIOperatorSignsCSRs := controlPlaneOperatorImageLabels[controlPlanePKIOperatorSignsCSRsLabel] + _, useRestrictedPSA := controlPlaneOperatorImageLabels[useRestrictedPodSecurityLabel] + _, defaultToControlPlaneV2 := controlPlaneOperatorImageLabels[defaultToControlPlaneV2Label] + + // Reconcile the hosted cluster namespace + _, err = createOrUpdate(ctx, r.Client, controlPlaneNamespace, func() error { + if controlPlaneNamespace.Labels == nil { + controlPlaneNamespace.Labels = make(map[string]string) + } + controlPlaneNamespace.Labels[ControlPlaneNamespaceLabelKey] = "true" + + // Set pod security labels on HCP namespace + psaOverride := hcluster.Annotations[hyperv1.PodSecurityAdmissionLabelOverrideAnnotation] + if psaOverride != "" { + controlPlaneNamespace.Labels["pod-security.kubernetes.io/enforce"] = psaOverride + controlPlaneNamespace.Labels["pod-security.kubernetes.io/audit"] = psaOverride + controlPlaneNamespace.Labels["pod-security.kubernetes.io/warn"] = psaOverride + } else if useRestrictedPSA { + controlPlaneNamespace.Labels["pod-security.kubernetes.io/enforce"] = "restricted" + controlPlaneNamespace.Labels["pod-security.kubernetes.io/audit"] = "restricted" + controlPlaneNamespace.Labels["pod-security.kubernetes.io/warn"] = "restricted" + } else { + controlPlaneNamespace.Labels["pod-security.kubernetes.io/enforce"] = "privileged" + controlPlaneNamespace.Labels["pod-security.kubernetes.io/audit"] = "privileged" + controlPlaneNamespace.Labels["pod-security.kubernetes.io/warn"] = "privileged" + } + controlPlaneNamespace.Labels["security.openshift.io/scc.podSecurityLabelSync"] = "false" + + // Enable monitoring for hosted control plane namespaces + if r.EnableOCPClusterMonitoring { + controlPlaneNamespace.Labels["openshift.io/cluster-monitoring"] = "true" + } + + if r.SetDefaultSecurityContext { + // Only set the SecurtyContext UID annotation if it's not already set. + _, ok := controlPlaneNamespace.Annotations[DefaultSecurityContextUIDAnnnotation] + if !ok { + uid, err := getNextAvailableSecurityContextUID(ctx, r.Client) + if err != nil { + return fmt.Errorf("failed to get next available SecurityContext UID: %w", err) + } + if controlPlaneNamespace.Annotations == nil { + controlPlaneNamespace.Annotations = make(map[string]string) + } + controlPlaneNamespace.Annotations[DefaultSecurityContextUIDAnnnotation] = strconv.FormatInt(uid, 10) + } + } + + // Enable observability operator monitoring + metrics.EnableOBOMonitoring(controlPlaneNamespace) + + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile namespace: %w", err) + } + + p, err := platform.GetPlatform(ctx, hcluster, releaseProvider, utilitiesImage, pullSecretBytes) + if err != nil { + return ctrl.Result{}, err + } + + // Reconcile Platform specifics. + { + if err := p.ReconcileCredentials(ctx, r.Client, createOrUpdate, hcluster, controlPlaneNamespace.Name); err != nil { + meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{ + Type: string(hyperv1.PlatformCredentialsFound), + Status: metav1.ConditionFalse, + Reason: hyperv1.PlatformCredentialsNotFoundReason, + ObservedGeneration: hcluster.Generation, + Message: err.Error(), + }) + if statusErr := r.Client.Status().Update(ctx, hcluster); statusErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile platform credentials: %w, failed to update status: %w", err, statusErr) + } + return ctrl.Result{}, fmt.Errorf("failed to reconcile platform credentials: %w", err) + } + if !meta.IsStatusConditionTrue(hcluster.Status.Conditions, string(hyperv1.PlatformCredentialsFound)) { + meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{ + Type: string(hyperv1.PlatformCredentialsFound), + Status: metav1.ConditionTrue, + Reason: hyperv1.AsExpectedReason, + ObservedGeneration: hcluster.Generation, + Message: "Required platform credentials are found", + }) + if statusErr := r.Client.Status().Update(ctx, hcluster); statusErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile platform credentials: %w, failed to update status: %w", err, statusErr) + } + } + } + + // Set the HostedCluster restored from backup condition + { + if _, exists := hcluster.Annotations[hyperv1.HostedClusterRestoredFromBackupAnnotation]; exists { + freshCondition := &metav1.Condition{ + Type: string(hyperv1.HostedClusterRestoredFromBackup), + Reason: hyperv1.RecoveryFinishedReason, + Status: metav1.ConditionUnknown, + ObservedGeneration: hcluster.Generation, + } + + if hcp != nil { + hostedClusterRestoredFromBackupCondition := meta.FindStatusCondition(hcp.Status.Conditions, string(hyperv1.HostedClusterRestoredFromBackup)) + if hostedClusterRestoredFromBackupCondition != nil { + freshCondition = hostedClusterRestoredFromBackupCondition + } + } + + oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.HostedClusterRestoredFromBackup)) + + // Preserve previous status if we can no longer determine the status + if oldCondition != nil && freshCondition.Status == metav1.ConditionUnknown { + freshCondition.Status = oldCondition.Status + } + + // If the condition is not set, or the status is different, set the condition + if oldCondition == nil || oldCondition.Status != freshCondition.Status { + freshCondition.ObservedGeneration = hcluster.Generation + } + + // If the condition is true, delete the hc annotation. It will be eventually bubbled down to the hcp. + if freshCondition.Status == metav1.ConditionTrue { + hclusterAnnotations := hcluster.GetAnnotations() + delete(hclusterAnnotations, hyperv1.HostedClusterRestoredFromBackupAnnotation) + hcluster.SetAnnotations(hclusterAnnotations) + if err := r.Client.Update(ctx, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to remove annotations %v: %w", string(hyperv1.HostedClusterRestoredFromBackup), err) + } + } + + // Persist status updates + meta.SetStatusCondition(&hcluster.Status.Conditions, *freshCondition) + if err := r.Client.Status().Update(ctx, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to update status %v: %w", string(hyperv1.HostedClusterRestoredFromBackup), err) + } + } + } + + // Reconcile the HostedControlPlane pull secret by resolving the source secret + // reference from the HostedCluster and syncing the secret in the control plane namespace. + { + var src corev1.Secret + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.PullSecret.Name}, &src); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get pull secret %s: %w", hcluster.Spec.PullSecret.Name, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + dst := controlplaneoperator.PullSecret(controlPlaneNamespace.Name) + _, err = createOrUpdate(ctx, r.Client, dst, func() error { + srcData, srcHasData := src.Data[".dockerconfigjson"] + if !srcHasData { + return fmt.Errorf("hostedcluster pull secret %q must have a .dockerconfigjson key", src.Name) + } + dst.Type = corev1.SecretTypeDockerConfigJson + if dst.Data == nil { + dst.Data = map[string][]byte{} + } + dst.Data[".dockerconfigjson"] = srcData + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile pull secret: %w", err) + } + } + + // Reconcile the HostedControlPlane Secret Encryption Info + if hcluster.Spec.SecretEncryption != nil { + log.Info("Reconciling secret encryption configuration") + switch hcluster.Spec.SecretEncryption.Type { + case hyperv1.AESCBC: + if hcluster.Spec.SecretEncryption.AESCBC == nil || len(hcluster.Spec.SecretEncryption.AESCBC.ActiveKey.Name) == 0 { + log.Error(fmt.Errorf("aescbc metadata is nil"), "") + // don't return error here as reconciling won't fix input error + return ctrl.Result{}, nil + } + var src corev1.Secret + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.SecretEncryption.AESCBC.ActiveKey.Name}, &src); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get active aescbc secret %s: %w", hcluster.Spec.SecretEncryption.AESCBC.ActiveKey.Name, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + if _, ok := src.Data[hyperv1.AESCBCKeySecretKey]; !ok { + log.Error(fmt.Errorf("no key field %s specified for aescbc active key secret", hyperv1.AESCBCKeySecretKey), "") + // don't return error here as reconciling won't fix input error + return ctrl.Result{}, nil + } + hostedControlPlaneActiveKeySecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace.Name, + Name: src.Name, + }, + } + _, err = createOrUpdate(ctx, r.Client, hostedControlPlaneActiveKeySecret, func() error { + if hostedControlPlaneActiveKeySecret.Data == nil { + hostedControlPlaneActiveKeySecret.Data = map[string][]byte{} + } + hostedControlPlaneActiveKeySecret.Data[hyperv1.AESCBCKeySecretKey] = src.Data[hyperv1.AESCBCKeySecretKey] + hostedControlPlaneActiveKeySecret.Type = corev1.SecretTypeOpaque + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling aescbc active key: %w", err) + } + if hcluster.Spec.SecretEncryption.AESCBC.BackupKey != nil && len(hcluster.Spec.SecretEncryption.AESCBC.BackupKey.Name) > 0 { + var src corev1.Secret + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.SecretEncryption.AESCBC.BackupKey.Name}, &src); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get backup aescbc secret %s: %w", hcluster.Spec.SecretEncryption.AESCBC.BackupKey.Name, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + if _, ok := src.Data[hyperv1.AESCBCKeySecretKey]; !ok { + log.Error(fmt.Errorf("no key field %s specified for aescbc backup key secret", hyperv1.AESCBCKeySecretKey), "") + // don't return error here as reconciling won't fix input error + return ctrl.Result{}, nil + } + hostedControlPlaneBackupKeySecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace.Name, + Name: src.Name, + }, + } + _, err = createOrUpdate(ctx, r.Client, hostedControlPlaneBackupKeySecret, func() error { + if hostedControlPlaneBackupKeySecret.Data == nil { + hostedControlPlaneBackupKeySecret.Data = map[string][]byte{} + } + hostedControlPlaneBackupKeySecret.Data[hyperv1.AESCBCKeySecretKey] = src.Data[hyperv1.AESCBCKeySecretKey] + hostedControlPlaneBackupKeySecret.Type = corev1.SecretTypeOpaque + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling aescbc backup key: %w", err) + } + } + case hyperv1.KMS: + if hcluster.Spec.SecretEncryption.KMS == nil { + log.Error(fmt.Errorf("kms metadata nil"), "") + // don't return error here as reconciling won't fix input error + return ctrl.Result{}, nil + } + if err := p.ReconcileSecretEncryption(ctx, r.Client, createOrUpdate, + hcluster, + controlPlaneNamespace.Name); err != nil { + return ctrl.Result{}, err + } + default: + log.Error(fmt.Errorf("unsupported encryption type %s", hcluster.Spec.SecretEncryption.Type), "") + // don't return error here as reconciling won't fix input error + return ctrl.Result{}, nil + } + } + + // Reconcile the HostedControlPlane audit webhook config if specified + // reference from the HostedCluster and syncing the secret in the control plane namespace. + { + if hcluster.Spec.AuditWebhook != nil && len(hcluster.Spec.AuditWebhook.Name) > 0 { + var src corev1.Secret + if err := r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.GetNamespace(), Name: hcluster.Spec.AuditWebhook.Name}, &src); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get audit webhook config %s: %w", hcluster.Spec.AuditWebhook.Name, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + configData, ok := src.Data[hyperv1.AuditWebhookKubeconfigKey] + if !ok { + return ctrl.Result{}, fmt.Errorf("audit webhook secret does not contain key %s", hyperv1.AuditWebhookKubeconfigKey) + } + + hostedControlPlaneAuditWebhookSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace.Name, + Name: src.Name, + }, + } + _, err = createOrUpdate(ctx, r.Client, hostedControlPlaneAuditWebhookSecret, func() error { + if hostedControlPlaneAuditWebhookSecret.Data == nil { + hostedControlPlaneAuditWebhookSecret.Data = map[string][]byte{} + } + hostedControlPlaneAuditWebhookSecret.Data[hyperv1.AuditWebhookKubeconfigKey] = configData + hostedControlPlaneAuditWebhookSecret.Type = corev1.SecretTypeOpaque + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling audit webhook secret: %w", err) + } + } + } + + // Reconcile the HostedControlPlane SSH secret by resolving the source secret reference + // from the HostedCluster and syncing the secret in the control plane namespace. + if len(hcluster.Spec.SSHKey.Name) > 0 { + var src corev1.Secret + err = r.Client.Get(ctx, client.ObjectKey{Namespace: hcluster.Namespace, Name: hcluster.Spec.SSHKey.Name}, &src) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get hostedcluster SSHKey secret %s: %w", hcluster.Spec.SSHKey.Name, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, &src); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + dest := controlplaneoperator.SSHKey(controlPlaneNamespace.Name) + _, err = createOrUpdate(ctx, r.Client, dest, func() error { + srcData, srcHasData := src.Data["id_rsa.pub"] + if !srcHasData { + return fmt.Errorf("hostedcluster SSHKey secret %q must have a id_rsa.pub key", src.Name) + } + dest.Type = corev1.SecretTypeOpaque + if dest.Data == nil { + dest.Data = map[string][]byte{} + } + dest.Data["id_rsa.pub"] = srcData + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile controlplane SSHKey secret: %w", err) + } + } + + // Reconcile the HostedControlPlane AdditionalTrustBundle ConfigMap by resolving the source reference + // from the HostedCluster and syncing the CM in the control plane namespace. + if err := r.reconcileAdditionalTrustBundle(ctx, hcluster, createOrUpdate, controlPlaneNamespace.Name); err != nil { + return ctrl.Result{}, err + } + + // Reconcile the service account signing key if set + if hcluster.Spec.ServiceAccountSigningKey != nil { + if err := r.reconcileServiceAccountSigningKey(ctx, hcluster, controlPlaneNamespace.Name, createOrUpdate); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile service account signing key: %w", err) + } + } + + // Reconcile etcd client MTLS secret if the control plane is using an unmanaged etcd cluster + if hcluster.Spec.Etcd.ManagementType == hyperv1.Unmanaged { + unmanagedEtcdTLSClientSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hcluster.GetNamespace(), + Name: hcluster.Spec.Etcd.Unmanaged.TLS.ClientSecret.Name, + }, + } + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(unmanagedEtcdTLSClientSecret), unmanagedEtcdTLSClientSecret); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get unmanaged etcd tls secret: %w", err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, unmanagedEtcdTLSClientSecret); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + hostedControlPlaneEtcdClientSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace.Name, + Name: hcluster.Spec.Etcd.Unmanaged.TLS.ClientSecret.Name, + }, + } + if result, err := createOrUpdate(ctx, r.Client, hostedControlPlaneEtcdClientSecret, func() error { + if hostedControlPlaneEtcdClientSecret.Data == nil { + hostedControlPlaneEtcdClientSecret.Data = map[string][]byte{} + } + hostedControlPlaneEtcdClientSecret.Data = unmanagedEtcdTLSClientSecret.Data + hostedControlPlaneEtcdClientSecret.Type = corev1.SecretTypeOpaque + return nil + }); err != nil { + return ctrl.Result{}, fmt.Errorf("failed reconciling etcd client secret: %w", err) + } else { + log.Info("reconciled etcd client mtls secret to control plane namespace", "result", result) + } + } + + // Reconcile the ETCD member recovery + var requeueAfter *time.Duration + if r.EnableEtcdRecovery && + hcluster.Spec.Etcd.ManagementType == hyperv1.Managed && + hcluster.Spec.ControllerAvailabilityPolicy == hyperv1.HighlyAvailable { + var err error + if requeueAfter, err = r.reconcileETCDMemberRecovery(ctx, hcluster, createOrUpdate); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to perform etcd member recovery: %w", err) + } + } + + // Reconcile global config related configmaps and secrets + { + if hcluster.Spec.Configuration != nil { + configMapRefs := configrefs.ConfigMapRefs(hcluster.Spec.Configuration) + for _, configMapRef := range configMapRefs { + sourceCM := &corev1.ConfigMap{} + if err := r.Get(ctx, client.ObjectKey{Namespace: hcluster.Namespace, Name: configMapRef}, sourceCM); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get referenced configmap %s/%s: %w", hcluster.Namespace, configMapRef, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, sourceCM); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + destCM := &corev1.ConfigMap{} + destCM.Name = sourceCM.Name + destCM.Namespace = controlPlaneNamespace.Name + if _, err := createOrUpdate(ctx, r.Client, destCM, func() error { + destCM.Annotations = sourceCM.Annotations + destCM.Labels = sourceCM.Labels + destCM.Data = sourceCM.Data + destCM.BinaryData = sourceCM.BinaryData + destCM.Immutable = sourceCM.Immutable + return nil + }); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile referenced config map %s/%s: %w", destCM.Namespace, destCM.Name, err) + } + } + secretRefs := configrefs.SecretRefs(hcluster.Spec.Configuration) + for _, secretRef := range secretRefs { + sourceSecret := &corev1.Secret{} + if err := r.Get(ctx, client.ObjectKey{Namespace: hcluster.Namespace, Name: secretRef}, sourceSecret); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get referenced secret %s/%s: %w", hcluster.Namespace, secretRef, err) + } + if err := ensureReferencedResourceAnnotation(ctx, r.Client, hcluster.Name, sourceSecret); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to set referenced resource annotation: %w", err) + } + if err := ensureHostedResourcesAreEmpty(ctx, r.Client, hcluster, sourceSecret); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to validate referenced secret %s/%s: %w", hcluster.Namespace, secretRef, err) + } + destSecret := &corev1.Secret{} + destSecret.Name = sourceSecret.Name + destSecret.Namespace = controlPlaneNamespace.Name + if _, err := createOrUpdate(ctx, r.Client, destSecret, func() error { + destSecret.Annotations = sourceSecret.Annotations + destSecret.Labels = sourceSecret.Labels + destSecret.Data = sourceSecret.Data + destSecret.Immutable = sourceSecret.Immutable + destSecret.Type = sourceSecret.Type + return nil + }); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile secret %s/%s: %w", destSecret.Namespace, destSecret.Name, err) + } + } + } + } + + // Get release image version + var releaseImageVersion semver.Version + releaseImageVersion, err = semver.Parse(releaseImage.Version()) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to parse release image version: %w", err) + } + + // Reconcile the HostedControlPlane + isAutoscalingNeeded, err := r.isAutoscalingNeeded(ctx, hcluster) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to determine if autoscaler is needed: %w", err) + } + isAWSNodeTerminationHandlerNeeded, err := r.isAWSNodeTerminationHandlerNeeded(ctx, hcluster) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to determine if AWS node termination handler is needed: %w", err) + } + hcp = controlplaneoperator.HostedControlPlane(controlPlaneNamespace.Name, hcluster.Name) + _, err = createOrUpdate(ctx, r.Client, hcp, func() error { + return reconcileHostedControlPlane(hcp, hcluster, isAutoscalingNeeded, isAWSNodeTerminationHandlerNeeded, + annotationsForCertRenewal(log, + hcp, + shouldCheckForStaleCerts(hcluster, defaultToControlPlaneV2), + r.kasServingCertHashFromSecret(ctx, hcp), + r.kasServingCertHashFromEndpoint(ctx, kasHostAndPortFromHCP(hcp)))) + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile hostedcontrolplane: %w", err) + } + + // Reconcile CAPI Infra CR. + infraCR, err := p.ReconcileCAPIInfraCR(ctx, r.Client, createOrUpdate, + hcluster, + controlPlaneNamespace.Name, + hcp.Status.ControlPlaneEndpoint) + if err != nil { + return ctrl.Result{}, err + } + + // Reconcile cluster prometheus RBAC resources if enabled + if r.EnableOCPClusterMonitoring { + if err := r.reconcileClusterPrometheusRBAC(ctx, createOrUpdate, hcp.Namespace); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile RBAC for OCP cluster prometheus: %w", err) + } + } + + // Reconcile the CAPI Cluster resource + // In the None platform case, there is no CAPI provider/resources so infraCR is nil + if infraCR != nil { + capiCluster := controlplaneoperator.CAPICluster(controlPlaneNamespace.Name, hcluster.Spec.InfraID) + _, err = createOrUpdate(ctx, r.Client, capiCluster, func() error { + return reconcileCAPICluster(capiCluster, hcluster, hcp, infraCR) + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile capi cluster: %w", err) + } + } + + // Reconcile the monitoring dashboard if configured + if r.MonitoringDashboards { + if err := r.reconcileMonitoringDashboard(ctx, createOrUpdate, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile monitoring dashboard: %w", err) + } + } + + // Reconcile the HostedControlPlane kubeconfig if one is reported + if hcp.Status.KubeConfig != nil { + src := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hcp.Namespace, + Name: hcp.Status.KubeConfig.Name, + }, + } + err := r.Client.Get(ctx, client.ObjectKeyFromObject(src), src) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get controlplane kubeconfig secret %q: %w", client.ObjectKeyFromObject(src), err) + } + dest := manifests.KubeConfigSecret(hcluster.Namespace, hcluster.Name) + _, err = createOrUpdate(ctx, r.Client, dest, func() error { + key := hcp.Status.KubeConfig.Key + srcData, srcHasData := src.Data[key] + if !srcHasData { + return fmt.Errorf("controlplane kubeconfig secret %q must have a %q key", client.ObjectKeyFromObject(src), key) + } + dest.Labels = hcluster.Labels + dest.Type = corev1.SecretTypeOpaque + if dest.Data == nil { + dest.Data = map[string][]byte{} + } + dest.Data["kubeconfig"] = srcData + dest.SetOwnerReferences([]metav1.OwnerReference{{ + APIVersion: hyperv1.GroupVersion.String(), + Kind: "HostedCluster", + Name: hcluster.Name, + UID: hcluster.UID, + }}) + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile hostedcluster kubeconfig secret: %w", err) + } + } + + if cpoSupportsKASCustomKubeconfig { + // Reconcile the HostedControlPlane external kubeconfig if one is reported + if len(hcp.Spec.KubeAPIServerDNSName) > 0 { + requeue, err := r.reconcileCustomExternalKubeconfig(ctx, createOrUpdate, hcp, hcluster) + if err != nil { + return ctrl.Result{}, err + } + if requeue != nil { + requeueAfter = requeue + } + } else { + // Delete the custom external kubeconfig secret if it exists and the external name is not set + if hcluster.Status.CustomKubeconfig != nil { + customKubeconfig := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hcluster.Namespace, + Name: hcluster.Status.CustomKubeconfig.Name, + }, + } + if _, err := k8sutil.DeleteIfNeeded(ctx, r.Client, customKubeconfig); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete custom external kubeconfig secret %q: %w", client.ObjectKeyFromObject(customKubeconfig), err) + } + hcluster.Status.CustomKubeconfig = nil + } + } + } + + // Reconcile the HostedControlPlane kubeadminPassword + if hcp.Status.KubeadminPassword != nil { + src := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: hcp.Namespace, + Name: hcp.Status.KubeadminPassword.Name, + }, + } + err := r.Client.Get(ctx, client.ObjectKeyFromObject(src), src) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get controlplane kubeadmin password secret %q: %w", client.ObjectKeyFromObject(src), err) + } + dest := manifests.KubeadminPasswordSecret(hcluster.Namespace, hcluster.Name) + _, err = createOrUpdate(ctx, r.Client, dest, func() error { + dest.Type = corev1.SecretTypeOpaque + dest.Data = map[string][]byte{} + for k, v := range src.Data { + dest.Data[k] = v + } + dest.SetOwnerReferences([]metav1.OwnerReference{{ + APIVersion: hyperv1.GroupVersion.String(), + Kind: "HostedCluster", + Name: hcluster.Name, + UID: hcluster.UID, + }}) + return nil + }) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile hostedcluster kubeconfig secret: %w", err) + } + } else { + KubeadminPasswordSecret := manifests.KubeadminPasswordSecret(hcluster.Namespace, hcluster.Name) + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(KubeadminPasswordSecret), KubeadminPasswordSecret); err != nil { + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("failed to get hostedcluster kubeadmin password secret %q: %w", client.ObjectKeyFromObject(KubeadminPasswordSecret), err) + } + } else { + if err := r.Client.Delete(ctx, KubeadminPasswordSecret); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete hostedcluster kubeadmin password secret %q: %w", client.ObjectKeyFromObject(KubeadminPasswordSecret), err) + } + } + } + + if _, err := r.defaultIngressDomain(ctx); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to determine default ingress domain: %w", err) + } + + // Reconcile SRE metrics config + if err := r.reconcileSREMetricsConfig(ctx, createOrUpdate, hcp); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile SRE metrics config: %w", err) + } + + err = r.reconcileOpenShiftTrustedCAs(ctx, hcp) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile OpenShift trusted CAs: %w", err) + } + + imageProvider := imageprovider.New(releaseImage) + imageProvider.ComponentImages()["token-minter"] = utilitiesImage + imageProvider.ComponentImages()[podspec.AvailabilityProberImageName] = utilitiesImage + + securityContextUID := controlplanecomponent.DefaultSecurityContextUID + if r.SetDefaultSecurityContext { + securityContextUID, err = strconv.ParseInt(controlPlaneNamespace.Annotations[DefaultSecurityContextUIDAnnnotation], 10, 64) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to parse SecurityContext UID: %w", err) + } + } + cpContext := controlplanecomponent.ControlPlaneContext{ + Context: ctx, + Client: r.Client, + ApplyProvider: upsert.NewApplyProvider(r.EnableCIDebugOutput), + HCP: hcp, + SetDefaultSecurityContext: r.SetDefaultSecurityContext, + DefaultSecurityContextUID: securityContextUID, + EnableCIDebugOutput: r.EnableCIDebugOutput, + MetricsSet: r.MetricsSet, + ReleaseImageProvider: imageProvider, + OmitOwnerReference: true, + } + + // Reconcile the control plane operator + err = r.reconcileControlPlaneOperator(cpContext, createOrUpdate, hcluster, controlPlaneOperatorImage, utilitiesImage, cpoHasUtilities, r.CertRotationScale, releaseImageVersion, releaseProvider) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile control plane operator: %w", err) + } + + // Reconcile the CAPI manager components + err = r.reconcileCAPIManager(cpContext, createOrUpdate, hcluster) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile capi manager: %w", err) + } + + // Reconcile the CAPI provider components + if err = r.reconcileCAPIProvider(cpContext, hcluster, hcp, p); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile capi provider: %w", err) + } + + if _, pkiDisabled := hcp.Annotations[hyperv1.DisablePKIReconciliationAnnotation]; controlPlanePKIOperatorSignsCSRs && !pkiDisabled { + // Reconcile the control plane PKI operator RBAC - the CPO does not have rights to do this itself + err = r.reconcileControlPlanePKIOperatorRBAC(ctx, createOrUpdate, hcluster) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile control plane PKI operator RBAC: %w", err) + } + } + + // Reconcile the network policies + if err = r.reconcileNetworkPolicies(ctx, log, createOrUpdate, hcluster, hcp, releaseImageVersion, controlPlaneOperatorAppliesManagementKASNetworkPolicyLabel); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile network policies: %w", err) + } + + // Reconcile platform specific items + switch hcluster.Spec.Platform.Type { + case hyperv1.KubevirtPlatform: + if hcluster.Spec.Platform.Kubevirt != nil && hcluster.Spec.Platform.Kubevirt.Credentials != nil { + if err := r.Client.Status().Update(ctx, hcluster); err != nil { + if apierrors.IsConflict(err) { + return ctrl.Result{Requeue: true}, nil + } + return ctrl.Result{}, fmt.Errorf("failed to update status after network policy RBAC check: %w", err) + } + } + err = r.reconcileKubevirtCSIClusterRBAC(ctx, createOrUpdate, hcluster) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile kubevirt CSI cluster wide RBAC: %w", err) + } + case hyperv1.AWSPlatform: + if err := r.reconcileOIDCDocumentsWithStatus(ctx, hcluster, func() error { + return r.reconcileAWSOIDCDocuments(ctx, log, hcluster, hcp) + }); err != nil { + return ctrl.Result{}, err + } + case hyperv1.AzurePlatform: + if azureutil.IsAroHCPByHC(hcluster) { + // Reconcile CPO SecretProviderClass CR + cpoSecretProviderClass := cpomanifests.ManagedAzureSecretProviderClass(config.ManagedAzureCPOSecretProviderClassName, hcp.Namespace) + if _, err = createOrUpdate(ctx, r, cpoSecretProviderClass, func() error { + secretproviderclass.ReconcileManagedAzureSecretProviderClass(cpoSecretProviderClass, hcp, hcp.Spec.Platform.Azure.AzureAuthenticationConfig.ManagedIdentities.ControlPlane.ControlPlaneOperator) + return nil + }); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile control plane operator secret provider class: %w", err) + } + + // Reconcile CAPZ SecretProviderClass CR + nodepoolMgmtSecretProviderClass := cpomanifests.ManagedAzureSecretProviderClass(config.ManagedAzureNodePoolMgmtSecretProviderClassName, hcp.Namespace) + if _, err = createOrUpdate(ctx, r, nodepoolMgmtSecretProviderClass, func() error { + secretproviderclass.ReconcileManagedAzureSecretProviderClass(nodepoolMgmtSecretProviderClass, hcp, hcp.Spec.Platform.Azure.AzureAuthenticationConfig.ManagedIdentities.ControlPlane.NodePoolManagement) + return nil + }); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile nodepool management secret provider class: %w", err) + } + } + + if hcluster.Spec.SecretEncryption != nil && hcluster.Spec.SecretEncryption.KMS != nil { + if azureutil.IsAroHCPByHC(hcluster) { + // Reconcile KMS SecretProviderClass CR + kmsSecretProviderClass := cpomanifests.ManagedAzureSecretProviderClass(config.ManagedAzureKMSSecretProviderClassName, hcp.Namespace) + if _, err := createOrUpdate(ctx, r, kmsSecretProviderClass, func() error { + secretproviderclass.ReconcileManagedAzureSecretProviderClass(kmsSecretProviderClass, hcp, hcp.Spec.SecretEncryption.KMS.Azure.KMS) + return nil + }); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile KMS SecretProviderClass: %w", err) + } + } + } + case hyperv1.GCPPlatform: + if err := r.reconcileOIDCDocumentsWithStatus(ctx, hcluster, func() error { + return r.reconcileGCPOIDCDocuments(ctx, log, hcluster, hcp) + }); err != nil { + return ctrl.Result{}, err + } + } + + if err := r.reconcileKarpenterOperator(cpContext, hcluster, r.HypershiftOperatorImage, controlPlaneOperatorImage); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile karpenter operator: %w", err) + } + + if pubEndpointRequeue, err := r.reconcilePublicEndpointExposedCondition(ctx, hcluster); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to reconcile PublicEndpointExposed condition: %w", err) + } else if pubEndpointRequeue != nil && (requeueAfter == nil || *pubEndpointRequeue < *requeueAfter) { + requeueAfter = pubEndpointRequeue + } + + log.Info("successfully reconciled") + result := ctrl.Result{} + if requeueAfter != nil { + result.RequeueAfter = *requeueAfter + } + if autoNodeProgressing { + autoNodeRequeue := 15 * time.Second + if result.RequeueAfter == 0 || autoNodeRequeue < result.RequeueAfter { + result.RequeueAfter = autoNodeRequeue + } + } + return result, nil +} diff --git a/hypershift-operator/controllers/hostedcluster/reconcile_report.go b/hypershift-operator/controllers/hostedcluster/reconcile_report.go index 5cf1d99730bb..50240cd970aa 100644 --- a/hypershift-operator/controllers/hostedcluster/reconcile_report.go +++ b/hypershift-operator/controllers/hostedcluster/reconcile_report.go @@ -31,23 +31,15 @@ type operationResult struct { type reconcileReport struct { results []operationResult requeueAfter *time.Duration - legacy bool } // execute runs fn and records its result. -// In legacy mode, any prior failure blocks execution. func (r *reconcileReport) execute(name string, cat operationCategory, fn func() error) { - if r.legacy && r.hasAnyFailure() { - r.results = append(r.results, operationResult{name: name, category: cat, blocked: true}) - return - } r.results = append(r.results, operationResult{name: name, category: cat, err: fn()}) } // executeOrBlock runs fn and records its result as nonCritical, or records -// a blocked entry if a prior operation should block downstream work. -// In fail-fast mode, any prior failure blocks; in error-collecting mode, -// only critical failures block. +// a blocked entry if a prior critical operation failed. func (r *reconcileReport) executeOrBlock(name string, fn func() error) { if r.shouldBlock() { r.results = append(r.results, operationResult{name: name, category: nonCritical, blocked: true}) @@ -66,22 +58,9 @@ func (r *reconcileReport) hasCriticalFailure() bool { return false } -// hasAnyFailure returns true if any operation has actually failed (not blocked). -func (r *reconcileReport) hasAnyFailure() bool { - for _, res := range r.results { - if res.err != nil && !res.blocked { - return true - } - } - return false -} - // shouldBlock returns true if subsequent dependent operations should be skipped. -// In legacy mode, any failure blocks. Otherwise only critical failures block. +// Only critical failures block. func (r *reconcileReport) shouldBlock() bool { - if r.legacy { - return r.hasAnyFailure() - } return r.hasCriticalFailure() } diff --git a/hypershift-operator/controllers/hostedcluster/reconcile_report_test.go b/hypershift-operator/controllers/hostedcluster/reconcile_report_test.go index 729deb29c224..2c797c9827b5 100644 --- a/hypershift-operator/controllers/hostedcluster/reconcile_report_test.go +++ b/hypershift-operator/controllers/hostedcluster/reconcile_report_test.go @@ -112,41 +112,8 @@ func TestReconcileReport(t *testing.T) { }) } -func TestLegacyMode(t *testing.T) { - t.Run("When a non-critical operation fails in legacy mode it should block subsequent execute calls", func(t *testing.T) { - g := NewWithT(t) - report := &reconcileReport{legacy: true} - report.execute("SSHKeySync", nonCritical, func() error { - return fmt.Errorf("key not found") - }) - executed := false - report.execute("GlobalConfigSync", nonCritical, func() error { - executed = true - return nil - }) - - g.Expect(executed).To(BeFalse()) - g.Expect(report.blockedNames()).To(ConsistOf("GlobalConfigSync")) - g.Expect(report.allErrors()).To(HaveLen(1)) - }) - - t.Run("When a non-critical operation fails in legacy mode it should block executeOrBlock calls", func(t *testing.T) { - g := NewWithT(t) - report := &reconcileReport{legacy: true} - report.execute("SSHKeySync", nonCritical, func() error { - return fmt.Errorf("key not found") - }) - executed := false - report.executeOrBlock("OperatorDeployments", func() error { - executed = true - return nil - }) - - g.Expect(executed).To(BeFalse()) - g.Expect(report.blockedNames()).To(ConsistOf("OperatorDeployments")) - }) - - t.Run("When in default mode it should allow non-critical errors without blocking execute", func(t *testing.T) { +func TestBlockingBehavior(t *testing.T) { + t.Run("When a non-critical operation fails it should allow subsequent execute calls", func(t *testing.T) { g := NewWithT(t) report := &reconcileReport{} report.execute("SSHKeySync", nonCritical, func() error { @@ -163,7 +130,7 @@ func TestLegacyMode(t *testing.T) { g.Expect(report.allErrors()).To(HaveLen(1)) }) - t.Run("When in default mode it should allow non-critical errors without blocking executeOrBlock", func(t *testing.T) { + t.Run("When a non-critical operation fails it should allow executeOrBlock calls", func(t *testing.T) { g := NewWithT(t) report := &reconcileReport{} report.execute("SSHKeySync", nonCritical, func() error { @@ -179,7 +146,7 @@ func TestLegacyMode(t *testing.T) { g.Expect(report.blockedNames()).To(BeEmpty()) }) - t.Run("When in default mode it should still block on critical failure", func(t *testing.T) { + t.Run("When a critical operation fails it should block executeOrBlock calls", func(t *testing.T) { g := NewWithT(t) report := &reconcileReport{} report.execute("PullSecretSync", critical, func() error {