diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 1ee54d75..54e91abc 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -1245,13 +1245,18 @@ func (c *Cluster) Update(oldSpec, newSpec *cpov1.Postgresql) error { } if !updateFailed { - // Major version upgrade must only fire after success of earlier operations and should stay last - if err := c.majorVersionUpgrade(); err != nil { - c.logger.Errorf("major version upgrade failed: %v", err) + if upgradeErr := c.executeMajorVersionUpgrade(); upgradeErr != nil { + c.logger.Errorf("major version upgrade failed: %v", upgradeErr) updateFailed = true } } + if updateFailed { + c.logger.Errorf("Update for cluster %s/%s finished with errors..", c.Namespace, c.Name) + } else { + c.logger.Infof("Update for cluster %s/%s completed successfully.", c.Namespace, c.Name) + } + return nil } diff --git a/pkg/cluster/majorversionupgrade.go b/pkg/cluster/majorversionupgrade.go index 61edf190..5f204af0 100644 --- a/pkg/cluster/majorversionupgrade.go +++ b/pkg/cluster/majorversionupgrade.go @@ -3,8 +3,10 @@ package cluster import ( "context" "encoding/json" + "errors" "fmt" "strings" + "time" "github.com/Masterminds/semver" "github.com/cybertec-postgresql/cybertec-pg-operator/pkg/spec" @@ -29,6 +31,8 @@ const ( majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure" ) +var errUpgradePrepNotReady = errors.New("cluster not ready for upgrade") + // IsBiggerPostgresVersion Compare two Postgres version numbers func IsBiggerPostgresVersion(old string, new string) bool { oldN := VersionMap[old] @@ -232,12 +236,14 @@ func (c *Cluster) majorVersionUpgrade() error { continue } if checkStreaming && member.State != "streaming" { - c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name) - return nil + // c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name) + // return nil + return fmt.Errorf("%w: replica %s is not streaming (state: %s)", errUpgradePrepNotReady, member.Name, member.State) } if member.Lag > 16*1024*1024 { - c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name) - return nil + // c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name) + // return nil + return fmt.Errorf("%w: replication lag on member %s is too high (%d bytes)", errUpgradePrepNotReady, member.Name, member.Lag) } } @@ -246,11 +252,10 @@ func (c *Cluster) majorVersionUpgrade() error { if allRunning { c.logger.Infof("healthy cluster ready to upgrade, current: %d desired: %d", c.currentMajorVersion, desiredVersion) if c.currentMajorVersion < desiredVersion { - defer func() error { - if err = c.criticalOperationLabel(pods, nil); err != nil { - return fmt.Errorf("failed to remove critical-operation label: %s", err) + defer func() { + if err := c.criticalOperationLabel(pods, nil); err != nil { + c.logger.Errorf("failed to remove critical-operation label: %v", err) } - return nil }() val := "true" if err = c.criticalOperationLabel(pods, &val); err != nil { @@ -260,37 +265,72 @@ func (c *Cluster) majorVersionUpgrade() error { podName := &spec.NamespacedName{Namespace: masterPod.Namespace, Name: masterPod.Name} c.logger.Infof("triggering major version upgrade on pod %s of %d pods", masterPod.Name, numberOfPods) c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "starting major version upgrade on pod %s of %d pods", masterPod.Name, numberOfPods) - upgradeCommand := fmt.Sprintf("set -o pipefail && /usr/bin/python3 /scripts/inplace_upgrade.py %d 2>&1 | tee last_upgrade.log", numberOfPods) - - c.logger.Debug("checking if the spilo image runs with root or non-root (check for user id=0)") + upgradeCommand := fmt.Sprintf("/usr/local/bin/python3 /scripts/inplace_upgrade.py %d 2>&1", numberOfPods) + c.logger.Debug("checking if the container runs with root or non-root (check for user id=0)") resultIdCheck, errIdCheck := c.ExecCommand(podName, "/bin/bash", "-c", "/usr/bin/id -u") if errIdCheck != nil { c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "checking user id to run upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, errIdCheck) } resultIdCheck = strings.TrimSuffix(resultIdCheck, "\n") - var result, scriptErrMsg string + var result string + if resultIdCheck != "0" { c.logger.Infof("user id was identified as: %s, hence default user is non-root already", resultIdCheck) result, err = c.ExecCommand(podName, "/bin/bash", "-c", upgradeCommand) - scriptErrMsg, _ = c.ExecCommand(podName, "/bin/bash", "-c", "tail -n 1 last_upgrade.log") } else { c.logger.Infof("user id was identified as: %s, using su to reach the postgres user", resultIdCheck) result, err = c.ExecCommand(podName, "/bin/su", "postgres", "-c", upgradeCommand) - scriptErrMsg, _ = c.ExecCommand(podName, "/bin/bash", "-c", "tail -n 1 last_upgrade.log") } + if err != nil { isUpgradeSuccess = false c.annotatePostgresResource(isUpgradeSuccess) - c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, scriptErrMsg) - return fmt.Errorf("%s", scriptErrMsg) + + finalErrorMsg := strings.TrimSpace(result) + if finalErrorMsg == "" { + finalErrorMsg = err.Error() + } + + lines := strings.Split(finalErrorMsg, "\n") + if len(lines) > 5 { + finalErrorMsg = strings.Join(lines[len(lines)-5:], " | ") + } + + c.logger.Errorf("Major upgrade failed: %v", err) + c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %s", c.currentMajorVersion, desiredVersion, finalErrorMsg) + + return fmt.Errorf("upgrade script failed: %s", finalErrorMsg) } c.annotatePostgresResource(isUpgradeSuccess) c.logger.Infof("upgrade action triggered and command completed: %s", result[:100]) - c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "upgrade from %d to %d finished", c.currentMajorVersion, desiredVersion) + c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "major version upgrade from version %d to version %d was successfully completed.", c.currentMajorVersion, desiredVersion) } } return nil } + +func (c *Cluster) executeMajorVersionUpgrade() error { + maxRetries := 6 + var lastErr error + + for i := 0; i < maxRetries; i++ { + lastErr = c.majorVersionUpgrade() + if lastErr == nil { + return nil + } + + if errors.Is(lastErr, errUpgradePrepNotReady) { + c.logger.Warnf("Major version upgrade deferred (attempt %d/%d): %v. Retrying in 15s...", i+1, maxRetries, lastErr) + + if i < maxRetries-1 { + time.Sleep(15 * time.Second) + continue + } + } + return lastErr + } + return lastErr +} diff --git a/pkg/cluster/sync.go b/pkg/cluster/sync.go index 7ebcc8b8..11e18adb 100644 --- a/pkg/cluster/sync.go +++ b/pkg/cluster/sync.go @@ -172,6 +172,7 @@ func generateSerialNumber() (*big.Int, error) { // Unlike the update, sync does not error out if some objects do not exist and takes care of creating them. func (c *Cluster) Sync(newSpec *cpov1.Postgresql) error { var err error + syncFailed := false c.mu.Lock() defer c.mu.Unlock() @@ -326,9 +327,21 @@ func (c *Cluster) Sync(newSpec *cpov1.Postgresql) error { } } - // Major version upgrade must only run after success of all earlier operations, must remain last item in sync - if err := c.majorVersionUpgrade(); err != nil { - c.logger.Errorf("major version upgrade failed: %v", err) + if err != nil { + syncFailed = true + } + if !syncFailed { + err = c.executeMajorVersionUpgrade() + if err != nil { + c.logger.Errorf("major version upgrade failed after retries: %v", err) + syncFailed = true + } + } + + if syncFailed { + c.logger.Errorf("Update for cluster %s/%s finished with errors..", c.Namespace, c.Name) + } else { + c.logger.Infof("Update for cluster %s/%s completed successfully.", c.Namespace, c.Name) } return err diff --git a/ui/Dockerfile b/ui/Dockerfile index 63e8817e..37abc7c8 100644 --- a/ui/Dockerfile +++ b/ui/Dockerfile @@ -38,4 +38,4 @@ ARG VERSION=dev RUN sed -i "s/__version__ = .*/__version__ = '${VERSION}'/" /operator_ui/__init__.py WORKDIR / -CMD ["/usr/bin/python3", "-m", "operator_ui"] +CMD ["/usr/local/bin/python3", "-m", "operator_ui"] diff --git a/ui/start_server.sh b/ui/start_server.sh index e2c3980c..7ed2546c 100644 --- a/ui/start_server.sh +++ b/ui/start_server.sh @@ -1,2 +1,2 @@ #!/bin/bash -/usr/bin/python3 -m operator_ui +/usr/local/bin/python3 -m operator_ui