@@ -3,8 +3,10 @@ package cluster
33import (
44 "context"
55 "encoding/json"
6+ "errors"
67 "fmt"
78 "strings"
9+ "time"
810
911 "github.com/Masterminds/semver"
1012 "github.com/cybertec-postgresql/cybertec-pg-operator/pkg/spec"
@@ -29,6 +31,8 @@ const (
2931 majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure"
3032)
3133
34+ var errUpgradePrepNotReady = errors .New ("cluster not ready for upgrade" )
35+
3236// IsBiggerPostgresVersion Compare two Postgres version numbers
3337func IsBiggerPostgresVersion (old string , new string ) bool {
3438 oldN := VersionMap [old ]
@@ -232,12 +236,14 @@ func (c *Cluster) majorVersionUpgrade() error {
232236 continue
233237 }
234238 if checkStreaming && member .State != "streaming" {
235- c .logger .Infof ("skipping major version upgrade, replica %s is not streaming from primary" , member .Name )
236- return nil
239+ // c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name)
240+ // return nil
241+ return fmt .Errorf ("%w: replica %s is not streaming (state: %s)" , errUpgradePrepNotReady , member .Name , member .State )
237242 }
238243 if member .Lag > 16 * 1024 * 1024 {
239- c .logger .Infof ("skipping major version upgrade, replication lag on member %s is too high" , member .Name )
240- return nil
244+ // c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name)
245+ // return nil
246+ return fmt .Errorf ("%w: replication lag on member %s is too high (%d bytes)" , errUpgradePrepNotReady , member .Name , member .Lag )
241247 }
242248 }
243249
@@ -246,11 +252,10 @@ func (c *Cluster) majorVersionUpgrade() error {
246252 if allRunning {
247253 c .logger .Infof ("healthy cluster ready to upgrade, current: %d desired: %d" , c .currentMajorVersion , desiredVersion )
248254 if c .currentMajorVersion < desiredVersion {
249- defer func () error {
250- if err = c .criticalOperationLabel (pods , nil ); err != nil {
251- return fmt . Errorf ("failed to remove critical-operation label: %s " , err )
255+ defer func () {
256+ if err : = c .criticalOperationLabel (pods , nil ); err != nil {
257+ c . logger . Errorf ("failed to remove critical-operation label: %v " , err )
252258 }
253- return nil
254259 }()
255260 val := "true"
256261 if err = c .criticalOperationLabel (pods , & val ); err != nil {
@@ -260,9 +265,9 @@ func (c *Cluster) majorVersionUpgrade() error {
260265 podName := & spec.NamespacedName {Namespace : masterPod .Namespace , Name : masterPod .Name }
261266 c .logger .Infof ("triggering major version upgrade on pod %s of %d pods" , masterPod .Name , numberOfPods )
262267 c .eventRecorder .Eventf (c .GetReference (), v1 .EventTypeNormal , "Major Version Upgrade" , "starting major version upgrade on pod %s of %d pods" , masterPod .Name , numberOfPods )
263- upgradeCommand := fmt .Sprintf ("set -o pipefail && /usr/bin/python3 /scripts/inplace_upgrade.py %d 2>&1 | tee last_upgrade.log" , numberOfPods )
268+ upgradeCommand := fmt .Sprintf ("set -o pipefail && /usr/local/ bin/python3 /scripts/inplace_upgrade.py %d 2>&1 | tee last_upgrade.log" , numberOfPods )
264269
265- c .logger .Debug ("checking if the spilo image runs with root or non-root (check for user id=0)" )
270+ c .logger .Debug ("checking if the container runs with root or non-root (check for user id=0)" )
266271 resultIdCheck , errIdCheck := c .ExecCommand (podName , "/bin/bash" , "-c" , "/usr/bin/id -u" )
267272 if errIdCheck != nil {
268273 c .eventRecorder .Eventf (c .GetReference (), v1 .EventTypeWarning , "Major Version Upgrade" , "checking user id to run upgrade from %d to %d FAILED: %v" , c .currentMajorVersion , desiredVersion , errIdCheck )
@@ -294,3 +299,26 @@ func (c *Cluster) majorVersionUpgrade() error {
294299
295300 return nil
296301}
302+
303+ func (c * Cluster ) executeMajorVersionUpgrade () error {
304+ maxRetries := 6
305+ var lastErr error
306+
307+ for i := 0 ; i < maxRetries ; i ++ {
308+ lastErr = c .majorVersionUpgrade ()
309+ if lastErr == nil {
310+ return nil
311+ }
312+
313+ if errors .Is (lastErr , errUpgradePrepNotReady ) {
314+ c .logger .Warnf ("Major version upgrade deferred (attempt %d/%d): %v. Retrying in 15s..." , i + 1 , maxRetries , lastErr )
315+
316+ if i < maxRetries - 1 {
317+ time .Sleep (15 * time .Second )
318+ continue
319+ }
320+ }
321+ return lastErr
322+ }
323+ return lastErr
324+ }
0 commit comments