Skip to content

Commit 7e1528c

Browse files
committed
Add Loop to wait with major-upgrade if replica yet not ready (not streaming or to much lag). Ensure we're not retrying upgrade-script in the loop, just retry if requirements not ready
1 parent 0cf4e0f commit 7e1528c

3 files changed

Lines changed: 62 additions & 16 deletions

File tree

pkg/cluster/cluster.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,13 +1245,18 @@ func (c *Cluster) Update(oldSpec, newSpec *cpov1.Postgresql) error {
12451245
}
12461246

12471247
if !updateFailed {
1248-
// Major version upgrade must only fire after success of earlier operations and should stay last
1249-
if err := c.majorVersionUpgrade(); err != nil {
1250-
c.logger.Errorf("major version upgrade failed: %v", err)
1248+
if upgradeErr := c.executeMajorVersionUpgrade(); upgradeErr != nil {
1249+
c.logger.Errorf("major version upgrade failed: %v", upgradeErr)
12511250
updateFailed = true
12521251
}
12531252
}
12541253

1254+
if updateFailed {
1255+
c.logger.Errorf("Update for cluster %s/%s finished with errors..", c.Namespace, c.Name)
1256+
} else {
1257+
c.logger.Infof("Update for cluster %s/%s completed successfully.", c.Namespace, c.Name)
1258+
}
1259+
12551260
return nil
12561261
}
12571262

pkg/cluster/majorversionupgrade.go

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ package cluster
33
import (
44
"context"
55
"encoding/json"
6+
"errors"
67
"fmt"
78
"strings"
9+
"time"
810

911
"github.com/Masterminds/semver"
1012
"github.com/cybertec-postgresql/cybertec-pg-operator/pkg/spec"
@@ -29,6 +31,8 @@ const (
2931
majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure"
3032
)
3133

34+
var errUpgradePrepNotReady = errors.New("cluster not ready for upgrade")
35+
3236
// IsBiggerPostgresVersion Compare two Postgres version numbers
3337
func IsBiggerPostgresVersion(old string, new string) bool {
3438
oldN := VersionMap[old]
@@ -232,12 +236,14 @@ func (c *Cluster) majorVersionUpgrade() error {
232236
continue
233237
}
234238
if checkStreaming && member.State != "streaming" {
235-
c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name)
236-
return nil
239+
// c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name)
240+
// return nil
241+
return fmt.Errorf("%w: replica %s is not streaming (state: %s)", errUpgradePrepNotReady, member.Name, member.State)
237242
}
238243
if member.Lag > 16*1024*1024 {
239-
c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name)
240-
return nil
244+
// c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name)
245+
// return nil
246+
return fmt.Errorf("%w: replication lag on member %s is too high (%d bytes)", errUpgradePrepNotReady, member.Name, member.Lag)
241247
}
242248
}
243249

@@ -246,11 +252,10 @@ func (c *Cluster) majorVersionUpgrade() error {
246252
if allRunning {
247253
c.logger.Infof("healthy cluster ready to upgrade, current: %d desired: %d", c.currentMajorVersion, desiredVersion)
248254
if c.currentMajorVersion < desiredVersion {
249-
defer func() error {
250-
if err = c.criticalOperationLabel(pods, nil); err != nil {
251-
return fmt.Errorf("failed to remove critical-operation label: %s", err)
255+
defer func() {
256+
if err := c.criticalOperationLabel(pods, nil); err != nil {
257+
c.logger.Errorf("failed to remove critical-operation label: %v", err)
252258
}
253-
return nil
254259
}()
255260
val := "true"
256261
if err = c.criticalOperationLabel(pods, &val); err != nil {
@@ -260,9 +265,9 @@ func (c *Cluster) majorVersionUpgrade() error {
260265
podName := &spec.NamespacedName{Namespace: masterPod.Namespace, Name: masterPod.Name}
261266
c.logger.Infof("triggering major version upgrade on pod %s of %d pods", masterPod.Name, numberOfPods)
262267
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "starting major version upgrade on pod %s of %d pods", masterPod.Name, numberOfPods)
263-
upgradeCommand := fmt.Sprintf("set -o pipefail && /usr/bin/python3 /scripts/inplace_upgrade.py %d 2>&1 | tee last_upgrade.log", numberOfPods)
268+
upgradeCommand := fmt.Sprintf("set -o pipefail && /usr/local/bin/python3 /scripts/inplace_upgrade.py %d 2>&1 | tee last_upgrade.log", numberOfPods)
264269

265-
c.logger.Debug("checking if the spilo image runs with root or non-root (check for user id=0)")
270+
c.logger.Debug("checking if the container runs with root or non-root (check for user id=0)")
266271
resultIdCheck, errIdCheck := c.ExecCommand(podName, "/bin/bash", "-c", "/usr/bin/id -u")
267272
if errIdCheck != nil {
268273
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "checking user id to run upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, errIdCheck)
@@ -294,3 +299,26 @@ func (c *Cluster) majorVersionUpgrade() error {
294299

295300
return nil
296301
}
302+
303+
func (c *Cluster) executeMajorVersionUpgrade() error {
304+
maxRetries := 6
305+
var lastErr error
306+
307+
for i := 0; i < maxRetries; i++ {
308+
lastErr = c.majorVersionUpgrade()
309+
if lastErr == nil {
310+
return nil
311+
}
312+
313+
if errors.Is(lastErr, errUpgradePrepNotReady) {
314+
c.logger.Warnf("Major version upgrade deferred (attempt %d/%d): %v. Retrying in 15s...", i+1, maxRetries, lastErr)
315+
316+
if i < maxRetries-1 {
317+
time.Sleep(15 * time.Second)
318+
continue
319+
}
320+
}
321+
return lastErr
322+
}
323+
return lastErr
324+
}

pkg/cluster/sync.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ func generateSerialNumber() (*big.Int, error) {
172172
// Unlike the update, sync does not error out if some objects do not exist and takes care of creating them.
173173
func (c *Cluster) Sync(newSpec *cpov1.Postgresql) error {
174174
var err error
175+
syncFailed := false
175176
c.mu.Lock()
176177
defer c.mu.Unlock()
177178

@@ -326,9 +327,21 @@ func (c *Cluster) Sync(newSpec *cpov1.Postgresql) error {
326327
}
327328
}
328329

329-
// Major version upgrade must only run after success of all earlier operations, must remain last item in sync
330-
if err := c.majorVersionUpgrade(); err != nil {
331-
c.logger.Errorf("major version upgrade failed: %v", err)
330+
if err != nil {
331+
syncFailed = true
332+
}
333+
if !syncFailed {
334+
err = c.executeMajorVersionUpgrade()
335+
if err != nil {
336+
c.logger.Errorf("major version upgrade failed after retries: %v", err)
337+
syncFailed = true
338+
}
339+
}
340+
341+
if syncFailed {
342+
c.logger.Errorf("Update for cluster %s/%s finished with errors..", c.Namespace, c.Name)
343+
} else {
344+
c.logger.Infof("Update for cluster %s/%s completed successfully.", c.Namespace, c.Name)
332345
}
333346

334347
return err

0 commit comments

Comments
 (0)