diff --git a/pkg/node/node.go b/pkg/node/node.go index 35b78d4dc05..33148e7f851 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -205,7 +205,7 @@ const ( minPaymentThreshold = 2 * refreshRate // minimal accepted payment threshold of full nodes maxPaymentThreshold = 24 * refreshRate // maximal accepted payment threshold of full nodes mainnetNetworkID = uint64(1) // - reserveWakeUpDuration = 15 * time.Minute // time to wait before waking up reserveWorker + reserveWakeUpDuration = 5 * time.Minute // time to wait before waking up reserveWorker reserveMinEvictCount = 1_000 cacheMinEvictCount = 10_000 maxAllowedDoubling = 1 diff --git a/pkg/storer/reserve.go b/pkg/storer/reserve.go index b6799ea1cee..2d7e3c70c6c 100644 --- a/pkg/storer/reserve.go +++ b/pkg/storer/reserve.go @@ -167,20 +167,43 @@ func (db *DB) reserveWorker(ctx context.Context, ready chan<- struct{}) { case <-thresholdTicker.C: radius := db.reserve.Radius() + if radius <= db.reserveOptions.minimumRadius { + continue + } count, err := db.countWithinRadius(ctx) if err != nil { db.logger.Warning("reserve worker count within radius", "error", err) continue } - if count < threshold(db.reserve.Capacity()) && db.syncer.SyncRate() == 0 && radius > db.reserveOptions.minimumRadius { - radius-- - if err := db.reserve.SetRadius(radius); err != nil { - db.logger.Error(err, "reserve set radius") - } - db.metrics.StorageRadius.Set(float64(radius)) - db.logger.Info("reserve radius decrease", "radius", radius) + t := threshold(db.reserve.Capacity()) + if count >= t { + continue + } + + // Decrement the storage radius. The decrement is gated only on + // the reserve fill state (count < threshold) and the operator's + // minimum-radius floor. There is no sync-rate gate here, which + // mirrors the unreserve path that raises radius without + // consulting sync activity.
+ // A previous SyncRate() == 0 gate + // proved structurally unreachable on live networks: peer churn + // kept historical sync above zero, and the resetIntervals call + // in puller.onChange (triggered by this very radius change) + // retriggered historical sync, locking the gate closed (issues + // #5396, #5428). When count is well below threshold, jump by + // two steps to keep adjustments bounded but recover faster from + // large gaps; under uniform CAC bin distribution each step + // roughly doubles count-within-radius. + steps := uint8(1) + if count*4 <= t && radius-1 > db.reserveOptions.minimumRadius { + steps = 2 + } + radius -= steps + if err := db.reserve.SetRadius(radius); err != nil { + db.logger.Error(err, "reserve set radius") } + db.metrics.StorageRadius.Set(float64(radius)) + db.logger.Info("reserve radius decrease", "radius", radius, "steps", steps, "count_within_radius", count, "threshold", t) } } } diff --git a/pkg/storer/reserve_test.go b/pkg/storer/reserve_test.go index 4c83df534dc..c9eca611d71 100644 --- a/pkg/storer/reserve_test.go +++ b/pkg/storer/reserve_test.go @@ -516,20 +516,25 @@ func TestRadiusManager(t *testing.T) { waitForRadius(t, storer.Reserve(), 0) }) - t.Run("radius doesn't change due to non-zero pull rate", func(t *testing.T) { + t.Run("radius decreases even with non-zero pull rate", func(t *testing.T) { t.Parallel() storer, err := diskStorer(t, dbTestOps(baseAddr, 10, nil, nil, time.Millisecond*500))() if err != nil { t.Fatal(err) } readyC := make(chan struct{}) + // Reserve is empty, so countWithinRadius == 0 < threshold; the worker + // should decrement radius regardless of the syncer's reported rate. + // The old behavior (gated on SyncRate() == 0) made this scenario + // permanently stuck on live networks where peer churn keeps the rate + // above zero — see issues #5396 and #5428.
storer.StartReserveWorker(context.Background(), pullerMock.NewMockRateReporter(1), networkRadiusFunc(3), readyC) select { case <-readyC: case <-t.Context().Done(): t.Fatal("start reserve worker timeout") } - waitForRadius(t, storer.Reserve(), 3) + waitForRadius(t, storer.Reserve(), 0) }) }