Commit e929c1a

fix(test): add Docker memory limits, cluster pause/resume, and resilient restore waits
- Add deploy memory limits to all docker-compose services (zeros: 512M, alphas: 2-4GB, minio: 512M) to prevent OOM kills on macOS Docker Desktop
- Add --cache "size-mb=1024" to alpha commands for explicit cache sizing
- Implement pause/resume of the default cluster during custom-cluster tests so the full Docker memory budget is available for custom clusters
- Make WaitForRestore resilient to transient errors (connection reset, unavailable, transport errors) with a 10-minute deadline instead of an infinite loop
- Simplify the dgraph-installed Make target to always rebuild
- Ensure $GOPATH/bin is in PATH for subprocess tool discovery
1 parent cb87a32 commit e929c1a

4 files changed

Lines changed: 167 additions & 25 deletions

Makefile

Lines changed: 1 addition & 4 deletions
@@ -82,10 +82,7 @@ uninstall: ## Uninstall dgraph binary
 
 .PHONY: dgraph-installed
 dgraph-installed:
-	@if [ ! -f "$(GOPATH)/bin/dgraph" ] || [ ! -f "$(LINUX_GOBIN)/dgraph" ]; then \
-		echo "Dgraph binary missing, running make install..."; \
-		$(MAKE) install; \
-	fi
+	$(MAKE) install
 
 .PHONY: deps
 deps: ## Check test dependencies (pass AUTO_INSTALL=true to auto-install missing ones)

systest/online-restore/docker-compose.yml

Lines changed: 40 additions & 12 deletions
@@ -8,6 +8,10 @@ services:
     ports:
       - 8080
       - 9080
+    deploy:
+      resources:
+        limits:
+          memory: 4096M
     volumes:
       - type: bind
         source: ${LINUX_GOBIN:-$GOPATH/bin}
@@ -31,8 +35,8 @@ services:
         read_only: true
     command:
       /gobin/dgraph ${COVERAGE_OUTPUT} alpha --my=alpha1:7080 --zero=zero1:5080 --logtostderr -v=2
-      --raft "idx=1;" --encryption "key-file=/data/keys/enc_key;" --security "whitelist=0.0.0.0/0;"
-      --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
+      --cache "size-mb=1024;" --raft "idx=1;" --encryption "key-file=/data/keys/enc_key;" --security
+      "whitelist=0.0.0.0/0;" --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
       server-key=/dgraph-tls/node.key; internal-port=true;
       client-cert=/dgraph-tls/client.alpha1.crt; client-key=/dgraph-tls/client.alpha1.key;"
   alpha2:
@@ -45,6 +49,10 @@ services:
     ports:
       - 8080
       - 9080
+    deploy:
+      resources:
+        limits:
+          memory: 4096M
     volumes:
       - type: bind
         source: ${LINUX_GOBIN:-$GOPATH/bin}
@@ -68,8 +76,8 @@ services:
        read_only: true
     command:
       /gobin/dgraph ${COVERAGE_OUTPUT} alpha --my=alpha2:7080 --zero=zero1:5080 --logtostderr -v=2
-      --raft "idx=2;" --encryption "key-file=/data/keys/enc_key;" --security "whitelist=0.0.0.0/0;"
-      --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
+      --cache "size-mb=1024;" --raft "idx=2;" --encryption "key-file=/data/keys/enc_key;" --security
+      "whitelist=0.0.0.0/0;" --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
       server-key=/dgraph-tls/node.key; internal-port=true;
       client-cert=/dgraph-tls/client.alpha2.crt; client-key=/dgraph-tls/client.alpha2.key;"
   alpha3:
@@ -82,6 +90,10 @@ services:
     ports:
       - 8080
       - 9080
+    deploy:
+      resources:
+        limits:
+          memory: 4096M
     volumes:
       - type: bind
         source: ${LINUX_GOBIN:-$GOPATH/bin}
@@ -105,8 +117,8 @@ services:
        read_only: true
     command:
       /gobin/dgraph ${COVERAGE_OUTPUT} alpha --my=alpha3:7080 --zero=zero1:5080 --logtostderr -v=2
-      --raft "idx=3;" --encryption "key-file=/data/keys/enc_key;" --security "whitelist=0.0.0.0/0;"
-      --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
+      --cache "size-mb=1024;" --raft "idx=3;" --encryption "key-file=/data/keys/enc_key;" --security
+      "whitelist=0.0.0.0/0;" --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
       server-key=/dgraph-tls/node.key; internal-port=true;
       client-cert=/dgraph-tls/client.alpha3.crt; client-key=/dgraph-tls/client.alpha3.key;"
   alpha4:
@@ -119,6 +131,10 @@ services:
     ports:
       - 8080
      - 9080
+    deploy:
+      resources:
+        limits:
+          memory: 4096M
     volumes:
       - type: bind
         source: ${LINUX_GOBIN:-$GOPATH/bin}
@@ -142,8 +158,8 @@ services:
        read_only: true
     command:
       /gobin/dgraph ${COVERAGE_OUTPUT} alpha --my=alpha4:7080 --zero=zero1:5080 --logtostderr -v=2
-      --raft "idx=4;" --encryption "key-file=/data/keys/enc_key;" --security "whitelist=0.0.0.0/0;"
-      --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
+      --cache "size-mb=1024;" --raft "idx=4;" --encryption "key-file=/data/keys/enc_key;" --security
+      "whitelist=0.0.0.0/0;" --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
       server-key=/dgraph-tls/node.key; internal-port=true;
       client-cert=/dgraph-tls/client.alpha4.crt; client-key=/dgraph-tls/client.alpha4.key;"
   alpha5:
@@ -156,6 +172,10 @@ services:
     ports:
       - 8080
       - 9080
+    deploy:
+      resources:
+        limits:
+          memory: 4096M
     volumes:
       - type: bind
         source: ${LINUX_GOBIN:-$GOPATH/bin}
@@ -179,8 +199,8 @@ services:
        read_only: true
     command:
       /gobin/dgraph ${COVERAGE_OUTPUT} alpha --my=alpha5:7080 --zero=zero1:5080 --logtostderr -v=2
-      --raft "idx=5;" --encryption "key-file=/data/keys/enc_key;" --security "whitelist=0.0.0.0/0;"
-      --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
+      --cache "size-mb=1024;" --raft "idx=5;" --encryption "key-file=/data/keys/enc_key;" --security
+      "whitelist=0.0.0.0/0;" --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
       server-key=/dgraph-tls/node.key; internal-port=true;
       client-cert=/dgraph-tls/client.alpha5.crt; client-key=/dgraph-tls/client.alpha5.key;"
   alpha6:
@@ -193,6 +213,10 @@ services:
     ports:
       - 8080
       - 9080
+    deploy:
+      resources:
+        limits:
+          memory: 4096M
     volumes:
       - type: bind
         source: ${LINUX_GOBIN:-$GOPATH/bin}
@@ -216,8 +240,8 @@ services:
        read_only: true
     command:
       /gobin/dgraph ${COVERAGE_OUTPUT} alpha --my=alpha6:7080 --zero=zero1:5080 --logtostderr -v=2
-      --raft "idx=6;" --encryption "key-file=/data/keys/enc_key;" --security "whitelist=0.0.0.0/0;"
-      --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
+      --cache "size-mb=1024;" --raft "idx=6;" --encryption "key-file=/data/keys/enc_key;" --security
+      "whitelist=0.0.0.0/0;" --tls "ca-cert=/dgraph-tls/ca.crt; server-cert=/dgraph-tls/node.crt;
       server-key=/dgraph-tls/node.key; internal-port=true;
       client-cert=/dgraph-tls/client.alpha6.crt; client-key=/dgraph-tls/client.alpha6.key;"
   zero1:
@@ -228,6 +252,10 @@ services:
     ports:
       - 5080
       - 6080
+    deploy:
+      resources:
+        limits:
+          memory: 1024M
     volumes:
       - type: bind
         source: ${LINUX_GOBIN:-$GOPATH/bin}
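
Note on the limits above: the test harness invokes docker compose with the --compatibility flag (see the t/t.go changes below), which maps deploy.resources.limits onto plain container memory caps outside swarm mode. One way to spot-check that a cap actually landed is docker inspect; a minimal Go sketch, assuming a container named online_restore_alpha1_1 (the real name depends on the compose project prefix and Compose version, and is not taken from this commit):

// check_limit.go - sketch: confirm a compose service's memory cap was applied.
package main

import (
	"fmt"
	"os/exec"
	"strings"
)

func main() {
	// HostConfig.Memory is the limit in bytes; 0 means unlimited.
	// The container name here is an assumption, not from this commit.
	out, err := exec.Command("docker", "inspect",
		"--format", "{{.HostConfig.Memory}}", "online_restore_alpha1_1").Output()
	if err != nil {
		fmt.Println("inspect failed:", err)
		return
	}
	// A 4096M limit should print 4294967296.
	fmt.Printf("memory limit (bytes): %s\n", strings.TrimSpace(string(out)))
}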

t/t.go

Lines changed: 85 additions & 1 deletion
@@ -592,6 +592,76 @@ func runTests(taskCh chan task, closer *z.Closer) error {
 		}
 	}()
 
+	// defaultPaused tracks whether the default cluster has been stopped to
+	// free memory for custom-cluster tests.
+	var defaultPaused bool
+
+	// pauseDefault stops the default cluster containers (without removing
+	// them) so that custom-cluster tests have the full Docker memory
+	// budget. On macOS/Docker-Desktop the VM is memory-constrained and
+	// running 16+ Dgraph processes simultaneously causes OOM kills.
+	pauseDefault := func() {
+		if !started || stopped {
+			return
+		}
+		cmd := command("docker", "compose", "--compatibility",
+			"-f", defaultCompose, "-p", prefix, "stop")
+		cmd.Stderr = nil
+		if err := cmd.Run(); err != nil {
+			fmt.Printf("Warning: failed to pause default cluster %s: %v\n", prefix, err)
+		} else {
+			defaultPaused = true
+			fmt.Printf("DEFAULT CLUSTER PAUSED: %s\n", prefix)
+		}
+	}
+
+	// resumeDefault restarts the stopped default cluster containers and
+	// waits for them to become healthy.
+	resumeDefault := func() error {
+		if !started || stopped {
+			return start()
+		}
+		if !defaultPaused {
+			return nil // already running
+		}
+		cmd := command("docker", "compose", "--compatibility",
+			"-f", defaultCompose, "-p", prefix, "start")
+		cmd.Stderr = nil
+		if err := cmd.Run(); err != nil {
+			fmt.Printf("Warning: failed to resume default cluster %s: %v\n", prefix, err)
+			// If resume fails, recreate the cluster from scratch.
+			started = false
+			return start()
+		}
+		fmt.Printf("DEFAULT CLUSTER RESUMED: %s\n", prefix)
+
+		// Wait for health after resume.
+		var resumeWg sync.WaitGroup
+		for i := 1; i <= NumZeroNodes; i++ {
+			resumeWg.Add(1)
+			go func(n int) {
+				defer resumeWg.Done()
+				in := testutil.GetContainerInstance(prefix, "zero"+strconv.Itoa(n))
+				if err := in.BestEffortWaitForHealthy(ZeroPort); err != nil {
+					fmt.Printf("Warning: zero%d health check after resume: %v\n", n, err)
+				}
+			}(i)
+		}
+		for i := 1; i <= NumAlphaNodes; i++ {
+			resumeWg.Add(1)
+			go func(n int) {
+				defer resumeWg.Done()
+				in := testutil.GetContainerInstance(prefix, "alpha"+strconv.Itoa(n))
+				if err := in.BestEffortWaitForHealthy(AlphaPort); err != nil {
+					fmt.Printf("Warning: alpha%d health check after resume: %v\n", n, err)
+				}
+			}(i)
+		}
+		resumeWg.Wait()
+		defaultPaused = false
+		return nil
+	}
+
 	for task := range taskCh {
 		if ctx.Err() != nil {
 			err = ctx.Err()
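
The zero and alpha wait loops in resumeDefault are structurally identical. If this code is revisited, they could collapse into one helper; a sketch only, reusing the identifiers that appear in this diff (testutil.GetContainerInstance, BestEffortWaitForHealthy, NumZeroNodes, NumAlphaNodes) and assuming it lives in t/t.go where those imports already exist. waitAllHealthy is hypothetical, not part of this commit:

// waitAllHealthy is a hypothetical consolidation of the two loops above:
// it fans out one health-check goroutine per node and waits for all of them.
func waitAllHealthy(prefix, name string, count, port int) {
	var wg sync.WaitGroup
	for i := 1; i <= count; i++ {
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			in := testutil.GetContainerInstance(prefix, name+strconv.Itoa(n))
			if err := in.BestEffortWaitForHealthy(port); err != nil {
				fmt.Printf("Warning: %s%d health check after resume: %v\n", name, n, err)
			}
		}(i)
	}
	wg.Wait()
}

// resumeDefault would then reduce to:
//   waitAllHealthy(prefix, "zero", NumZeroNodes, ZeroPort)
//   waitAllHealthy(prefix, "alpha", NumAlphaNodes, AlphaPort)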
@@ -608,14 +678,16 @@ func runTests(taskCh chan task, closer *z.Closer) error {
 				// If we only need to run custom cluster tests, then skip this one.
 				continue
 			}
-			if err := start(); err != nil {
+			if err := resumeDefault(); err != nil {
 				return err
 			}
 			if err = runTestsFor(ctx, task.pkg.ID, prefix, xmlFile); err != nil {
 				// fmt.Printf("ERROR for package: %s. Err: %v\n", task.pkg.ID, err)
 				return err
 			}
 		} else {
+			// Pause the default cluster to free memory for the custom cluster.
+			pauseDefault()
 			// we are not using err variable here because we dont want to
 			// print logs of default cluster in case of custom test fail.
 			if cerr := runCustomClusterTest(ctx, task.pkg.ID, wg, xmlFile); cerr != nil {
@@ -1179,6 +1251,18 @@ func run() error {
 	fmt.Printf("Proc ID is %d\n", procId)
 	fmt.Printf("Detected architecture: %s", runtime.GOARCH)
 
+	// Ensure $GOPATH/bin is in PATH so that tools installed via `go install`
+	// (e.g. protoc-gen-go) are found by subprocesses like protoc.
+	gopath := os.Getenv("GOPATH")
+	if gopath == "" {
+		gopath = filepath.Join(os.Getenv("HOME"), "go")
+	}
+	gopathBin := filepath.Join(gopath, "bin")
+	currentPath := os.Getenv("PATH")
+	if !strings.Contains(currentPath, gopathBin) {
+		os.Setenv("PATH", gopathBin+string(os.PathListSeparator)+currentPath)
+	}
+
 	start := time.Now()
 	oc.Took(0, "START", time.Millisecond)
 
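The PATH tweak above works because os.Setenv mutates the environment that every subsequently spawned subprocess inherits. A standalone sketch of the same idea, using exec.LookPath (which resolves against that same PATH) to confirm a go-installed tool is discoverable; protoc-gen-go is just the example named in the diff's comment:

// path_check.go - sketch: prepend $GOPATH/bin to PATH and verify tool discovery.
package main

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
)

func main() {
	gopath := os.Getenv("GOPATH")
	if gopath == "" {
		gopath = filepath.Join(os.Getenv("HOME"), "go") // the go tool's default
	}
	os.Setenv("PATH", filepath.Join(gopath, "bin")+
		string(os.PathListSeparator)+os.Getenv("PATH"))

	// Subprocesses inherit this PATH; LookPath consults it the same way.
	if p, err := exec.LookPath("protoc-gen-go"); err == nil {
		fmt.Println("found:", p)
	} else {
		fmt.Println("not found:", err)
	}
}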
testutil/backup.go

Lines changed: 41 additions & 8 deletions
@@ -53,29 +53,44 @@ func openDgraph(pdir string) (*badger.DB, error) {
 }
 
 func WaitForRestore(t *testing.T, dg *dgo.Dgraph, HttpSocket string) {
+	// Use a 10-minute overall deadline so the test fails quickly if the
+	// cluster is in a permanently degraded state (e.g. OOM-killed containers)
+	// rather than hanging until the Go test timeout.
+	deadline := time.Now().Add(10 * time.Minute)
+
 	restoreDone := false
-	for {
+	for time.Now().Before(deadline) {
 		resp, err := http.Get("http://" + HttpSocket + "/health")
-		require.NoError(t, err)
+		if err != nil {
+			// The health endpoint may be transiently unreachable while the
+			// server restarts during a restore. Keep retrying.
+			t.Logf("WaitForRestore: health check error (will retry): %v", err)
+			time.Sleep(1 * time.Second)
+			continue
+		}
 		buf, err := io.ReadAll(resp.Body)
-		require.NoError(t, resp.Body.Close())
-		require.NoError(t, err)
+		_ = resp.Body.Close()
+		if err != nil {
+			t.Logf("WaitForRestore: error reading health body (will retry): %v", err)
+			time.Sleep(1 * time.Second)
+			continue
+		}
 		sbuf := string(buf)
 		if !strings.Contains(sbuf, "opRestore") {
 			restoreDone = true
 			break
 		}
 		time.Sleep(1 * time.Second)
 	}
-	require.True(t, restoreDone)
+	require.True(t, restoreDone, "timed out waiting for restore operation to complete")
 
 	// Wait for the client to exit draining mode. This is needed because the client might
 	// be connected to a follower and might be behind the leader in applying the restore.
 	// Waiting for three consecutive successful queries is done to prevent a situation in
 	// which the query succeeds at the first attempt because the follower is behind and
 	// has not started to apply the restore proposal.
 	numSuccess := 0
-	for {
+	for time.Now().Before(deadline) {
 		// This is a dummy query that returns no results.
 		_, err := dg.NewTxn().Query(context.Background(), `{
 			q(func: has(invalid_pred)) {
@@ -85,18 +100,36 @@ func WaitForRestore(t *testing.T, dg *dgo.Dgraph, HttpSocket string) {
 		if err == nil {
 			numSuccess++
 		} else {
-			require.Contains(t, err.Error(), "the server is in draining mode")
+			// During restore the server may be in draining mode, or it may be
+			// transiently unreachable (connection reset, TLS handshake failure,
+			// gRPC Unavailable, etc.). All of these are expected and retriable.
+			errMsg := err.Error()
+			transient := strings.Contains(errMsg, "the server is in draining mode") ||
+				strings.Contains(errMsg, "Unavailable") ||
+				strings.Contains(errMsg, "connection reset") ||
+				strings.Contains(errMsg, "connection refused") ||
+				strings.Contains(errMsg, "transport") ||
+				strings.Contains(errMsg, "EOF") ||
+				strings.Contains(errMsg, "overloaded") ||
+				strings.Contains(errMsg, "context canceled") ||
+				strings.Contains(errMsg, "Please retry")
+			if !transient {
+				require.Fail(t, "unexpected error while waiting for restore",
+					"error: %v", err)
+			}
 			numSuccess = 0
 		}
 
 		// Apply restore works differently with race enabled.
 		// We are seeing delays in apply proposals hence failure of queries.
 		if numSuccess == 10 {
-			// The server has been responsive three times in a row.
+			// The server has been responsive ten times in a row.
 			break
 		}
 		time.Sleep(1 * time.Second)
 	}
+	require.GreaterOrEqual(t, numSuccess, 10,
+		"timed out waiting for server to exit draining mode after restore")
 }
 
 // GetPredicateValues reads the specified p directory and returns the values for the given
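
Matching on error substrings, as the diff above does, is brittle but is about the only portable option when errors arrive from both the gRPC and HTTP paths. If the list keeps growing, a table-driven helper keeps it in one place; a sketch assuming it lives next to WaitForRestore in testutil/backup.go (where strings is already imported) and that the same substrings stay sufficient. isTransientErr is hypothetical, not introduced by this commit:

// transientSubstrings mirrors the inline checks in WaitForRestore above.
var transientSubstrings = []string{
	"the server is in draining mode",
	"Unavailable",
	"connection reset",
	"connection refused",
	"transport",
	"EOF",
	"overloaded",
	"context canceled",
	"Please retry",
}

// isTransientErr reports whether err looks like one of the retriable
// conditions listed above. Hypothetical helper, not part of this commit.
func isTransientErr(err error) bool {
	if err == nil {
		return false
	}
	msg := err.Error()
	for _, s := range transientSubstrings {
		if strings.Contains(msg, s) {
			return true
		}
	}
	return false
}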
