diff --git a/.gitignore b/.gitignore index 248248ed6cb..d45a04118c3 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,5 @@ x/log_test/*.enc *.buf .osgrep .worktrees/ +AGENTS.md +CLAUDE.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 75ccb8556b9..e38d3c739f7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -40,7 +40,8 @@ ```bash git clone https://github.com/dgraph-io/dgraph.git cd ./dgraph -make install +make setup # auto-install tool dependencies (gotestsum, ack, etc.) +make install # build and install the dgraph binary ``` This will put the source code in a Git repo under `$GOPATH/src/github.com/dgraph-io/dgraph` and @@ -144,16 +145,25 @@ directory, providing control and flexibility beyond the standard Go testing fram The simplest way to run tests is via Make: ```bash -# Run all tests +# First-time setup: install tool dependencies +make setup + +# Run default tests (~30 min): integration suite + integration2 make test +# Run every test in the repo +make test-all + # Run specific test types -make test-unit # Unit tests only (no Docker) -make test-integration2 # Integration2 tests via dgraphtest -make test-upgrade # Upgrade tests +make test-unit # True unit tests only — no Docker, no build tags +make test-integration # Integration tests via t/ runner with Docker +make test-integration-heavy # All heavy tests: systest-heavy + ldbc + load +make test-integration2 # Integration2 tests via dgraphtest +make test-upgrade # Upgrade tests # Use variables for more control make test TAGS=integration2 PKG=systest/vector +make test SUITE=all # All t/ runner suites make test TIMEOUT=90m # Override per-package timeout (default: 30m) ``` diff --git a/Makefile b/Makefile index e4f6cdd3207..b023cebf99a 100644 --- a/Makefile +++ b/Makefile @@ -18,11 +18,20 @@ ifeq ($(GOPATH),) $(error GOPATH is not set. Please set it explicitly, e.g. 
export GOPATH=$$HOME/go) endif -# On non-Linux systems, use a separate directory for Linux binaries +# On non-Linux systems, use a separate directory for Linux binaries and +# a cross-compiler so that CGO is available (required for plugin support). ifeq ($(GOHOSTOS),linux) export LINUX_GOBIN ?= $(GOPATH)/bin + LINUX_CC ?= gcc else export LINUX_GOBIN ?= $(GOPATH)/linux_$(GOHOSTARCH) + ifeq ($(GOHOSTARCH),arm64) + LINUX_CC ?= aarch64-unknown-linux-gnu-gcc + else ifeq ($(GOHOSTARCH),amd64) + LINUX_CC ?= x86_64-unknown-linux-gnu-gcc + else + LINUX_CC ?= gcc + endif endif ###################### @@ -60,7 +69,7 @@ install: ## Install dgraph binary ifneq ($(GOHOSTOS),linux) @mkdir -p $(LINUX_GOBIN) @echo "Installing dgraph (linux/$(GOHOSTARCH))..." - @GOOS=linux GOARCH=$(GOHOSTARCH) $(MAKE) -C dgraph dgraph + @GOOS=linux GOARCH=$(GOHOSTARCH) CGO_ENABLED=1 CC=$(LINUX_CC) $(MAKE) -C dgraph BUILD_TAGS= EXTLDFLAGS=-fuse-ld=bfd dgraph @mv dgraph/dgraph $(LINUX_GOBIN)/dgraph @echo "Installed dgraph (linux/$(GOHOSTARCH)) to $(LINUX_GOBIN)/dgraph" endif @@ -73,13 +82,18 @@ uninstall: ## Uninstall dgraph binary .PHONY: dgraph-installed dgraph-installed: - @if [ ! -f "$(GOPATH)/bin/dgraph" ] || [ ! 
-f "$(LINUX_GOBIN)/dgraph" ]; then \ - echo "Dgraph binary missing, running make install..."; \ - $(MAKE) install; \ - fi + $(MAKE) install + +.PHONY: deps +deps: ## Check test dependencies (pass AUTO_INSTALL=true to auto-install missing ones) + $(MAKE) -C t deps + +.PHONY: setup +setup: ## Install all test dependencies automatically + $(MAKE) deps AUTO_INSTALL=true .PHONY: test -test: dgraph-installed local-image ## Run tests (see 'make help' for options) +test: dgraph-installed local-image ## Run tests (default: integration + integration2) ifdef TAGS @echo "Running tests with tags: $(TAGS)" go test -v --tags="$(TAGS)" \ @@ -97,53 +111,70 @@ else done endif else - @echo "Running test suite: $(or $(SUITE),all)" - $(MAKE) -C t test args="--suite=$(or $(SUITE),all) $(if $(PKG),--pkg=\"$(PKG)\") $(if $(TEST),--test=\"$(TEST)\") $(if $(TIMEOUT),--timeout=$(TIMEOUT))" +ifdef SUITE + @echo "Running test suite: $(SUITE)" + $(MAKE) -C t test args="--suite=$(SUITE) $(if $(PKG),--pkg=\"$(PKG)\") $(if $(TEST),--test=\"$(TEST)\") $(if $(TIMEOUT),--timeout=$(TIMEOUT))" +else + @echo "Running test suite: integration" + $(MAKE) -C t test args="--suite=integration $(if $(PKG),--pkg=\"$(PKG)\") $(if $(TEST),--test=\"$(TEST)\") $(if $(TIMEOUT),--timeout=$(TIMEOUT))" + @echo "Running integration2 tests..." + go test -v --tags="integration2" \ + $(if $(TEST),--run="$(TEST)") \ + $(if $(PKG),./$(PKG)/...,./...) +endif endif - -.PHONY: test-all -test-all: ## All test suites via t/ runner (i.e. 'make test SUITE=all') - @SUITE=all $(MAKE) test .PHONY: test-unit -test-unit: ## Unit tests, no Docker (i.e. 'make test SUITE=unit') +test-unit: ## True unit tests only — no Docker, no integration build tag (i.e. 'make test SUITE=unit') + $(if $(filter command line,$(origin SUITE)),$(error SUITE= cannot be passed to test-unit; use 'make test SUITE=...' instead)) @SUITE=unit $(MAKE) test +.PHONY: test-integration +test-integration: ## Integration tests via t/ runner with Docker (i.e. 
'make test SUITE=integration') + $(if $(filter command line,$(origin SUITE)),$(error SUITE= cannot be passed to test-integration; use 'make test SUITE=...' instead)) + @SUITE=integration $(MAKE) test + .PHONY: test-core test-core: ## Core tests (i.e. 'make test SUITE=core') + $(if $(filter command line,$(origin SUITE)),$(error SUITE= cannot be passed to test-core; use 'make test SUITE=...' instead)) @SUITE=core $(MAKE) test -.PHONY: test-integration -test-integration: ## Integration tests (i.e. 'make test TAGS=integration') - @TAGS=integration $(MAKE) test - .PHONY: test-integration2 test-integration2: ## Integration2 tests via dgraphtest (i.e. 'make test TAGS=integration2') + $(if $(filter command line,$(origin TAGS)),$(error TAGS= cannot be passed to test-integration2; use 'make test TAGS=...' instead)) @TAGS=integration2 $(MAKE) test .PHONY: test-upgrade test-upgrade: ## Upgrade tests (i.e. 'make test TAGS=upgrade') + $(if $(filter command line,$(origin TAGS)),$(error TAGS= cannot be passed to test-upgrade; use 'make test TAGS=...' instead)) @TAGS=upgrade $(MAKE) test .PHONY: test-systest -test-systest: ## System integration tests (i.e. 'make test SUITE=systest') +test-systest: ## All systest packages: systest-baseline + systest-heavy (i.e. 'make test SUITE=systest') + $(if $(filter command line,$(origin SUITE)),$(error SUITE= cannot be passed to test-systest; use 'make test SUITE=...' instead)) @SUITE=systest $(MAKE) test +.PHONY: test-integration-heavy +test-integration-heavy: ## All heavy tests: systest-heavy + ldbc + load + $(if $(filter command line,$(origin SUITE)),$(error SUITE= cannot be passed to test-integration-heavy; use 'make test SUITE=...' instead)) + @SUITE=systest-heavy,ldbc,load $(MAKE) test + .PHONY: test-vector test-vector: ## Vector search tests (i.e. 'make test SUITE=vector') + $(if $(filter command line,$(origin SUITE)),$(error SUITE= cannot be passed to test-vector; use 'make test SUITE=...' 
instead)) @SUITE=vector $(MAKE) test .PHONY: test-fuzz -test-fuzz: ## Fuzz tests, auto-discovers packages (i.e. 'make test FUZZ=1') +test-fuzz: ## Fuzz tests (i.e. 'make test FUZZ=1') + $(if $(filter command line,$(origin FUZZ)),$(error FUZZ= cannot be passed to test-fuzz; use 'make test FUZZ=...' instead)) @FUZZ=1 $(MAKE) test -.PHONY: test-ldbc -test-ldbc: ## LDBC benchmark tests (i.e. 'make test SUITE=ldbc') - @SUITE=ldbc $(MAKE) test - -.PHONY: test-load -test-load: ## Heavy load tests (i.e. 'make test SUITE=load') - @SUITE=load $(MAKE) test +.PHONY: test-all +test-all: ## Every test: all t/ suites + integration2 + upgrade + fuzz + $(MAKE) test SUITE=all + $(MAKE) test-integration2 + $(MAKE) test-upgrade + $(MAKE) test-fuzz .PHONY: test-benchmark test-benchmark: ## Go benchmarks (i.e. 'go test -bench') @@ -152,7 +183,11 @@ test-benchmark: ## Go benchmarks (i.e. 'go test -bench') .PHONY: local-image local-image: ## Build local Docker image (dgraph/dgraph:local) @echo building local docker image - @GOOS=linux GOARCH=amd64 $(MAKE) dgraph +ifneq ($(GOHOSTOS),linux) + @GOOS=linux GOARCH=$(GOHOSTARCH) CGO_ENABLED=1 CC=$(LINUX_CC) $(MAKE) BUILD_TAGS= EXTLDFLAGS=-fuse-ld=bfd dgraph +else + @GOOS=linux GOARCH=$(GOHOSTARCH) $(MAKE) dgraph +endif @mkdir -p linux @mv ./dgraph/dgraph ./linux/dgraph @docker build -f contrib/Dockerfile -t dgraph/dgraph:local . 
@@ -161,6 +196,13 @@ local-image: ## Build local Docker image (dgraph/dgraph:local) .PHONY: image-local image-local: local-image ## Alias for local-image +.PHONY: clean +clean: ## Clean build artifacts + $(MAKE) -C dgraph clean + $(MAKE) -C compose clean + @rm -rf linux + @go clean -testcache + .PHONY: docker-image docker-image: dgraph ## Build Docker image (dgraph/dgraph:$VERSION) @mkdir -p linux @@ -201,7 +243,7 @@ help: ## Show available targets and variables awk 'BEGIN {FS = ":.*?## "}; {printf " %-20s %s\n", $$1, $$2}' @echo "" @echo "Variables that can be passed to 'test':" - @echo " SUITE Select t/ runner suite (e.g., make test SUITE=systest)" + @echo " SUITE Select t/ runner suite (default: integration + integration2)" @echo " TAGS Go build tags - bypasses t/ runner (e.g., make test TAGS=integration2)" @echo " PKG Limit to specific package (e.g., make test PKG=systest/export)" @echo " TEST Run specific test function (e.g., make test TEST=TestGQLSchema)" @@ -216,8 +258,9 @@ help: ## Show available targets and variables @printf " Available TAGS values: " @grep -roh "//go:build [a-z0-9]*" --include="*_test.go" . 2>/dev/null | \ awk '{print $$2}' | \ - grep -E '^(integration|integration2|upgrade)$$' | \ + grep -E '^(integration2|upgrade)$$' | \ sort -u | tr '\n' ' ' && echo "" + @echo " Note: 'integration' tests require the t/ runner (use SUITE=, not TAGS=)" @echo "" @echo "Examples:" @echo " make test TAGS=integration2 PKG=systest/vector # integration2 tests for vector" diff --git a/TESTING.md b/TESTING.md index 1c930479fc6..a25a2d6d12f 100644 --- a/TESTING.md +++ b/TESTING.md @@ -128,8 +128,9 @@ The codebase is organized into several key packages: Before running tests, ensure you have the following installed and configured. -> **TL;DR:** On a fresh checkout, just run `make install` followed by `make test`. The build system -> automatically handles OS detection, builds the correct binaries, and validates dependencies. 
+> **TL;DR:** On a fresh checkout, run `make setup` to auto-install tool dependencies, then +> `make install` followed by `make test`. The build system automatically handles OS detection, +> builds the correct binaries, and validates dependencies. ### Automatic Dependency Checking @@ -137,11 +138,14 @@ The test framework includes scripts that check for required dependencies and can auto-install them: ```bash -# Check all dependencies (run from t/ directory) -cd t && make check +# Auto-install all missing tool dependencies (recommended for first-time setup) +make setup -# Auto-install missing dependencies -AUTO_INSTALL=true make check +# Check dependencies without installing (reports what's missing) +make deps + +# Same as 'make deps' but auto-installs anything missing +make deps AUTO_INSTALL=true ``` The check scripts validate: @@ -154,9 +158,8 @@ The check scripts validate: ### Required Tools -> **Note:** You don't need to install these manually. Running `AUTO_INSTALL=true make check` from -> the `t/` directory (or `AUTO_INSTALL=true make test` from the repo root) automatically installs -> missing dependencies. The commands below are listed for reference. +> **Note:** You don't need to install these manually. Running `make setup` from the repo root +> automatically installs missing dependencies. The commands below are listed for reference. #### 1. Go (1.21+) @@ -222,8 +225,8 @@ dgraph version The build system now handles most setup automatically. On both Linux and macOS: ```bash -# Install dependencies (optional - auto-installs if missing) -cd t && AUTO_INSTALL=true make check && cd .. +# Auto-install tool dependencies (gotestsum, ack, etc.) +make setup # Build dgraph binary (automatically handles Linux binary on macOS) make install @@ -304,22 +307,24 @@ If both pass, you're ready to run all test types! 
The simplest way to run tests: ```bash -# Run all tests (default) +# Run default tests (~30 min): integration suite + integration2 make test +# Run every test in the repo (all suites + all tag-based tests + fuzz) +make test-all + # Common shortcuts (run 'make help' for full list) -make test-all # All test suites via t/ runner (i.e. 'make test SUITE=all') -make test-unit # Unit tests, no Docker (i.e. 'make test SUITE=unit') -make test-core # Core tests (i.e. 'make test SUITE=core') -make test-systest # System integration tests (i.e. 'make test SUITE=systest') -make test-vector # Vector search tests (i.e. 'make test SUITE=vector') -make test-ldbc # LDBC benchmark tests (i.e. 'make test SUITE=ldbc') -make test-load # Heavy load tests (i.e. 'make test SUITE=load') -make test-integration # Integration tests (i.e. 'make test TAGS=integration') -make test-integration2 # Integration2 tests via dgraphtest (i.e. 'make test TAGS=integration2') -make test-upgrade # Upgrade tests (i.e. 'make test TAGS=upgrade') -make test-fuzz # Fuzz tests, auto-discovers packages (i.e. 'make test FUZZ=1') -make test-benchmark # Go benchmarks (i.e. 'go test -bench') +make test-unit # True unit tests only — no Docker, no build tags +make test-integration # Integration tests via t/ runner with Docker (SUITE=integration) +make test-integration-heavy # All heavy tests: systest-heavy + ldbc + load +make test-core # Core tests (i.e. 'make test SUITE=core') +make test-systest # All systest packages: systest-baseline + systest-heavy +make test-vector # Vector search tests (i.e. 'make test SUITE=vector') +make test-integration2 # Integration2 tests via dgraphtest (i.e. 'make test TAGS=integration2') +make test-upgrade # Upgrade tests (i.e. 'make test TAGS=upgrade') +make test-fuzz # Fuzz tests (i.e. 'make test FUZZ=1') +make test-all # Every test: all t/ suites + integration2 + upgrade + fuzz +make test-benchmark # Go benchmarks (i.e. 
'go test -bench') ``` Run `make help` to see all available targets, variables, and dynamically discovered SUITE/TAGS @@ -331,7 +336,7 @@ For more control, pass variables to `make test`: | Variable | Purpose | Example | | ---------- | ---------------------------------- | ------------------------------- | -| `SUITE` | Select t/ runner suite | `make test SUITE=systest` | +| `SUITE` | Select t/ runner suite | `make test SUITE=integration` | | `TAGS` | Go build tags - bypasses t/ runner | `make test TAGS=integration2` | | `PKG` | Limit to specific package | `make test PKG=systest/export` | | `TEST` | Run specific test function | `make test TEST=TestGQLSchema` | @@ -339,7 +344,8 @@ For more control, pass variables to `make test`: | `FUZZ` | Enable fuzz testing | `make test FUZZ=1` | | `FUZZTIME` | Fuzz duration per package | `make test FUZZ=1 FUZZTIME=60s` | -**Precedence:** `TAGS` > `FUZZ` > `SUITE` (first match wins) +**Precedence:** `TAGS` > `FUZZ` > `SUITE` > default (first match wins). When no variable is set, +`make test` runs `integration` suite (via t/ runner) plus `integration2`. ### Examples @@ -536,15 +542,18 @@ Docker Compose and runs tests tagged with `integration`. A suite is a named group of test packages that can be run together with the `--suite` flag. 
-| Suite | Purpose | Packages/Tests Included | -| --------- | ------------------------------------- | --------------------------------------------------------------------------------------------- | -| `unit` | Default suite for regular development | All packages except ldbc and load (includes query, mutation, schema, GraphQL, ACL, worker) | -| `core` | Core Dgraph functionality | Query, mutation, schema, GraphQL e2e, ACL, TLS, worker (excludes systest, ldbc, vector, load) | -| `systest` | Real workflows and system-level tests | Backup/restore, export, multi-tenancy, online-restore, audit, CDC, group-delete | -| `vector` | Vector search functionality | Vector index, similarity search, HNSW, vector backup/restore (`systest/vector/`) | -| `ldbc` | Benchmark queries | LDBC benchmark suite (`systest/ldbc/`) | -| `load` | Heavy data loading scenarios | 21million, 1million, bulk_live, bgindex, bulkloader | -| `all` | Everything | Runs all test suites | +| Suite | Purpose | Packages/Tests Included | +| ------------------ | -------------------------------------------------- | ------------------------------------------------------------------------------------- | +| `unit` | True unit tests only | All packages except ldbc/load — no Docker, no `--tags=integration` | +| `integration` | Default suite — all integration tests except heavy | Everything except ldbc, load, and systest-heavy (replaces old `unit`) | +| `core` | Core Dgraph functionality | Query, mutation, schema, GraphQL e2e, ACL, TLS, worker | +| `systest` | All system integration tests | Both systest-baseline + systest-heavy (backward compatible) | +| `systest-baseline` | Lean systest for daily dev | backup/filesystem, export, multi-tenancy, audit, CDC, group-delete, plugin, ... 
| +| `systest-heavy` | Resource-intensive systests | backup/minio\*, backup/nfs-backup, backup/encryption, backup/advanced-scenarios, backup/multi-tenancy, tracing, online-restore | +| `vector` | Vector search functionality | Vector index, similarity search, HNSW | +| `ldbc` | Benchmark queries | LDBC benchmark suite | +| `load` | Heavy data loading scenarios | 21million, 1million, bulk_live, bgindex, bulkloader | +| `all` | Everything in t/ runner | All packages | ### Docker Compose Discovery @@ -579,16 +588,16 @@ cd t && go build . ### Key Flags -| Flag | Description | -| ------------- | ------------------------------------------------------------------- | -| `--suite=X` | Select test suite(s): all, ldbc, load, unit, systest, vector, core | -| `--pkg=X` | Run specific package | -| `--test=X` | Run specific test function | -| `--timeout=X` | Per-package timeout (e.g. 60m, 2h). Default: 30m (180m with --race) | -| `-j=N` | Concurrency (default: 1) | -| `--keep` | Keep cluster running after tests | -| `-r` | Remove all test containers | -| `--skip-slow` | Skip slow packages | +| Flag | Description | +| ------------- | ---------------------------------------------------------------------------------------------------------------- | +| `--suite=X` | Select test suite(s): all, ldbc, load, unit, integration, systest, systest-baseline, systest-heavy, vector, core | +| `--pkg=X` | Run specific package | +| `--test=X` | Run specific test function | +| `--timeout=X` | Per-package timeout (e.g. 60m, 2h). Default: 30m (180m with --race) | +| `-j=N` | Concurrency (default: 1) | +| `--keep` | Keep cluster running after tests | +| `-r` | Remove all test containers | +| `--skip-slow` | Skip slow packages | --- @@ -1288,6 +1297,8 @@ The following items from the original wishlist have been implemented: - **✅ Unified test interface:** A single `make test` entry point that accepts arguments to run any test type (unit, integration, integration2, upgrade, fuzz) with environment variables for control.
+ The default (`make test` with no args) runs `integration` suite plus `integration2` for a fast + feedback loop (~30 min). Use `make test-all` to run every test. - **✅ Example commands that "just work":** The following now work as expected: @@ -1295,7 +1306,7 @@ The following items from the original wishlist have been implemented: make test SUITE=systest make test FUZZ=1 PKG=dql make test TAGS=upgrade PKG=acl - make test TAGS=integration PKG=systest/plugin + make test SUITE=systest PKG=systest/plugin ``` ### Remaining Ideas diff --git a/dgraph/Makefile b/dgraph/Makefile index 6608079782e..4a2de9bcc36 100644 --- a/dgraph/Makefile +++ b/dgraph/Makefile @@ -32,7 +32,8 @@ gitBranch = github.com/dgraph-io/dgraph/v25/x.gitBranch lastCommitSHA = github.com/dgraph-io/dgraph/v25/x.lastCommitSHA lastCommitTime = github.com/dgraph-io/dgraph/v25/x.lastCommitTime -BUILD_FLAGS ?= -ldflags '-X ${lastCommitSHA}=${BUILD} -X "${lastCommitTime}=${BUILD_DATE}" -X "${dgraphVersion}=${BUILD_VERSION}" -X "${dgraphCodename}=${BUILD_CODENAME}" -X ${gitBranch}=${BUILD_BRANCH}' +EXTLDFLAGS ?= +BUILD_FLAGS ?= -ldflags '-X ${lastCommitSHA}=${BUILD} -X "${lastCommitTime}=${BUILD_DATE}" -X "${dgraphVersion}=${BUILD_VERSION}" -X "${dgraphCodename}=${BUILD_CODENAME}" -X ${gitBranch}=${BUILD_BRANCH}$(if $(EXTLDFLAGS), -extldflags "$(EXTLDFLAGS)")' # Insert build tags if specified ifneq ($(strip $(BUILD_TAGS)),) diff --git a/dgraph/cmd/dgraphimport/import_client.go b/dgraph/cmd/dgraphimport/import_client.go index 1297566378e..73e40b568d4 100644 --- a/dgraph/cmd/dgraphimport/import_client.go +++ b/dgraph/cmd/dgraphimport/import_client.go @@ -13,6 +13,8 @@ import ( "math" "os" "path/filepath" + "strings" + "time" "github.com/dgraph-io/badger/v4" "github.com/dgraph-io/dgo/v250" @@ -55,19 +57,53 @@ func Import(ctx context.Context, connectionString string, bulkOutDir string) err return streamSnapshot(ctx, dg, bulkOutDir, resp.Groups) } +// isRetryableError returns true for transient errors that may 
resolve after a brief wait, +// such as Raft proposal backlogs during membership changes. +func isRetryableError(err error) bool { + if err == nil { + return false + } + msg := err.Error() + // Only retry errors that indicate the server is alive but temporarily busy. + // Do NOT retry connectivity errors ("unable to connect to the leader", + // "connection refused") as those may indicate a permanent quorum loss and + // would cause negative test cases to wait unnecessarily. + return strings.Contains(msg, "overloaded") +} + // initiateSnapshotStream initiates a snapshot stream session with the Dgraph server. +// It retries on transient errors (e.g. "overloaded with pending proposals") with +// exponential backoff up to 60 seconds total. func initiateSnapshotStream(ctx context.Context, dc api.DgraphClient) (*api.UpdateExtSnapshotStreamingStateResponse, error) { glog.Info("[import] Initiating external snapshot stream") req := &api.UpdateExtSnapshotStreamingStateRequest{ Start: true, } - resp, err := dc.UpdateExtSnapshotStreamingState(ctx, req) - if err != nil { - glog.Errorf("[import] failed to initiate external snapshot stream: %v", err) - return nil, fmt.Errorf("failed to initiate external snapshot stream: %v", err) + + const maxRetryDuration = 60 * time.Second + deadline := time.Now().Add(maxRetryDuration) + retryDelay := time.Second + + for { + resp, err := dc.UpdateExtSnapshotStreamingState(ctx, req) + if err == nil { + glog.Info("[import] External snapshot stream initiated successfully") + return resp, nil + } + + if !isRetryableError(err) || time.Now().After(deadline) { + glog.Errorf("[import] failed to initiate external snapshot stream: %v", err) + return nil, fmt.Errorf("failed to initiate external snapshot stream: %v", err) + } + + glog.Warningf("[import] transient error initiating snapshot stream, retrying in %v: %v", retryDelay, err) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(retryDelay): + } + retryDelay = 
min(retryDelay*2, 10*time.Second) } - glog.Info("[import] External snapshot stream initiated successfully") - return resp, nil } // streamSnapshot takes a p directory and a set of group IDs and streams the data from the diff --git a/dgraphtest/local_cluster.go b/dgraphtest/local_cluster.go index 504feac2b26..80a0fb10b9e 100644 --- a/dgraphtest/local_cluster.go +++ b/dgraphtest/local_cluster.go @@ -1322,11 +1322,19 @@ func (c *LocalCluster) GeneratePlugins(raceEnabled bool) error { if raceEnabled { opts = append(opts, "-race") } - opts = append(opts, "-buildmode=plugin", "-o", so, src) - os.Setenv("GOOS", "linux") - os.Setenv("GOARCH", "amd64") + opts = append(opts, "-buildmode=plugin") + if runtime.GOOS != "linux" { + // Use the BFD linker; the default gold linker is not shipped + // with most cross-compiler toolchains. + opts = append(opts, "-ldflags", "-extldflags -fuse-ld=bfd") + } + opts = append(opts, "-o", so, src) cmd := exec.Command("go", opts...) cmd.Dir = filepath.Dir(curr) + cmd.Env = append(os.Environ(), "GOOS=linux", "GOARCH="+runtime.GOARCH) + if runtime.GOOS != "linux" { + cmd.Env = append(cmd.Env, "CGO_ENABLED=1", "CC="+linuxCrossCC()) + } if out, err := cmd.CombinedOutput(); err != nil { log.Printf("Error: %v\n", err) log.Printf("Output: %v\n", string(out)) @@ -1347,6 +1355,22 @@ func (c *LocalCluster) GeneratePlugins(raceEnabled bool) error { return nil } +// linuxCrossCC returns the C cross-compiler for targeting Linux from the current host. +// Respects the LINUX_CC environment variable if set. 
+func linuxCrossCC() string { + if cc := os.Getenv("LINUX_CC"); cc != "" { + return cc + } + switch runtime.GOARCH { + case "arm64": + return "aarch64-unknown-linux-gnu-gcc" + case "amd64": + return "x86_64-unknown-linux-gnu-gcc" + default: + return "gcc" + } +} + func (c *LocalCluster) GetAlphaGrpcPublicPort(id int) (string, error) { return publicPort(c.dcli, c.alphas[id], alphaGrpcPort) } diff --git a/t/Makefile b/t/Makefile index 14ed4d61d68..b6e35a851f5 100644 --- a/t/Makefile +++ b/t/Makefile @@ -18,12 +18,15 @@ endif all: test -.PHONY: check -check: check-go check-docker check-gotestsum check-ack +.PHONY: deps +deps: check-go check-docker check-gotestsum check-ack check-cross-compiler @if [ "$(GOOS)" = "linux" ]; then \ which protoc > /dev/null 2>&1 || (echo "Error: protoc is not installed or not in PATH" && exit 1); \ fi @echo "All dependencies are installed" + +.PHONY: check +check: deps @echo "LINUX_GOBIN=$(LINUX_GOBIN)" @if [ -f "$(LINUX_GOBIN)/dgraph" ]; then \ file $(LINUX_GOBIN)/dgraph | grep -q "ELF.*executable" || (echo "Error: dgraph binary at $(LINUX_GOBIN)/dgraph is not a Linux executable" && exit 1); \ @@ -44,6 +47,10 @@ check-gotestsum: check-ack: @./scripts/check-ack.sh +.PHONY: check-cross-compiler +check-cross-compiler: + @./scripts/check-cross-compiler.sh + .PHONY: check-protoc check-protoc: @./scripts/check-protoc.sh diff --git a/t/scripts/check-cross-compiler.sh b/t/scripts/check-cross-compiler.sh new file mode 100755 index 00000000000..d050d51b4d6 --- /dev/null +++ b/t/scripts/check-cross-compiler.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2310 +set -euo pipefail + +# shellcheck source=checkhelper.sh +source "$(dirname "${BASH_SOURCE[0]}")/checkhelper.sh" + +BREW_TAP="messense/macos-cross-toolchains" + +# Return the expected cross-compiler binary name for the host architecture. 
+get_cross_compiler() { + if [[ -n ${LINUX_CC-} ]]; then + echo "${LINUX_CC}" + return + fi + local arch + arch="$(uname -m)" + case "${arch}" in + arm64 | aarch64) + echo "aarch64-unknown-linux-gnu-gcc" + ;; + x86_64) + echo "x86_64-unknown-linux-gnu-gcc" + ;; + *) + err "unsupported architecture: ${arch}" + return 1 + ;; + esac +} + +# Return the Homebrew formula name for the cross-compiler. +get_brew_formula() { + local arch + arch="$(uname -m)" + case "${arch}" in + arm64 | aarch64) + echo "aarch64-unknown-linux-gnu" + ;; + x86_64) + echo "x86_64-unknown-linux-gnu" + ;; + *) + err "unsupported architecture: ${arch}" + return 1 + ;; + esac +} + +install_cross_compiler_macos() { + ensure_brew || exit 1 + local formula + formula="$(get_brew_formula)" + brew tap "${BREW_TAP}" + brew install "${BREW_TAP}/${formula}" +} + +main() { + # Cross-compiler is only needed on non-Linux systems + local os + os="$(get_os)" + if [[ ${os} == "Linux" ]]; then + exit 0 + fi + + local cc + cc="$(get_cross_compiler)" + + if command -v "${cc}" &>/dev/null; then + exit 0 + fi + + if [[ ${AUTO_INSTALL-} == "true" ]]; then + install_cross_compiler_macos + if command -v "${cc}" &>/dev/null; then + exit 0 + else + err "cross-compiler check still failing after installation" + exit 1 + fi + fi + + echo "" + err "Linux cross-compiler is not installed (needed for Go plugin builds)" + echo "" + err "Required binary: ${cc}" + echo "" + err "Please install manually:" + + case "${os}" in + Darwin) + local formula + formula="$(get_brew_formula)" + err " brew tap ${BREW_TAP}" + err " brew install ${BREW_TAP}/${formula}" + echo "" + err 'Or set LINUX_CC to a custom cross-compiler (e.g. 
"zig cc -target ...")' + ;; + *) + err " (install a gcc cross-compiler targeting linux for your architecture)" + ;; + esac + + print_auto_install_hint + exit 1 +} + +main "$@" diff --git a/t/t.go b/t/t.go index 37ff7fe1795..59ed9f3a687 100644 --- a/t/t.go +++ b/t/t.go @@ -86,9 +86,13 @@ var ( "Don't bring up a cluster, instead use an existing cluster with this prefix.") skipSlow = pflag.BoolP("skip-slow", "s", false, "If true, don't run tests on slow packages.") - suite = pflag.String("suite", "unit", "This flag is used to specify which "+ - "test suites to run. Possible values are all, ldbc, load, unit, systest, vector, core. Multiple suites can be "+ - "selected like --suite=ldbc,load") + suite = pflag.String("suite", "integration", "This flag is used to specify which "+ + "test suites to run. Possible values are all, ldbc, load, unit, integration, systest, "+ + "systest-baseline, systest-heavy, vector, core. Multiple suites can be "+ + "selected like --suite=ldbc,load. "+ + "unit = true unit tests only (no Docker, no integration tag). "+ + "integration = everything except ldbc, load, and systest-heavy (with Docker). "+ + "systest = systest-baseline + systest-heavy.") tmp = pflag.String("tmp", "", "Temporary directory used to download data.") downloadResources = pflag.BoolP("download", "d", true, "Flag to specify whether to download resources or not") @@ -422,9 +426,23 @@ func sanitizeFilename(pkg string) string { return strings.ReplaceAll(pkg, "/", "_") } +// gotestsumBin returns the absolute path to gotestsum inside $GOPATH/bin. +// This avoids relying on $PATH, which may not include $GOPATH/bin on all machines +// (the check-gotestsum.sh script validates at this same path). 
+func gotestsumBin() string { + gopath := os.Getenv("GOPATH") + if gopath == "" { + gopath = filepath.Join(os.Getenv("HOME"), "go") + } + return filepath.Join(gopath, "bin", "gotestsum") +} + func runTestsFor(ctx context.Context, pkg, prefix string, xmlFile string) error { - args := []string{"gotestsum", "--junitfile", xmlFile, "--format", "standard-verbose", "--max-fails", "1", "--", - "-v", "-failfast", "-tags=integration"} + args := []string{gotestsumBin(), "--junitfile", xmlFile, "--format", "standard-verbose", "--max-fails", "1", "--", + "-v", "-failfast"} + if !isUnitOnly() { + args = append(args, "-tags=integration") + } switch { case *testTimeout != "": args = append(args, "-timeout", *testTimeout) @@ -535,7 +553,7 @@ func runTests(taskCh chan task, closer *z.Closer) error { var started, stopped bool start := func() error { - if len(*useExisting) > 0 || started { + if isUnitOnly() || len(*useExisting) > 0 || started { return nil } err := startCluster(defaultCompose, prefix) @@ -548,7 +566,7 @@ func runTests(taskCh chan task, closer *z.Closer) error { } stop := func() { - if *keepCluster || stopped { + if isUnitOnly() || *keepCluster || stopped { return } wg.Add(1) @@ -581,6 +599,76 @@ func runTests(taskCh chan task, closer *z.Closer) error { } }() + // defaultPaused tracks whether the default cluster has been stopped to + // free memory for custom-cluster tests. + var defaultPaused bool + + // pauseDefault stops the default cluster containers (without removing + // them) so that custom-cluster tests have the full Docker memory + // budget. On macOS/Docker-Desktop the VM is memory-constrained and + // running 16+ Dgraph processes simultaneously causes OOM kills. 
+ pauseDefault := func() { + if !started || stopped { + return + } + cmd := command("docker", "compose", "--compatibility", + "-f", defaultCompose, "-p", prefix, "stop") + cmd.Stderr = nil + if err := cmd.Run(); err != nil { + fmt.Printf("Warning: failed to pause default cluster %s: %v\n", prefix, err) + } else { + defaultPaused = true + fmt.Printf("DEFAULT CLUSTER PAUSED: %s\n", prefix) + } + } + + // resumeDefault restarts the stopped default cluster containers and + // waits for them to become healthy. + resumeDefault := func() error { + if !started || stopped { + return start() + } + if !defaultPaused { + return nil // already running + } + cmd := command("docker", "compose", "--compatibility", + "-f", defaultCompose, "-p", prefix, "start") + cmd.Stderr = nil + if err := cmd.Run(); err != nil { + fmt.Printf("Warning: failed to resume default cluster %s: %v\n", prefix, err) + // If resume fails, recreate the cluster from scratch. + started = false + return start() + } + fmt.Printf("DEFAULT CLUSTER RESUMED: %s\n", prefix) + + // Wait for health after resume. 
+ var resumeWg sync.WaitGroup + for i := 1; i <= NumZeroNodes; i++ { + resumeWg.Add(1) + go func(n int) { + defer resumeWg.Done() + in := testutil.GetContainerInstance(prefix, "zero"+strconv.Itoa(n)) + if err := in.BestEffortWaitForHealthy(ZeroPort); err != nil { + fmt.Printf("Warning: zero%d health check after resume: %v\n", n, err) + } + }(i) + } + for i := 1; i <= NumAlphaNodes; i++ { + resumeWg.Add(1) + go func(n int) { + defer resumeWg.Done() + in := testutil.GetContainerInstance(prefix, "alpha"+strconv.Itoa(n)) + if err := in.BestEffortWaitForHealthy(AlphaPort); err != nil { + fmt.Printf("Warning: alpha%d health check after resume: %v\n", n, err) + } + }(i) + } + resumeWg.Wait() + defaultPaused = false + return nil + } + for task := range taskCh { if ctx.Err() != nil { err = ctx.Err() @@ -597,14 +685,23 @@ func runTests(taskCh chan task, closer *z.Closer) error { // If we only need to run custom cluster tests, then skip this one. continue } - if err := start(); err != nil { - return err + if !isUnitOnly() { + if err := resumeDefault(); err != nil { + return err + } } if err = runTestsFor(ctx, task.pkg.ID, prefix, xmlFile); err != nil { // fmt.Printf("ERROR for package: %s. Err: %v\n", task.pkg.ID, err) return err } } else { + if isUnitOnly() { + // Skip custom-cluster packages entirely in unit mode — + // they only contain integration tests. + continue + } + // Pause the default cluster to free memory for the custom cluster. + pauseDefault() // we are not using err variable here because we dont want to // print logs of default cluster in case of custom test fail. if cerr := runCustomClusterTest(ctx, task.pkg.ID, wg, xmlFile); cerr != nil { @@ -778,7 +875,13 @@ func getPackages() []task { } return out } - cfg := &packages.Config{BuildFlags: []string{"-tags=integration"}} + // When running unit-only, don't add --tags=integration so that only true + // unit tests (without //go:build integration) are discovered and compiled. 
// heavyPackages lists resource-intensive systest packages separated into
// the systest-heavy suite. These spin up large Docker clusters (20-112
// services) and can cause OOM on macOS Docker Desktop.
// Use --suite=systest-heavy to run them, or --suite=systest for both.
//
// NOTE(review): entries are matched against package import paths by
// substring/suffix (see isHeavyPackage). The trailing slash on
// "advanced-scenarios/" therefore matches only sub-packages beneath that
// directory, never a package at that path itself — presumably intentional;
// TODO confirm no package lives directly at backup/advanced-scenarios.
var heavyPackages = []string{
	"/systest/backup/minio",
	"/systest/backup/minio-large",
	"/systest/backup/nfs-backup",
	"/systest/backup/advanced-scenarios/",
	"/systest/backup/encryption",
	"/systest/backup/multi-tenancy",
	"/systest/tracing/jaeger1",
	"/systest/tracing/jaeger2",
	"/systest/online-restore",
}
+func isUnitOnly() bool { + return len(testsuite) == 1 && testsuite[0] == "unit" +} + func isValidPackageForSuite(pkg string) bool { valid := false if testSuiteContains("all") { @@ -927,22 +1053,28 @@ func isValidPackageForSuite(pkg string) bool { if testSuiteContains("load") { valid = valid || isLoadPackage(pkg) } + // "unit" = true unit tests (no --tags=integration, same package scope as integration) if testSuiteContains("unit") { - valid = valid || (!isLoadPackage(pkg) && !isLDBCPackage(pkg)) + valid = valid || (!isLoadPackage(pkg) && !isLDBCPackage(pkg) && !isHeavyPackage(pkg)) + } + // "integration" replaces old "unit" — everything except ldbc, load, and systest-heavy + if testSuiteContains("integration") { + valid = valid || (!isLoadPackage(pkg) && !isLDBCPackage(pkg) && !isHeavyPackage(pkg)) } if testSuiteContains("vector") { valid = valid || isVectorPackage(pkg) } - if testSuiteContains("systest") { - valid = valid || isSystestPackage(pkg) + // "systest" = both systest-baseline and systest-heavy (backward compatible) + if testSuiteContainsAny("systest", "systest-baseline") { + valid = valid || (isSystestPackage(pkg) && !isHeavyPackage(pkg)) + } + if testSuiteContainsAny("systest", "systest-heavy") { + valid = valid || isHeavyPackage(pkg) } if testSuiteContains("core") { valid = valid || isCorePackage(pkg) } - if valid { - return valid - } - return false + return valid } func isLoadPackage(pkg string) bool { @@ -980,6 +1112,15 @@ func isVectorPackage(pkg string) bool { return strings.HasSuffix(pkg, "/vector") } +func isHeavyPackage(pkg string) bool { + for _, p := range heavyPackages { + if strings.HasSuffix(pkg, p) || strings.Contains(pkg, p) { + return true + } + } + return false +} + var datafiles = map[string]string{ "1million-noindex.schema": "https://raw.githubusercontent.com/dgraph-io/dgraph-benchmarks/refs/heads/main/data/1million-noindex.schema", "1million.schema": 
"https://raw.githubusercontent.com/dgraph-io/dgraph-benchmarks/refs/heads/main/data/1million.schema", @@ -1168,6 +1309,18 @@ func run() error { fmt.Printf("Proc ID is %d\n", procId) fmt.Printf("Detected architecture: %s", runtime.GOARCH) + // Ensure $GOPATH/bin is in PATH so that tools installed via `go install` + // (e.g. protoc-gen-go) are found by subprocesses like protoc. + gopath := os.Getenv("GOPATH") + if gopath == "" { + gopath = filepath.Join(os.Getenv("HOME"), "go") + } + gopathBin := filepath.Join(gopath, "bin") + currentPath := os.Getenv("PATH") + if !strings.Contains(currentPath, gopathBin) { + os.Setenv("PATH", gopathBin+string(os.PathListSeparator)+currentPath) + } + start := time.Now() oc.Took(0, "START", time.Millisecond) @@ -1275,7 +1428,7 @@ func run() error { func validateAllowed(testSuite []string) { - allowed := []string{"all", "ldbc", "load", "unit", "systest", "vector", "core"} + allowed := []string{"all", "ldbc", "load", "unit", "integration", "systest", "systest-baseline", "systest-heavy", "vector", "core"} for _, str := range testSuite { onlyAllowed := false for _, allowedStr := range allowed { @@ -1284,7 +1437,7 @@ func validateAllowed(testSuite []string) { } } if !onlyAllowed { - log.Fatalf("Allowed options for suite are only all, load, ldbc or unit; passed in %+v", testSuite) + log.Fatalf("Allowed options for suite are: %s; passed in %+v", strings.Join(allowed, ", "), testSuite) } } } diff --git a/testutil/backup.go b/testutil/backup.go index bc614ad8a7a..5aaa672cb4c 100644 --- a/testutil/backup.go +++ b/testutil/backup.go @@ -53,13 +53,28 @@ func openDgraph(pdir string) (*badger.DB, error) { } func WaitForRestore(t *testing.T, dg *dgo.Dgraph, HttpSocket string) { + // Use a 10-minute overall deadline so the test fails quickly if the + // cluster is in a permanently degraded state (e.g. OOM-killed containers) + // rather than hanging until the Go test timeout. 
+ deadline := time.Now().Add(10 * time.Minute) + restoreDone := false - for { + for time.Now().Before(deadline) { resp, err := http.Get("http://" + HttpSocket + "/health") - require.NoError(t, err) + if err != nil { + // The health endpoint may be transiently unreachable while the + // server restarts during a restore. Keep retrying. + t.Logf("WaitForRestore: health check error (will retry): %v", err) + time.Sleep(1 * time.Second) + continue + } buf, err := io.ReadAll(resp.Body) - require.NoError(t, resp.Body.Close()) - require.NoError(t, err) + _ = resp.Body.Close() + if err != nil { + t.Logf("WaitForRestore: error reading health body (will retry): %v", err) + time.Sleep(1 * time.Second) + continue + } sbuf := string(buf) if !strings.Contains(sbuf, "opRestore") { restoreDone = true @@ -67,7 +82,7 @@ func WaitForRestore(t *testing.T, dg *dgo.Dgraph, HttpSocket string) { } time.Sleep(1 * time.Second) } - require.True(t, restoreDone) + require.True(t, restoreDone, "timed out waiting for restore operation to complete") // Wait for the client to exit draining mode. This is needed because the client might // be connected to a follower and might be behind the leader in applying the restore. @@ -75,7 +90,7 @@ func WaitForRestore(t *testing.T, dg *dgo.Dgraph, HttpSocket string) { // which the query succeeds at the first attempt because the follower is behind and // has not started to apply the restore proposal. numSuccess := 0 - for { + for time.Now().Before(deadline) { // This is a dummy query that returns no results. _, err := dg.NewTxn().Query(context.Background(), `{ q(func: has(invalid_pred)) { @@ -85,18 +100,36 @@ func WaitForRestore(t *testing.T, dg *dgo.Dgraph, HttpSocket string) { if err == nil { numSuccess++ } else { - require.Contains(t, err.Error(), "the server is in draining mode") + // During restore the server may be in draining mode, or it may be + // transiently unreachable (connection reset, TLS handshake failure, + // gRPC Unavailable, etc.). 
All of these are expected and retriable. + errMsg := err.Error() + transient := strings.Contains(errMsg, "the server is in draining mode") || + strings.Contains(errMsg, "Unavailable") || + strings.Contains(errMsg, "connection reset") || + strings.Contains(errMsg, "connection refused") || + strings.Contains(errMsg, "transport") || + strings.Contains(errMsg, "EOF") || + strings.Contains(errMsg, "overloaded") || + strings.Contains(errMsg, "context canceled") || + strings.Contains(errMsg, "Please retry") + if !transient { + require.Fail(t, "unexpected error while waiting for restore", + "error: %v", err) + } numSuccess = 0 } // Apply restore works differently with race enabled. // We are seeing delays in apply proposals hence failure of queries. if numSuccess == 10 { - // The server has been responsive three times in a row. + // The server has been responsive ten times in a row. break } time.Sleep(1 * time.Second) } + require.GreaterOrEqual(t, numSuccess, 10, + "timed out waiting for server to exit draining mode after restore") } // GetPredicateValues reads the specified p directory and returns the values for the given diff --git a/testutil/plugin.go b/testutil/plugin.go index 6beda3ea5a7..f3e4b0d5843 100644 --- a/testutil/plugin.go +++ b/testutil/plugin.go @@ -7,6 +7,7 @@ package testutil import ( "fmt" + "os" "os/exec" "path/filepath" "runtime" @@ -33,9 +34,19 @@ func GeneratePlugins(raceEnabled bool) { if raceEnabled { opts = append(opts, "-race") } - opts = append(opts, "-buildmode=plugin", "-o", so, src) + opts = append(opts, "-buildmode=plugin") + if runtime.GOOS != "linux" { + // Use the BFD linker; the default gold linker is not shipped + // with most cross-compiler toolchains. + opts = append(opts, "-ldflags", "-extldflags -fuse-ld=bfd") + } + opts = append(opts, "-o", so, src) cmd := exec.Command("go", opts...) 
cmd.Dir = filepath.Dir(curr) + cmd.Env = append(os.Environ(), "GOOS=linux", "GOARCH="+runtime.GOARCH) + if runtime.GOOS != "linux" { + cmd.Env = append(cmd.Env, "CGO_ENABLED=1", "CC="+linuxCC()) + } if out, err := cmd.CombinedOutput(); err != nil { fmt.Printf("Error: %v\n", err) fmt.Printf("Output: %v\n", string(out)) @@ -51,3 +62,19 @@ func GeneratePlugins(raceEnabled bool) { fmt.Printf("plugin build completed. Files are: %s\n", strings.Join(soFiles, ",")) } + +// linuxCC returns the C cross-compiler for targeting Linux from the current host. +// Respects the LINUX_CC environment variable if set. +func linuxCC() string { + if cc := os.Getenv("LINUX_CC"); cc != "" { + return cc + } + switch runtime.GOARCH { + case "arm64": + return "aarch64-unknown-linux-gnu-gcc" + case "amd64": + return "x86_64-unknown-linux-gnu-gcc" + default: + return "gcc" + } +}