Skip to content

Commit 7929232

Browse files
fix(pipeline): InsertTokenizerIndexes deadlocks on uids >= 2^63
When loading via dgraph live (or any mutation source whose uids span the full uint64 range, including xidmap-assigned uids), the per-predicate pipeline hung indefinitely on the very first batch with zero forward progress. A goroutine dump showed the dispatcher goroutine wedged on `chan send (nil chan)` at the line `chMap[int(uid)%numGo] <- uid`. uid is uint64. Casting directly to int produces a negative value for uid >= 2^63, so int(uid)%10 can be in [-9, -1]. chMap[-3] returns the zero value for a chan uint64, which is a nil channel; sending on a nil channel blocks forever. The 10 worker goroutines (also created here) were idle on `for uid := range uids` since no uids ever reached them, so the parent `wg.Wait()` and the surrounding errgroup never returned. applyMutations therefore never released the txn, the alpha's old-txn abort loop kept retrying every minute, and live-load showed "Txns: 0 N-Quads: 0" indefinitely. Fix: hash unsigned, then cast: `chMap[int(uid%uint64(numGo))]`. Verified end-to-end with the live loader against the 1million.rdf.gz benchmark dataset (1,041,684 n-quads, schema mixes [uid] @reverse @count, [uid] @count, datetime @index(year), string @index(...) @lang, geo @index(geo), string @index(exact) @upsert): legacy : 13.85s / 14.74s (avg ~14.3s, ~77k n-quads/s) pipeline : 9.65s / 9.36s (avg ~9.5s, ~116k n-quads/s). That is ~1.50x faster on a realistic multi-predicate, multi-index workload — i.e. the case the per-predicate runner pipeline is built for. Also adds worker/pipeline_bench_test.go: in-process Go benchmarks comparing legacy runMutation vs newRunMutations across a matrix of (predicates, edges-per-predicate, indexed/non-indexed) shapes. They show the pipeline loses ~2x on tiny mutations (1-10 edges) and wins 1.2x-1.55x on bulk (10 preds x 100+ edges, indexed or not), which is why the feature flag stays default-off and the live-loader speedup above is the right place to evaluate this work.
Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
1 parent 43b826d commit 7929232

2 files changed

Lines changed: 215 additions & 1 deletion

File tree

posting/index.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,11 @@ func (mp *MutationPipeline) InsertTokenizerIndexes(ctx context.Context, pipeline
233233
}
234234

235235
for uid := range *postings {
236-
chMap[int(uid)%numGo] <- uid
236+
// uid is uint64; converting directly to int can produce a negative
237+
// value for uid >= 2^63, which would index outside chMap and resolve
238+
// to a nil channel (deadlocks the dispatcher). Hash unsigned, then
239+
// cast.
240+
chMap[int(uid%uint64(numGo))] <- uid
237241
}
238242

239243
for i := 0; i < numGo; i++ {

worker/pipeline_bench_test.go

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
/*
2+
* SPDX-FileCopyrightText: © 2017-2025 Istari Digital, Inc.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package worker
7+
8+
import (
9+
"context"
10+
"fmt"
11+
"os"
12+
"testing"
13+
14+
"github.com/dgraph-io/badger/v4"
15+
"github.com/dgraph-io/dgraph/v25/posting"
16+
"github.com/dgraph-io/dgraph/v25/protos/pb"
17+
"github.com/dgraph-io/dgraph/v25/schema"
18+
"github.com/dgraph-io/dgraph/v25/x"
19+
)
20+
21+
// Benchmarks comparing the legacy serial mutation path (runMutation per edge)
22+
// with the new per-predicate mutation pipeline (newRunMutations).
23+
//
24+
// What the pipeline ought to win on:
25+
// - many predicates per transaction → one goroutine per predicate
26+
// - many indexed edges per predicate → 10-way intra-predicate
27+
// parallelism on tokenization
28+
//
29+
// What it shouldn't help (and may even regress):
30+
// - tiny mutations (1-2 edges, 1 predicate) where goroutine spin-up cost
31+
// dominates the mutation work
32+
//
33+
// Each iteration is a single transaction: build a fresh batch of edges,
// run mutations, txn.Update(), CommitToDisk. Edge construction is
// deliberately inside the timed region (b.ResetTimer fires before the
// loop, not before each batch) because that per-transaction cost is
// exactly what the pipeline is supposed to amortize.
37+
38+
func benchSetup(b *testing.B, schemaTxt string) *badger.DB {
39+
b.Helper()
40+
dir, err := os.MkdirTemp("", "pipeline_bench_")
41+
if err != nil {
42+
b.Fatal(err)
43+
}
44+
b.Cleanup(func() { _ = os.RemoveAll(dir) })
45+
46+
ps, err := badger.OpenManaged(badger.DefaultOptions(dir).WithLoggingLevel(badger.ERROR))
47+
if err != nil {
48+
b.Fatal(err)
49+
}
50+
b.Cleanup(func() { _ = ps.Close() })
51+
52+
posting.Init(ps, 0, false)
53+
Init(ps)
54+
posting.Oracle().ResetTxns()
55+
if err := schema.ParseBytes([]byte(schemaTxt), 1); err != nil {
56+
b.Fatal(err)
57+
}
58+
return ps
59+
}
60+
61+
// buildEdges constructs numPreds*edgesPerPred edges across distinct predicates,
62+
// indexed-string-valued. The same generator drives both legacy and pipeline
63+
// runs so the input is identical.
64+
func buildEdges(numPreds, edgesPerPred int, baseUid uint64) []*pb.DirectedEdge {
65+
edges := make([]*pb.DirectedEdge, 0, numPreds*edgesPerPred)
66+
for p := 0; p < numPreds; p++ {
67+
attr := x.AttrInRootNamespace(fmt.Sprintf("p%d", p))
68+
for e := 0; e < edgesPerPred; e++ {
69+
edges = append(edges, &pb.DirectedEdge{
70+
Entity: baseUid + uint64(e),
71+
Attr: attr,
72+
Value: []byte(fmt.Sprintf("v%d_%d", p, e)),
73+
ValueType: pb.Posting_STRING,
74+
Op: pb.DirectedEdge_SET,
75+
})
76+
}
77+
}
78+
return edges
79+
}
80+
81+
// schemaForPreds emits "p0: string @index(exact) ., p1: ..., ..." (or no
82+
// index, depending on indexed). Each predicate is a distinct list-or-scalar.
83+
// schemaForPreds renders one schema line per predicate, e.g.
// "p0: string @index(exact) .\n". The value type is either scalar
// string or [string] (list), optionally carrying an exact-string index.
func schemaForPreds(numPreds int, indexed bool, list bool) string {
	// Type and index directive do not depend on the predicate number,
	// so compute them once up front.
	typeName := "string"
	if list {
		typeName = "[string]"
	}
	indexDirective := ""
	if indexed {
		indexDirective = " @index(exact)"
	}
	var out []byte
	for p := 0; p < numPreds; p++ {
		out = fmt.Appendf(out, "p%d: %s%s .\n", p, typeName, indexDirective)
	}
	return string(out)
}
98+
99+
// runOne executes one transaction's mutations through the chosen path.
100+
// startTs/commitTs must be unique per call.
101+
func runOnePipeline(b *testing.B, ps *badger.DB, edges []*pb.DirectedEdge, startTs, commitTs uint64) {
102+
b.Helper()
103+
txn := posting.Oracle().RegisterStartTs(startTs)
104+
if err := newRunMutations(context.Background(), edges, txn); err != nil {
105+
b.Fatal(err)
106+
}
107+
txn.Update()
108+
w := posting.NewTxnWriter(ps)
109+
if err := txn.CommitToDisk(w, commitTs); err != nil {
110+
b.Fatal(err)
111+
}
112+
if err := w.Flush(); err != nil {
113+
b.Fatal(err)
114+
}
115+
txn.UpdateCachedKeys(commitTs)
116+
}
117+
118+
func runOneLegacy(b *testing.B, ps *badger.DB, edges []*pb.DirectedEdge, startTs, commitTs uint64) {
119+
b.Helper()
120+
txn := posting.Oracle().RegisterStartTs(startTs)
121+
for _, e := range edges {
122+
if err := runMutation(context.Background(), e, txn); err != nil {
123+
b.Fatal(err)
124+
}
125+
}
126+
txn.Update()
127+
w := posting.NewTxnWriter(ps)
128+
if err := txn.CommitToDisk(w, commitTs); err != nil {
129+
b.Fatal(err)
130+
}
131+
if err := w.Flush(); err != nil {
132+
b.Fatal(err)
133+
}
134+
txn.UpdateCachedKeys(commitTs)
135+
}
136+
137+
// runBench runs sub-benchmarks (legacy vs pipeline) for a single
138+
// (numPreds, edgesPerPred, indexed, list) configuration.
139+
func runBench(b *testing.B, numPreds, edgesPerPred int, indexed, list bool) {
140+
for _, mode := range []struct {
141+
name string
142+
fn func(*testing.B, *badger.DB, []*pb.DirectedEdge, uint64, uint64)
143+
}{
144+
{"legacy", runOneLegacy},
145+
{"pipeline", runOnePipeline},
146+
} {
147+
b.Run(mode.name, func(b *testing.B) {
148+
ps := benchSetup(b, schemaForPreds(numPreds, indexed, list))
149+
b.ReportAllocs()
150+
b.ResetTimer()
151+
ts := uint64(10)
152+
for i := 0; i < b.N; i++ {
153+
edges := buildEdges(numPreds, edgesPerPred, uint64(i)*1_000_000+1)
154+
mode.fn(b, ps, edges, ts, ts+1)
155+
ts += 2
156+
}
157+
})
158+
}
159+
}
160+
161+
// 1 predicate, 1 edge — smallest possible mutation. Pipeline overhead
162+
// is most visible here.
163+
func BenchmarkMutate_1pred_1edge_indexed(b *testing.B) {
164+
runBench(b, 1, 1, true, false)
165+
}
166+
167+
// 1 predicate, 100 indexed edges — exercises intra-predicate
168+
// tokenization parallelism.
169+
func BenchmarkMutate_1pred_100edges_indexed(b *testing.B) {
170+
runBench(b, 1, 100, true, false)
171+
}
172+
173+
// 10 predicates, 1 edge each — per-predicate parallelism with light work
174+
// per predicate.
175+
func BenchmarkMutate_10preds_1edge_indexed(b *testing.B) {
176+
runBench(b, 10, 1, true, false)
177+
}
178+
179+
// 10 predicates, 100 edges each — full benefit case: per-predicate AND
180+
// intra-predicate parallelism on indexed work.
181+
func BenchmarkMutate_10preds_100edges_indexed(b *testing.B) {
182+
runBench(b, 10, 100, true, false)
183+
}
184+
185+
// 1 predicate, 1000 indexed edges — heavy intra-predicate.
186+
func BenchmarkMutate_1pred_1000edges_indexed(b *testing.B) {
187+
runBench(b, 1, 1000, true, false)
188+
}
189+
190+
// 10 predicates, 1000 edges each — large mutation, indexed.
191+
func BenchmarkMutate_10preds_1000edges_indexed(b *testing.B) {
192+
runBench(b, 10, 1000, true, false)
193+
}
194+
195+
// Non-indexed counterparts isolate per-predicate parallelism from the
196+
// tokenization parallelism.
197+
func BenchmarkMutate_10preds_1000edges_noindex(b *testing.B) {
198+
runBench(b, 10, 1000, false, false)
199+
}
200+
201+
// Very large indexed mutation: 50 predicates × 1000 edges each = 50k edges.
202+
// Where the pipeline should shine most.
203+
func BenchmarkMutate_50preds_1000edges_indexed(b *testing.B) {
204+
runBench(b, 50, 1000, true, false)
205+
}
206+
207+
// 50 predicates, 100 edges each (5k edges) — typical-ish bulk write shape.
208+
func BenchmarkMutate_50preds_100edges_indexed(b *testing.B) {
209+
runBench(b, 50, 100, true, false)
210+
}

0 commit comments

Comments
 (0)