Skip to content

Commit f5ddbbd

Browse files
Shiva
authored and committed
add more tests
1 parent b6c53f0 commit f5ddbbd

3 files changed

Lines changed: 33 additions & 51 deletions

File tree

dgraph/cmd/bulk/reduce.go

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -62,27 +62,28 @@ func (r *reducer) run() error {
6262

6363
if len(vectorIndexSpecs) > 0 {
6464
fmt.Printf("Creating shared vector database for %d vector predicate(s)\n", len(vectorIndexSpecs))
65+
// Track which predicates belong to which output shard
66+
predToOutputShard = make(map[string]int)
6567

66-
// Create single shared vectorTmpDb
6768
sharedVectorDb = r.createVectorTmpBadger()
6869

69-
// Initialize posting and schema ONCE (avoids race condition!)
7070
posting.Init(sharedVectorDb, 0, false)
7171
schema.Init(sharedVectorDb)
7272
for pred, sch := range r.schema.schemaMap {
73+
_, ok := vectorIndexSpecs[pred]
74+
if !ok {
75+
continue
76+
}
7377
schema.State().Set(pred, sch)
7478
}
75-
76-
// Track which predicates belong to which output shard
77-
predToOutputShard = make(map[string]int)
7879
}
7980

8081
thr := y.NewThrottle(r.opt.NumReducers)
8182
for i := range r.opt.ReduceShards {
8283
if err := thr.Do(); err != nil {
8384
return err
8485
}
85-
go func(shardId int, db *badger.DB, tmpDb *badger.DB) {
86+
go func(shardId int, db *badger.DB, tmpDb *badger.DB, vectorTmpDb *badger.DB) {
8687
defer thr.Done(nil)
8788

8889
mapFiles := filenamesInTree(dirs[shardId])
@@ -117,10 +118,10 @@ func (r *reducer) run() error {
117118

118119
// Create vector indexer using shared DB (if vectors exist)
119120
var vi *vectorIndexer
120-
if sharedVectorDb != nil && len(vectorIndexSpecs) > 0 {
121+
if vectorTmpDb != nil && len(vectorIndexSpecs) > 0 {
121122
fmt.Printf("Initializing vector indexer for shard %d with %d predicate(s)\n",
122123
shardId, len(vectorIndexSpecs))
123-
vi = newVectorIndexerShared(r, sharedVectorDb, vectorIndexSpecs,
124+
vi = newVectorIndexerShared(r, vectorTmpDb, vectorIndexSpecs,
124125
shardId, &predToShardMu, predToOutputShard)
125126
}
126127

@@ -149,7 +150,7 @@ func (r *reducer) run() error {
149150
fmt.Printf("Error while closing iterator: %v", err)
150151
}
151152
}
152-
}(i, r.createBadger(i), r.createTmpBadger())
153+
}(i, r.createBadger(i), r.createTmpBadger(), sharedVectorDb)
153154
}
154155
if err := thr.Finish(); err != nil {
155156
return err
@@ -294,8 +295,7 @@ func newMapIterator(filename string) (*pb.MapHeader, *mapIterator) {
294295
type encodeRequest struct {
295296
cbuf *z.Buffer
296297
countBuf *z.Buffer
297-
vectorBuf *z.Buffer // Buffer for vector entries to be indexed
298-
vi *vectorIndexer // Vector indexer for routing vector predicates to tmpDb
298+
vectorBuf *z.Buffer // Buffer for vector entries to be indexed
299299
wg *sync.WaitGroup
300300
listCh chan *z.Buffer
301301
splitCh chan *bpb.KVList
@@ -318,11 +318,11 @@ func (r *reducer) streamIdFor(pred string) uint32 {
318318
return streamId
319319
}
320320

321-
func (r *reducer) encode(entryCh chan *encodeRequest, closer *z.Closer) {
321+
func (r *reducer) encode(entryCh chan *encodeRequest, vi *vectorIndexer, closer *z.Closer) {
322322
defer closer.Done()
323323

324324
for req := range entryCh {
325-
r.toList(req)
325+
r.toList(req, vi)
326326
req.wg.Done()
327327
}
328328
}
@@ -470,6 +470,9 @@ func (r *reducer) startWriting(ci *countIndexer, vi *vectorIndexer, writerCh cha
470470

471471
count(req)
472472
if vi != nil {
473+
if err := vi.flushWriteBatch(); err != nil {
474+
glog.Errorf("Error flushing vector write batch before HNSW insertion: %v", err)
475+
}
473476
vector(req)
474477
}
475478
}
@@ -646,7 +649,7 @@ func (r *reducer) reduce(partitionKeys [][]byte, mapItrs []*mapIterator, ci *cou
646649
for range cpu {
647650
// Start listening to encode entries
648651
// For time being let's lease 100 stream id for each encoder.
649-
go r.encode(encoderCh, encoderCloser)
652+
go r.encode(encoderCh, vi, encoderCloser)
650653
}
651654
// Start listening to write the badger list.
652655
writerCloser := z.NewCloser(1)
@@ -661,7 +664,6 @@ func (r *reducer) reduce(partitionKeys [][]byte, mapItrs []*mapIterator, ci *cou
661664
listCh: make(chan *z.Buffer, 3),
662665
splitCh: ci.splitCh,
663666
countBuf: getBuf(r.opt.TmpDir),
664-
vi: vi,
665667
}
666668
// Only allocate vectorBuf when we have vector predicates to index
667669
if vi != nil {
@@ -733,7 +735,7 @@ func (r *reducer) reduce(partitionKeys [][]byte, mapItrs []*mapIterator, ci *cou
733735
writerCloser.SignalAndWait()
734736
}
735737

736-
func (r *reducer) toList(req *encodeRequest) {
738+
func (r *reducer) toList(req *encodeRequest, vi *vectorIndexer) {
737739
cbuf := req.cbuf
738740
defer func() {
739741
atomic.AddInt64(&r.prog.numEncoding, -int64(cbuf.LenNoPadding()))
@@ -888,8 +890,8 @@ func (r *reducer) toList(req *encodeRequest) {
888890
}
889891
}
890892

891-
// Check if this is a vector predicate that should be routed to tmpDb
892-
isVectorPred := req.vi != nil && pk.IsData() && req.vi.isVectorPredicate(pk.Attr)
893+
// Check if this is a vector predicate that should be routed to vectorTmpDb
894+
isVectorPred := vi != nil && pk.IsData() && vi.isVectorPredicate(pk.Attr)
893895

894896
shouldSplit := proto.Size(pl) > (1<<20)/2 && len(pl.Pack.Blocks) > 1
895897
if shouldSplit {
@@ -908,7 +910,7 @@ func (r *reducer) toList(req *encodeRequest) {
908910
// Vector predicates go to vectorTmpDb
909911
for _, kv := range kvs {
910912
kv.Version = writeVersionTs
911-
if err := req.vi.writeVectorKV(kv); err != nil {
913+
if err := vi.writeVectorKV(kv); err != nil {
912914
glog.Errorf("Error writing vector posting to tmpDb: %v", err)
913915
}
914916
}
@@ -931,7 +933,7 @@ func (r *reducer) toList(req *encodeRequest) {
931933

932934
if isVectorPred {
933935
// Vector predicates go to vectorTmpDb
934-
if err := req.vi.writeVectorKV(kv); err != nil {
936+
if err := vi.writeVectorKV(kv); err != nil {
935937
glog.Errorf("Error writing vector posting to tmpDb: %v", err)
936938
}
937939
} else {

dgraph/cmd/bulk/vector_indexer.go

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ type vectorIndexer struct {
4141
predToShardMu *sync.Mutex // Mutex for predToOutputShard (shared across shards)
4242
predToOutputShard map[string]int // Predicate → output shard mapping (shared across shards)
4343

44-
// For batched writes
44+
// For batched writes of vector posting lists to shared DB
4545
writeBatch *badger.WriteBatch
4646
writeCount int
4747

@@ -143,7 +143,6 @@ func unmarshalVectorEntry(data []byte) *vectorEntry {
143143
}
144144

145145
// newVectorIndexerShared creates a new vectorIndexer for a shard using a shared vectorTmpDb.
146-
// This avoids the global pstore race condition by using a single shared DB with posting.Init()
147146
// called once before any shards start. Indexers are created lazily when vectors arrive.
148147
func newVectorIndexerShared(r *reducer, sharedVectorDb *badger.DB, indexSpecs map[string]*pb.VectorIndexSpec,
149148
shardId int, predToShardMu *sync.Mutex, predToOutputShard map[string]int) *vectorIndexer {
@@ -161,13 +160,6 @@ func newVectorIndexerShared(r *reducer, sharedVectorDb *badger.DB, indexSpecs ma
161160
predToShardMu: predToShardMu,
162161
predToOutputShard: predToOutputShard,
163162
}
164-
165-
// NOTE: posting.Init() and schema.Init() are called ONCE in reduce.go
166-
// before any shards start, so we don't call them here.
167-
168-
// NOTE: Indexers are created LAZILY when vectors arrive for a predicate.
169-
// This avoids creating indexers for predicates that don't exist in this shard.
170-
171163
glog.Infof("Vector indexer created for shard %d (lazy initialization, %d potential predicates)",
172164
shardId, len(indexSpecs))
173165

@@ -180,7 +172,6 @@ func (vi *vectorIndexer) getOrCreateIndexer(pred string) (index.VectorIndex[floa
180172
vi.mu.Lock()
181173
defer vi.mu.Unlock()
182174

183-
// Already created?
184175
if indexer, ok := vi.indexers[pred]; ok {
185176
return indexer, vi.txnCaches[pred], nil
186177
}

systest/vector/load_test.go

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ func TestBulkLoadVectorIndex(t *testing.T) {
219219
t.Log("Step 6: Verifying vector similarity queries work on bulk loaded data...")
220220
fmt.Println("vectors: ", len(vectors))
221221
for i, vector := range vectors {
222-
similarVectors, err := targetGc.QueryMultipleVectorsUsingSimilarTo(vector, pred, 10)
222+
similarVectors, err := targetGc.QueryMultipleVectorsUsingSimilarTo(vector, pred, 5)
223223
require.NoError(t, err)
224224
require.GreaterOrEqual(t, len(similarVectors), 4,
225225
"similar_to query should return results for vector %d", i)
@@ -255,9 +255,6 @@ func vectorsEqual(a, b []float32) bool {
255255
return true
256256
}
257257

258-
// TestBulkLoadVectorIndexMultipleGroups tests bulk loading vector data with multiple
259-
// alpha groups (shards). This ensures vector indexing works correctly when predicates
260-
// are distributed across different shards.
261258
func TestBulkLoadVectorIndexMultipleGroups(t *testing.T) {
262259
// if runtime.GOOS != "linux" && os.Getenv("DGRAPH_BINARY") == "" {
263260
// fmt.Println("You can set the DGRAPH_BINARY environment variable to path of a native dgraph binary to run these tests")
@@ -415,7 +412,7 @@ func TestBulkLoadVectorIndexMultipleGroups(t *testing.T) {
415412
sampleSize := 10
416413

417414
for i := 0; i < sampleSize; i++ {
418-
similarVectors, err := targetGc.QueryMultipleVectorsUsingSimilarTo(vectors[i], pred, 10)
415+
similarVectors, err := targetGc.QueryMultipleVectorsUsingSimilarTo(vectors[i], pred, 5)
419416
require.NoError(t, err)
420417
require.GreaterOrEqual(t, len(similarVectors), 4,
421418
"similar_to query should return results for predicate %s vector %d", pred, i)
@@ -437,7 +434,7 @@ func TestBulkLoadMixedPredicates(t *testing.T) {
437434

438435
// Schema with vectors AND other indexed predicates
439436
mixedSchema := `
440-
vec_embedding: float32vector @index(hnsw(exponent: "5", metric: "euclidean")) .
437+
project_description_v: float32vector @index(hnsw(exponent: "5", metric: "euclidean")) .
441438
name: string @index(term, fulltext) .
442439
age: int @index(int) .
443440
score: float .
@@ -485,7 +482,7 @@ func TestBulkLoadMixedPredicates(t *testing.T) {
485482
vecStr := fmt.Sprintf(`"[%s]"`, strings.Trim(strings.Join(strings.Fields(fmt.Sprint(vec)), ", "), "[]"))
486483

487484
// Add vector predicate
488-
rdfBuilder.WriteString(fmt.Sprintf("<0x%x> <vec_embedding> %s .\n", uid, vecStr))
485+
rdfBuilder.WriteString(fmt.Sprintf("<0x%x> <project_description_v> %s .\n", uid, vecStr))
489486
// Add string predicate
490487
rdfBuilder.WriteString(fmt.Sprintf("<0x%x> <name> \"Person %d\" .\n", uid, i))
491488
// Add int predicate
@@ -597,7 +594,7 @@ func TestBulkLoadMixedPredicates(t *testing.T) {
597594

598595
// Verify vector similarity query
599596
similarQuery := fmt.Sprintf(`{
600-
vector(func: similar_to(vec_embedding, 5, "%v")) {
597+
vector(func: similar_to(project_description_v, 5, "%v")) {
601598
uid
602599
name
603600
}
@@ -610,8 +607,6 @@ func TestBulkLoadMixedPredicates(t *testing.T) {
610607
t.Log("All mixed predicate types verified successfully!")
611608
}
612609

613-
// TestBulkLoadVectorDimensions tests bulk loading vectors with different dimensions
614-
// to ensure the implementation handles various vector sizes correctly.
615610
func TestBulkLoadVectorDimensions(t *testing.T) {
616611
// if runtime.GOOS != "linux" && os.Getenv("DGRAPH_BINARY") == "" {
617612
// t.Skip("Skipping test on non-Linux platforms due to dgraph binary dependency")
@@ -630,7 +625,7 @@ func TestBulkLoadVectorDimensions(t *testing.T) {
630625

631626
for _, tc := range testCases {
632627
t.Run(tc.name, func(t *testing.T) {
633-
predName := fmt.Sprintf("vec_%s", tc.name)
628+
predName := "project_description_v"
634629
schema := fmt.Sprintf(`%s: float32vector @index(hnsw(exponent: "5", metric: "euclidean")) .`, predName)
635630

636631
// Step 1: Create source cluster
@@ -704,7 +699,7 @@ func TestBulkLoadVectorDimensions(t *testing.T) {
704699

705700
targetCluster, err := dgraphtest.NewLocalCluster(targetConf)
706701
require.NoError(t, err)
707-
defer func() { targetCluster.Cleanup(t.Failed()) }()
702+
// defer func() { targetCluster.Cleanup(t.Failed()) }()
708703
require.NoError(t, targetCluster.Start())
709704

710705
targetGc, targetCleanup, err := targetCluster.Client()
@@ -724,15 +719,13 @@ func TestBulkLoadVectorDimensions(t *testing.T) {
724719
similarVectors, err := targetGc.QueryMultipleVectorsUsingSimilarTo(vector, predName, 5)
725720
require.NoError(t, err)
726721
require.GreaterOrEqual(t, len(similarVectors), 4,
727-
"similar_to query should return results for vector %d")
722+
"similar_to query should return results for vector")
728723
}
729724

730725
})
731726
}
732727
}
733728

734-
// TestBulkLoadVectorMetrics tests bulk loading vectors with different distance metrics
735-
// (euclidean, cosine, dotproduct) to ensure all HNSW configurations work correctly.
736729
func TestBulkLoadVectorMetrics(t *testing.T) {
737730
if runtime.GOOS != "linux" && os.Getenv("DGRAPH_BINARY") == "" {
738731
t.Skip("Skipping test on non-Linux platforms due to dgraph binary dependency")
@@ -741,13 +734,13 @@ func TestBulkLoadVectorMetrics(t *testing.T) {
741734
metrics := []string{"euclidean", "cosine", "dotproduct"}
742735
numVectors := 200
743736
vectorDim := 10
737+
predName := "project_description_v"
744738

745739
// Build schema with all metric types
746740
var schemaBuilder strings.Builder
747741
for _, metric := range metrics {
748742
schemaBuilder.WriteString(fmt.Sprintf(
749-
"vec_%s: float32vector @index(hnsw(exponent: \"5\", metric: \"%s\")) .\n",
750-
metric, metric))
743+
"project_description_v: float32vector @index(hnsw(exponent: \"5\", metric: \"%s\")) .\n", metric))
751744
}
752745
schema := schemaBuilder.String()
753746

@@ -779,7 +772,6 @@ func TestBulkLoadVectorMetrics(t *testing.T) {
779772
// Generate and load vectors for each metric type
780773
allVectors := make(map[string][][]float32)
781774
for _, metric := range metrics {
782-
predName := fmt.Sprintf("vec_%s", metric)
783775
rdfs, vectors := dgraphapi.GenerateRandomVectors(0, numVectors, vectorDim, predName)
784776
allVectors[predName] = vectors
785777

@@ -845,7 +837,6 @@ func TestBulkLoadVectorMetrics(t *testing.T) {
845837
// Step 5: Verify each metric type
846838
t.Log("Step 5: Verifying each metric type...")
847839
for _, metric := range metrics {
848-
predName := fmt.Sprintf("vec_%s", metric)
849840
vectors := allVectors[predName]
850841

851842
// Verify count
@@ -872,8 +863,6 @@ func TestBulkLoadVectorMetrics(t *testing.T) {
872863
t.Log("All vector metrics verified successfully!")
873864
}
874865

875-
// TestBulkLoadVectorEdgeCases tests edge cases like empty vector predicates,
876-
// single vector, and predicates with no data.
877866
func TestBulkLoadVectorEdgeCases(t *testing.T) {
878867
// if runtime.GOOS != "linux" && os.Getenv("DGRAPH_BINARY") == "" {
879868
// t.Skip("Skipping test on non-Linux platforms due to dgraph binary dependency")

0 commit comments

Comments (0)