Skip to content

Commit b318667

Browse files
feat(search): add BM25 ranked text search
Add BM25 relevance-ranked text search to Dgraph, enabling users to query text predicates and receive results ordered by relevance score instead of boolean matching.

Implementation:
- New BM25 tokenizer using the fulltext pipeline (normalize, stopwords, stem) that preserves term frequencies for TF counting
- BM25-specific index storage: per-term TF posting lists, doc-length lists, and corpus statistics (doc count, total terms)
- Query execution with full BM25 scoring:
    score = IDF * (k + 1) * tf / (k * (1 - b + b * dl/avgDL) + tf)
    IDF   = log1p((N - df + 0.5) / (df + 0.5))
- DQL syntax: bm25(predicate, "query" [, "k", "b"]) as a root function or filter
- Schema syntax: @index(bm25)
- Parameter validation (k > 0, 0 <= b <= 1)
- Early UID intersection for filter-mode performance
- Handling of all-stopword documents and queries

Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent 4120211 commit b318667

10 files changed

Lines changed: 861 additions & 3 deletions

File tree

dql/parser.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1701,7 +1701,7 @@ func validFuncName(name string) bool {
17011701

17021702
switch name {
17031703
case "regexp", "anyofterms", "allofterms", "alloftext", "anyoftext", "ngram",
1704-
"has", "uid", "uid_in", "anyof", "allof", "type", "match", "similar_to":
1704+
"has", "uid", "uid_in", "anyof", "allof", "type", "match", "similar_to", "bm25":
17051705
return true
17061706
}
17071707
return false

posting/index.go

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ func indexTokens(ctx context.Context, info *indexMutationInfo) ([]string, error)
6868

6969
var tokens []string
7070
for _, it := range info.tokenizers {
71+
// BM25 tokenizer is handled separately in addBM25IndexMutations.
72+
if it.Identifier() == tok.IdentBM25 {
73+
continue
74+
}
7175
toks, err := tok.BuildTokens(sv.Value, tok.GetTokenizerForLang(it, lang))
7276
if err != nil {
7377
return tokens, err
@@ -179,6 +183,17 @@ func (txn *Txn) addIndexMutations(ctx context.Context, info *indexMutationInfo)
179183
}
180184
}
181185

186+
// Check if any tokenizer is BM25 and handle separately.
187+
for _, it := range info.tokenizers {
188+
if _, ok := tok.GetTokenizerForLang(it, info.edge.GetLang()).(tok.BM25Tokenizer); ok {
189+
if err := txn.addBM25IndexMutations(ctx, info); err != nil {
190+
return []*pb.DirectedEdge{}, err
191+
}
192+
// Continue to process remaining non-BM25 tokenizers below.
193+
continue
194+
}
195+
}
196+
182197
tokens, err := indexTokens(ctx, info)
183198
if err != nil {
184199
// This data is not indexable
@@ -215,6 +230,174 @@ func (txn *Txn) addIndexMutation(ctx context.Context, edge *pb.DirectedEdge, tok
215230
return nil
216231
}
217232

233+
// addBM25IndexMutations handles index mutations for the BM25 tokenizer.
234+
// It stores term frequencies, document lengths, and corpus statistics.
235+
func (txn *Txn) addBM25IndexMutations(ctx context.Context, info *indexMutationInfo) error {
236+
attr := info.edge.Attr
237+
uid := info.edge.Entity
238+
lang := info.edge.GetLang()
239+
240+
schemaType, err := schema.State().TypeOf(attr)
241+
if err != nil || !schemaType.IsScalar() {
242+
return errors.Errorf("Cannot BM25 index attribute %s of type object.", attr)
243+
}
244+
245+
sv, err := types.Convert(info.val, schemaType)
246+
if err != nil {
247+
return err
248+
}
249+
250+
bm25Tok := tok.BM25Tokenizer{}
251+
termFreqs, docLen, err := bm25Tok.TokensWithFrequency(sv.Value, lang)
252+
if err != nil {
253+
return err
254+
}
255+
256+
// Skip documents that tokenize to zero terms (e.g., all stopwords).
257+
if docLen == 0 {
258+
return nil
259+
}
260+
261+
if info.op == pb.DirectedEdge_DEL {
262+
// For DELETE: remove uid from all BM25 term posting lists, doc length list,
263+
// and decrement corpus stats.
264+
for term := range termFreqs {
265+
encodedTerm := string([]byte{tok.IdentBM25}) + term
266+
key := x.BM25IndexKey(attr, encodedTerm)
267+
plist, err := txn.cache.GetFromDelta(key)
268+
if err != nil {
269+
return err
270+
}
271+
edge := &pb.DirectedEdge{
272+
ValueId: uid,
273+
Attr: attr,
274+
Op: pb.DirectedEdge_DEL,
275+
}
276+
if err := plist.addMutation(ctx, txn, edge); err != nil {
277+
return err
278+
}
279+
}
280+
// Remove doc length entry.
281+
dlKey := x.BM25DocLenKey(attr)
282+
dlPlist, err := txn.cache.GetFromDelta(dlKey)
283+
if err != nil {
284+
return err
285+
}
286+
dlEdge := &pb.DirectedEdge{
287+
ValueId: uid,
288+
Attr: attr,
289+
Op: pb.DirectedEdge_DEL,
290+
}
291+
if err := dlPlist.addMutation(ctx, txn, dlEdge); err != nil {
292+
return err
293+
}
294+
295+
// Update corpus stats: decrement doc count and total terms.
296+
return txn.updateBM25Stats(ctx, attr, -1, -int64(docLen))
297+
}
298+
299+
// For SET: store term frequencies, doc length, and update corpus stats.
300+
for term, tf := range termFreqs {
301+
encodedTerm := string([]byte{tok.IdentBM25}) + term
302+
key := x.BM25IndexKey(attr, encodedTerm)
303+
plist, err := txn.cache.GetFromDelta(key)
304+
if err != nil {
305+
return err
306+
}
307+
// Store uid in the posting list. The TF is encoded in the Value field.
308+
tfBuf := make([]byte, 4)
309+
binary.BigEndian.PutUint32(tfBuf, tf)
310+
edge := &pb.DirectedEdge{
311+
ValueId: uid,
312+
Attr: attr,
313+
Value: tfBuf,
314+
ValueType: pb.Posting_INT,
315+
Op: pb.DirectedEdge_SET,
316+
}
317+
if err := plist.addMutation(ctx, txn, edge); err != nil {
318+
return err
319+
}
320+
}
321+
322+
// Store document length.
323+
dlKey := x.BM25DocLenKey(attr)
324+
dlPlist, err := txn.cache.GetFromDelta(dlKey)
325+
if err != nil {
326+
return err
327+
}
328+
dlBuf := make([]byte, 4)
329+
binary.BigEndian.PutUint32(dlBuf, docLen)
330+
dlEdge := &pb.DirectedEdge{
331+
ValueId: uid,
332+
Attr: attr,
333+
Value: dlBuf,
334+
ValueType: pb.Posting_INT,
335+
Op: pb.DirectedEdge_SET,
336+
}
337+
if err := dlPlist.addMutation(ctx, txn, dlEdge); err != nil {
338+
return err
339+
}
340+
341+
// Update corpus stats: increment doc count by 1 and total terms by docLen.
342+
return txn.updateBM25Stats(ctx, attr, 1, int64(docLen))
343+
}
344+
345+
// updateBM25Stats reads the current corpus statistics for a BM25-indexed attribute,
346+
// applies the given deltas, and writes back.
347+
func (txn *Txn) updateBM25Stats(ctx context.Context, attr string, docCountDelta int64, totalTermsDelta int64) error {
348+
statsKey := x.BM25StatsKey(attr)
349+
plist, err := txn.cache.GetFromDelta(statsKey)
350+
if err != nil {
351+
return err
352+
}
353+
354+
// Read existing stats from posting with uid=1.
355+
var docCount, totalTerms uint64
356+
val, err := plist.Value(txn.StartTs)
357+
if err == nil && val.Value != nil {
358+
data, ok := val.Value.([]byte)
359+
if ok && len(data) == 16 {
360+
docCount = binary.BigEndian.Uint64(data[0:8])
361+
totalTerms = binary.BigEndian.Uint64(data[8:16])
362+
}
363+
}
364+
365+
// Apply deltas.
366+
if docCountDelta >= 0 {
367+
docCount += uint64(docCountDelta)
368+
} else {
369+
dec := uint64(-docCountDelta)
370+
if dec > docCount {
371+
docCount = 0
372+
} else {
373+
docCount -= dec
374+
}
375+
}
376+
if totalTermsDelta >= 0 {
377+
totalTerms += uint64(totalTermsDelta)
378+
} else {
379+
dec := uint64(-totalTermsDelta)
380+
if dec > totalTerms {
381+
totalTerms = 0
382+
} else {
383+
totalTerms -= dec
384+
}
385+
}
386+
387+
// Write back stats.
388+
statsBuf := make([]byte, 16)
389+
binary.BigEndian.PutUint64(statsBuf[0:8], docCount)
390+
binary.BigEndian.PutUint64(statsBuf[8:16], totalTerms)
391+
edge := &pb.DirectedEdge{
392+
Entity: 1,
393+
Attr: attr,
394+
Value: statsBuf,
395+
ValueType: pb.Posting_ValType(0),
396+
Op: pb.DirectedEdge_SET,
397+
}
398+
return plist.addMutation(ctx, txn, edge)
399+
}
400+
218401
// countParams is sent to updateCount function. It is used to update the count index.
219402
// It deletes the uid from the key corresponding to <attr, countBefore> and adds it
220403
// to <attr, countAfter>.

query/common_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,11 @@ func populateCluster(dc dgraphapi.Cluster) {
390390
testSchema += "\ndescription: string @index(ngram) ."
391391
}
392392

393+
// BM25 indexing - uses same version gate as ngram for now
394+
if ngramSupport {
395+
testSchema += "\ndescription_bm25: string @index(bm25) ."
396+
}
397+
393398
setSchema(testSchema)
394399

395400
err = addTriplesToCluster(`
@@ -1007,4 +1012,16 @@ func populateCluster(dc dgraphapi.Cluster) {
10071012
<415> <description> "Linguistic analysis helps understand text meaning" .
10081013
`)
10091014
x.Panic(err)
1015+
1016+
// Add data for BM25 tests - uses separate predicate to avoid conflicts
1017+
err = addTriplesToCluster(`
1018+
<501> <description_bm25> "The quick brown fox jumps over the lazy dog" .
1019+
<502> <description_bm25> "A quick brown fox leaps over a sleeping dog" .
1020+
<503> <description_bm25> "fox fox fox" .
1021+
<504> <description_bm25> "The lazy dog sleeps under the warm sun all day long in the garden" .
1022+
<505> <description_bm25> "Dogs are loyal companions to humans and families everywhere" .
1023+
<506> <description_bm25> "Quick movements help foxes catch their prey in the wild" .
1024+
<507> <description_bm25> "Brown foxes are quick and agile animals in the forest" .
1025+
`)
1026+
x.Panic(err)
10101027
}

0 commit comments

Comments
 (0)