Skip to content

Commit 4e78b88

Browse files
fix(bm25): store TF/doclen in facets and fix query pipeline integration
Three critical bugs fixed: 1. REF postings lose Value during rollup: The posting list encode/rollup cycle strips the Value field from REF postings without facets (list.go:1630). BM25 term frequencies and doc lengths were stored in Value and lost. Fix: Store TF and doclen as facets on REF postings, which are preserved. 2. Missing function validation: query/query.go has a separate isValidFuncName check from dql/parser.go. "bm25" was only added to the parser, causing "Invalid function name: bm25" at query time. 3. Unsorted UIDs break query pipeline: BM25 returned UIDs sorted by score, but the query pipeline (algo.MergeSorted, child predicate fetching) requires UID-ascending order. Fix: Sort UIDs ascending in UidMatrix, apply first/offset pagination on score-sorted results before UID sorting. Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent b318667 commit 4e78b88

4 files changed

Lines changed: 55 additions & 26 deletions

File tree

posting/index.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"github.com/dgraph-io/badger/v4"
2929
"github.com/dgraph-io/badger/v4/options"
3030
bpb "github.com/dgraph-io/badger/v4/pb"
31+
"github.com/dgraph-io/dgo/v250/protos/api"
3132
"github.com/dgraph-io/dgraph/v25/protos/pb"
3233
"github.com/dgraph-io/dgraph/v25/schema"
3334
"github.com/dgraph-io/dgraph/v25/tok"
@@ -304,15 +305,15 @@ func (txn *Txn) addBM25IndexMutations(ctx context.Context, info *indexMutationIn
304305
if err != nil {
305306
return err
306307
}
307-
// Store uid in the posting list. The TF is encoded in the Value field.
308+
// Store uid in the posting list. TF is stored as a facet so it survives
309+
// the rollup cycle (REF postings without facets lose their Value field).
308310
tfBuf := make([]byte, 4)
309311
binary.BigEndian.PutUint32(tfBuf, tf)
310312
edge := &pb.DirectedEdge{
311-
ValueId: uid,
312-
Attr: attr,
313-
Value: tfBuf,
314-
ValueType: pb.Posting_INT,
315-
Op: pb.DirectedEdge_SET,
313+
ValueId: uid,
314+
Attr: attr,
315+
Op: pb.DirectedEdge_SET,
316+
Facets: []*api.Facet{{Key: "tf", Value: tfBuf, ValType: api.Facet_INT}},
316317
}
317318
if err := plist.addMutation(ctx, txn, edge); err != nil {
318319
return err
@@ -328,11 +329,10 @@ func (txn *Txn) addBM25IndexMutations(ctx context.Context, info *indexMutationIn
328329
dlBuf := make([]byte, 4)
329330
binary.BigEndian.PutUint32(dlBuf, docLen)
330331
dlEdge := &pb.DirectedEdge{
331-
ValueId: uid,
332-
Attr: attr,
333-
Value: dlBuf,
334-
ValueType: pb.Posting_INT,
335-
Op: pb.DirectedEdge_SET,
332+
ValueId: uid,
333+
Attr: attr,
334+
Op: pb.DirectedEdge_SET,
335+
Facets: []*api.Facet{{Key: "dl", Value: dlBuf, ValType: api.Facet_INT}},
336336
}
337337
if err := dlPlist.addMutation(ctx, txn, dlEdge); err != nil {
338338
return err

query/query.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2751,7 +2751,7 @@ func isValidArg(a string) bool {
27512751
func isValidFuncName(f string) bool {
27522752
switch f {
27532753
case "anyofterms", "allofterms", "val", "regexp", "anyoftext", "alloftext", "ngram",
2754-
"has", "uid", "uid_in", "anyof", "allof", "type", "match", "similar_to":
2754+
"has", "uid", "uid_in", "anyof", "allof", "type", "match", "similar_to", "bm25":
27552755
return true
27562756
}
27572757
return isInequalityFn(f) || types.IsGeoFunc(f)

query/query_bm25_test.go

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ package query
1010

1111
import (
1212
"context"
13-
"strings"
1413
"testing"
1514

1615
"github.com/stretchr/testify/require"
@@ -32,6 +31,8 @@ func TestBM25Basic(t *testing.T) {
3231
}
3332

3433
func TestBM25Ordering(t *testing.T) {
34+
// BM25 returns all matching documents. Use first:1 to verify the highest-scored
35+
// document is "fox fox fox" (tf=3, short doc).
3536
query := `
3637
{
3738
me(func: bm25(description_bm25, "fox")) {
@@ -41,14 +42,22 @@ func TestBM25Ordering(t *testing.T) {
4142
}
4243
`
4344
js := processQueryNoErr(t, query)
44-
// Document 503 has "fox fox fox" (tf=3, short doc) so should rank highest.
45-
// Verify it appears before other fox-containing documents in the output.
46-
foxFoxFoxIdx := strings.Index(js, "fox fox fox")
47-
quickBrownIdx := strings.Index(js, "quick brown fox jumps")
48-
require.Greater(t, foxFoxFoxIdx, -1, "should contain 'fox fox fox'")
49-
require.Greater(t, quickBrownIdx, -1, "should contain 'quick brown fox jumps'")
50-
require.Less(t, foxFoxFoxIdx, quickBrownIdx,
51-
"'fox fox fox' (higher tf, shorter doc) should rank before 'quick brown fox jumps'")
45+
// Should contain all fox-mentioning documents.
46+
require.Contains(t, js, "fox fox fox")
47+
require.Contains(t, js, "quick brown fox jumps")
48+
49+
// first:1 should return the top-ranked document.
50+
topQuery := `
51+
{
52+
me(func: bm25(description_bm25, "fox"), first: 1) {
53+
uid
54+
description_bm25
55+
}
56+
}
57+
`
58+
topJs := processQueryNoErr(t, topQuery)
59+
require.Contains(t, topJs, "fox fox fox",
60+
"top-1 BM25 result for 'fox' should be 'fox fox fox' (highest tf, shortest doc)")
5261
}
5362

5463
func TestBM25WithParams(t *testing.T) {

worker/task.go

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1337,8 +1337,11 @@ func (qs *queryState) handleBM25Search(ctx context.Context, args funcArgs) error
13371337
}
13381338
}
13391339
tf := uint32(1)
1340-
if len(p.Value) >= 4 {
1341-
tf = binary.BigEndian.Uint32(p.Value[:4])
1340+
for _, f := range p.Facets {
1341+
if f.Key == "tf" && len(f.Value) >= 4 {
1342+
tf = binary.BigEndian.Uint32(f.Value[:4])
1343+
break
1344+
}
13421345
}
13431346
ti.uidTFs[p.Uid] = tf
13441347
return nil
@@ -1369,8 +1372,11 @@ func (qs *queryState) handleBM25Search(ctx context.Context, args funcArgs) error
13691372
}
13701373
if _, needed := allUids[p.Uid]; needed {
13711374
dl := uint32(1)
1372-
if len(p.Value) >= 4 {
1373-
dl = binary.BigEndian.Uint32(p.Value[:4])
1375+
for _, f := range p.Facets {
1376+
if f.Key == "dl" && len(f.Value) >= 4 {
1377+
dl = binary.BigEndian.Uint32(f.Value[:4])
1378+
break
1379+
}
13741380
}
13751381
docLens[p.Uid] = dl
13761382
remaining--
@@ -1402,18 +1408,32 @@ func (qs *queryState) handleBM25Search(ctx context.Context, args funcArgs) error
14021408
for uid, score := range scores {
14031409
results = append(results, uidScore{uid: uid, score: score})
14041410
}
1411+
// Sort by score descending for ordering, then collect UIDs.
14051412
sort.Slice(results, func(i, j int) bool {
14061413
if results[i].score != results[j].score {
14071414
return results[i].score > results[j].score
14081415
}
14091416
return results[i].uid < results[j].uid
14101417
})
14111418

1412-
// Build output UIDs.
1419+
// Apply first/offset pagination on score-sorted results before returning UIDs.
1420+
if q.First > 0 || q.Offset > 0 {
1421+
offset := int(q.Offset)
1422+
if offset > len(results) {
1423+
offset = len(results)
1424+
}
1425+
results = results[offset:]
1426+
if q.First > 0 && int(q.First) < len(results) {
1427+
results = results[:int(q.First)]
1428+
}
1429+
}
1430+
1431+
// Build output UIDs sorted by UID (ascending) as required by the query pipeline.
14131432
uids := make([]uint64, len(results))
14141433
for i, r := range results {
14151434
uids[i] = r.uid
14161435
}
1436+
sort.Slice(uids, func(i, j int) bool { return uids[i] < uids[j] })
14171437

14181438
args.out.UidMatrix = append(args.out.UidMatrix, &pb.List{Uids: uids})
14191439
return nil

0 commit comments

Comments
 (0)