Skip to content

Commit 570681d

Browse files
feat(bulk): add a "skip reduce" flag to the bulk loader (#9618)
**Description** This PR adds a `--skip_reduce_phase` flag to the bulk loader. When supplied, the bulk loader stops after the map phase. Workflow: ```sh # start a dgraph zero (for uid and ts mgmt) dgraph bulk --skip_reduce_phase -f data.rdf.gz -s dql.schema --tmp tmp ``` Move your tmp folder, or not... ```sh dgraph bulk --skip_map_phase --tmp tmp --out out ``` Closes #9615 **Checklist** - [x] The PR title follows the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/#summary) syntax, leading with `fix:`, `feat:`, `chore:`, `ci:`, etc. - [x] Code compiles correctly and linting (via trunk) passes locally - [x] Tests added for new functionality, or regression tests for bug fixes added as applicable - [ ] For public APIs, new features, etc., a PR on the [docs repo](https://github.com/dgraph-io/dgraph-docs) staged and linked here. This process can be simplified by going to the [public docs site](https://docs.dgraph.io/) and clicking the "Edit this page" button at the bottom of page(s) relevant to your changes. Ensure that you indicate in the PR that this is an **unreleased** feature so that it does not get merged into the main docs prematurely.
1 parent 4120211 commit 570681d

4 files changed

Lines changed: 181 additions & 46 deletions

File tree

dgraph/cmd/bulk/loader.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ type BulkOptions struct {
5151
MapBufSize uint64
5252
PartitionBufSize int64
5353
SkipMapPhase bool
54+
SkipReducePhase bool
5455
CleanupTmp bool
5556
NumReducers int
5657
Version bool
@@ -144,7 +145,7 @@ type loader struct {
144145
dg *dgo.Dgraph
145146
}
146147

147-
func newLoader(opt *BulkOptions) *loader {
148+
func newLoader(opt *BulkOptions, precomputedWriteTs uint64) *loader {
148149
if opt == nil {
149150
log.Fatalf("Cannot create loader with nil options.")
150151
}
@@ -185,17 +186,27 @@ func newLoader(opt *BulkOptions) *loader {
185186
fmt.Printf("Error logging enabled, writing to: %s\n", opt.ErrorLogPath)
186187
}
187188

189+
writeTs := precomputedWriteTs
190+
if writeTs == 0 {
191+
writeTs = getWriteTimestamp(zero, dg)
192+
}
188193
st := &state{
189194
opt: opt,
190195
prog: newProgress(),
191196
shards: newShardMap(opt.MapShards),
192197
// Lots of gz readers, so not much channel buffer needed.
193198
readerChunkCh: make(chan *chunkWithMeta, opt.NumGoroutines),
194-
writeTs: getWriteTimestamp(zero, dg),
199+
writeTs: writeTs,
195200
namespaces: &sync.Map{},
196201
errorLog: errLog,
197202
}
198-
st.schema = newSchemaStore(readSchema(opt), opt, st)
203+
var parsedSchema *schema.ParsedSchema
204+
if !opt.SkipMapPhase {
205+
parsedSchema = readSchema(opt)
206+
} else {
207+
parsedSchema = &schema.ParsedSchema{}
208+
}
209+
st.schema = newSchemaStore(parsedSchema, opt, st)
199210
ld := &loader{
200211
state: st,
201212
mappers: make([]*mapper, opt.NumGoroutines),

dgraph/cmd/bulk/run.go

Lines changed: 88 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ func init() {
7878
flag.Int64("partition_mb", 4, "Pick a partition key every N megabytes of data.")
7979
flag.Bool("skip_map_phase", false,
8080
"Skip the map phase (assumes that map output files already exist).")
81+
flag.Bool("skip_reduce_phase", false,
82+
"Skip the reduce phase (stops after map phase completion).")
8183
flag.Bool("cleanup_tmp", true,
8284
"Clean up the tmp directory after the loader finishes. Setting this to false allows the"+
8385
" bulk loader can be re-run while skipping the map phase.")
@@ -150,6 +152,7 @@ func run() {
150152
MapBufSize: uint64(Bulk.Conf.GetInt("mapoutput_mb")),
151153
PartitionBufSize: int64(Bulk.Conf.GetInt("partition_mb")),
152154
SkipMapPhase: Bulk.Conf.GetBool("skip_map_phase"),
155+
SkipReducePhase: Bulk.Conf.GetBool("skip_reduce_phase"),
153156
CleanupTmp: Bulk.Conf.GetBool("cleanup_tmp"),
154157
NumReducers: Bulk.Conf.GetInt("reducers"),
155158
Version: Bulk.Conf.GetBool("version"),
@@ -205,27 +208,29 @@ func RunBulkLoader(opt BulkOptions) {
205208
}
206209
fmt.Printf("Encrypted input: %v; Encrypted output: %v\n", opt.Encrypted, opt.EncryptedOut)
207210

208-
if opt.SchemaFile == "" {
209-
// if only graphql schema is provided, we can generate DQL schema from it.
210-
if opt.GqlSchemaFile == "" {
211-
fmt.Fprint(os.Stderr, "Schema file must be specified.\n")
212-
os.Exit(1)
211+
if !opt.SkipMapPhase {
212+
if opt.SchemaFile == "" {
213+
// if only graphql schema is provided, we can generate DQL schema from it.
214+
if opt.GqlSchemaFile == "" {
215+
fmt.Fprint(os.Stderr, "Schema file must be specified.\n")
216+
os.Exit(1)
217+
}
218+
} else {
219+
if !filestore.Exists(opt.SchemaFile) {
220+
fmt.Fprintf(os.Stderr, "Schema path(%v) does not exist.\n", opt.SchemaFile)
221+
os.Exit(1)
222+
}
213223
}
214-
} else {
215-
if !filestore.Exists(opt.SchemaFile) {
216-
fmt.Fprintf(os.Stderr, "Schema path(%v) does not exist.\n", opt.SchemaFile)
224+
if opt.DataFiles == "" {
225+
fmt.Fprint(os.Stderr, "RDF or JSON file(s) location must be specified.\n")
217226
os.Exit(1)
218-
}
219-
}
220-
if opt.DataFiles == "" {
221-
fmt.Fprint(os.Stderr, "RDF or JSON file(s) location must be specified.\n")
222-
os.Exit(1)
223-
} else {
224-
fileList := strings.SplitSeq(opt.DataFiles, ",")
225-
for file := range fileList {
226-
if !filestore.Exists(file) {
227-
fmt.Fprintf(os.Stderr, "Data path(%v) does not exist.\n", file)
228-
os.Exit(1)
227+
} else {
228+
fileList := strings.SplitSeq(opt.DataFiles, ",")
229+
for file := range fileList {
230+
if !filestore.Exists(file) {
231+
fmt.Fprintf(os.Stderr, "Data path(%v) does not exist.\n", file)
232+
os.Exit(1)
233+
}
229234
}
230235
}
231236
}
@@ -240,6 +245,20 @@ func RunBulkLoader(opt BulkOptions) {
240245
opt.NumReducers, opt.ReduceShards)
241246
os.Exit(1)
242247
}
248+
249+
// Validate skip phase flags
250+
if opt.SkipMapPhase && opt.SkipReducePhase {
251+
fmt.Fprint(os.Stderr, "Cannot skip both map and reduce phases.\n")
252+
os.Exit(1)
253+
}
254+
if opt.SkipReducePhase {
255+
if Bulk.Cmd.Flags().Changed("cleanup_tmp") && opt.CleanupTmp {
256+
fmt.Fprint(os.Stderr, "Cannot use --skip_reduce_phase with --cleanup_tmp=true. "+
257+
"Temp files must be preserved for the later reduce phase.\n")
258+
os.Exit(1)
259+
}
260+
opt.CleanupTmp = false
261+
}
243262
if opt.CustomTokenizers != "" {
244263
for _, soFile := range strings.Split(opt.CustomTokenizers, ",") {
245264
tok.LoadCustomTokenizer(soFile)
@@ -267,25 +286,28 @@ func RunBulkLoader(opt BulkOptions) {
267286

268287
// Make sure it's OK to create or replace the directory specified with the --out option.
269288
// It is always OK to create or replace the default output directory.
270-
if opt.OutDir != defaultOutDir && !opt.ReplaceOutDir {
271-
err := x.IsMissingOrEmptyDir(opt.OutDir)
272-
if err == nil {
273-
fmt.Fprintf(os.Stderr, "Output directory exists and is not empty."+
274-
" Use --replace_out to overwrite it.\n")
275-
os.Exit(1)
276-
} else if err != x.ErrMissingDir {
277-
x.CheckfNoTrace(err)
289+
// Skip output directory validation if we're only doing map phase
290+
if !opt.SkipReducePhase {
291+
if opt.OutDir != defaultOutDir && !opt.ReplaceOutDir {
292+
err := x.IsMissingOrEmptyDir(opt.OutDir)
293+
if err == nil {
294+
fmt.Fprintf(os.Stderr, "Output directory exists and is not empty."+
295+
" Use --replace_out to overwrite it.\n")
296+
os.Exit(1)
297+
} else if err != x.ErrMissingDir {
298+
x.CheckfNoTrace(err)
299+
}
278300
}
279-
}
280301

281-
// Delete and recreate the output dirs to ensure they are empty.
282-
x.Check(os.RemoveAll(opt.OutDir))
283-
for i := range opt.ReduceShards {
284-
dir := filepath.Join(opt.OutDir, strconv.Itoa(i), "p")
285-
x.Check(os.MkdirAll(dir, 0700))
286-
opt.shardOutputDirs = append(opt.shardOutputDirs, dir)
302+
// Delete and recreate the output dirs to ensure they are empty.
303+
x.Check(os.RemoveAll(opt.OutDir))
304+
for i := range opt.ReduceShards {
305+
dir := filepath.Join(opt.OutDir, strconv.Itoa(i), "p")
306+
x.Check(os.MkdirAll(dir, 0700))
307+
opt.shardOutputDirs = append(opt.shardOutputDirs, dir)
287308

288-
x.Check(x.WriteGroupIdFile(dir, uint32(i+1)))
309+
x.Check(x.WriteGroupIdFile(dir, uint32(i+1)))
310+
}
289311
}
290312

291313
// Create a directory just for bulk loader's usage.
@@ -303,10 +325,26 @@ func RunBulkLoader(opt BulkOptions) {
303325
x.Check(os.MkdirAll(bufDir, 0700))
304326
defer os.RemoveAll(bufDir)
305327

306-
loader := newLoader(&opt)
307-
308328
const bulkMetaFilename = "bulk.meta"
309329
bulkMetaPath := filepath.Join(opt.TmpDir, bulkMetaFilename)
330+
const writeTsFilename = "write.ts"
331+
writeTsPath := filepath.Join(opt.TmpDir, writeTsFilename)
332+
333+
var precomputedWriteTs uint64
334+
if opt.SkipMapPhase {
335+
writeTsData, err := os.ReadFile(writeTsPath)
336+
if err != nil {
337+
fmt.Fprintln(os.Stderr, "Error reading write timestamp file; was --skip_reduce_phase used in the map run?")
338+
os.Exit(1)
339+
}
340+
precomputedWriteTs, err = strconv.ParseUint(strings.TrimSpace(string(writeTsData)), 10, 64)
341+
if err != nil {
342+
fmt.Fprintln(os.Stderr, "Error parsing write timestamp file")
343+
os.Exit(1)
344+
}
345+
}
346+
347+
loader := newLoader(&opt, precomputedWriteTs)
310348

311349
if opt.SkipMapPhase {
312350
bulkMetaData, err := os.ReadFile(bulkMetaPath)
@@ -343,7 +381,20 @@ func RunBulkLoader(opt BulkOptions) {
343381
fmt.Fprintln(os.Stderr, "Error writing to bulk meta file")
344382
os.Exit(1)
345383
}
384+
if err = os.WriteFile(writeTsPath, []byte(strconv.FormatUint(loader.writeTs, 10)), 0600); err != nil {
385+
fmt.Fprintln(os.Stderr, "Error writing write timestamp file")
386+
os.Exit(1)
387+
}
346388
}
389+
390+
if opt.SkipReducePhase {
391+
fmt.Println("Skipping reduce phase. Map phase completed successfully.")
392+
fmt.Println("Temp files preserved for later reduce phase processing.")
393+
// Don't call cleanup() to preserve temp files
394+
loader.prog.endSummary()
395+
return
396+
}
397+
347398
loader.reduceStage()
348399
loader.writeSchema()
349400
loader.cleanup()

dgraphtest/load.go

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -475,12 +475,15 @@ func (c *LocalCluster) LiveLoadFromExport(exportDir string) error {
475475
}
476476

477477
type BulkOpts struct {
478-
DataFiles []string
479-
SchemaFiles []string
480-
GQLSchemaFiles []string
481-
OutDir string
482-
MapShards int // Number of map shards (0 = auto based on numAlphas/replicas)
483-
ReduceShards int // Number of reduce shards (0 = auto based on numAlphas/replicas)
478+
DataFiles []string
479+
SchemaFiles []string
480+
GQLSchemaFiles []string
481+
OutDir string
482+
MapShards int // Number of map shards (0 = auto based on numAlphas/replicas)
483+
ReduceShards int // Number of reduce shards (0 = auto based on numAlphas/replicas)
484+
SkipReducePhase bool // Stop after map phase; preserve tmp dir for later reduce
485+
SkipMapPhase bool // Skip map phase; assumes map output files already exist
486+
TmpDir string // Custom tmp directory (required when splitting map/reduce runs)
484487
}
485488

486489
func (c *LocalCluster) BulkLoad(opts BulkOpts) error {
@@ -518,6 +521,16 @@ func (c *LocalCluster) BulkLoad(opts BulkOpts) error {
518521
"--http", ":0",
519522
}
520523

524+
if opts.TmpDir != "" {
525+
args = append(args, "--tmp", opts.TmpDir)
526+
}
527+
if opts.SkipReducePhase {
528+
args = append(args, "--skip_reduce_phase")
529+
}
530+
if opts.SkipMapPhase {
531+
args = append(args, "--skip_map_phase")
532+
}
533+
521534
if len(opts.DataFiles) > 0 {
522535
args = append(args, "-f", strings.Join(opts.DataFiles, ","))
523536
}

systest/integration2/bulk_loader_test.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,66 @@ const (
6262
]`
6363
)
6464

65+
func TestBulkLoaderSkipReducePhase(t *testing.T) {
66+
tmpDir := t.TempDir()
67+
68+
conf := dgraphtest.NewClusterConfig().WithNumAlphas(1).WithNumZeros(1).
69+
WithACL(time.Hour).WithReplicas(1).WithBulkLoadOutDir(t.TempDir())
70+
c, err := dgraphtest.NewLocalCluster(conf)
71+
require.NoError(t, err)
72+
defer func() { c.Cleanup(t.Failed()) }()
73+
74+
require.NoError(t, c.StartZero(0))
75+
require.NoError(t, c.HealthCheck(true))
76+
77+
baseDir := t.TempDir()
78+
dataFile := filepath.Join(baseDir, "data.json")
79+
require.NoError(t, os.WriteFile(dataFile, []byte(jsonData), os.ModePerm))
80+
gqlSchemaFile := filepath.Join(baseDir, "gql.schema")
81+
require.NoError(t, os.WriteFile(gqlSchemaFile, []byte(gqlSchema), os.ModePerm))
82+
83+
// First run: map phase only, preserve tmp dir
84+
mapOpts := dgraphtest.BulkOpts{
85+
DataFiles: []string{dataFile},
86+
GQLSchemaFiles: []string{gqlSchemaFile},
87+
TmpDir: tmpDir,
88+
SkipReducePhase: true,
89+
}
90+
require.NoError(t, c.BulkLoad(mapOpts))
91+
92+
// Second run: reduce phase only, using the same tmp dir.
93+
// Data and schema files are not needed; all input was processed in the map phase.
94+
reduceOpts := dgraphtest.BulkOpts{
95+
TmpDir: tmpDir,
96+
SkipMapPhase: true,
97+
}
98+
require.NoError(t, c.BulkLoad(reduceOpts))
99+
100+
require.NoError(t, c.Start())
101+
102+
hc, err := c.HTTPClient()
103+
require.NoError(t, err)
104+
require.NoError(t, hc.LoginIntoNamespace(dgraphapi.DefaultUser,
105+
dgraphapi.DefaultPassword, x.RootNamespace))
106+
107+
params := dgraphapi.GraphQLParams{
108+
Query: `query {
109+
getMessage(uniqueId: 3) {
110+
content
111+
author
112+
}
113+
}`,
114+
}
115+
data, err := hc.RunGraphqlQuery(params, false)
116+
require.NoError(t, err)
117+
require.NoError(t, dgraphapi.CompareJSON(`{
118+
"getMessage": {
119+
"content": "DVTCTXCVYI",
120+
"author": "USYMVFJYXA"
121+
}
122+
}`, string(data)))
123+
}
124+
65125
func TestBulkLoaderNoDqlSchema(t *testing.T) {
66126
conf := dgraphtest.NewClusterConfig().WithNumAlphas(2).WithNumZeros(1).
67127
WithACL(time.Hour).WithReplicas(1).WithBulkLoadOutDir(t.TempDir())

0 commit comments

Comments
 (0)