From c24e0dffe9b9ae7857079649726674b0fc6be116 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Sun, 3 May 2026 15:45:32 +0000 Subject: [PATCH 1/8] sync: add remote-SHA filter as a third layer of the upload pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sync runs in three layers: 1. Discovery (libs/git, libs/fileset): walk the local tree. 2. Snapshot diff (libs/sync/snapshot.go, diff.go): compare against the local mtime snapshot to produce an action plan (puts/deletes/etc). 3. Remote filter (new, libs/sync/remote_filter.go): pre-flight that bulk-fetches content SHAs from the workspace and drops puts whose remote SHA already matches the local SHA. Layer 3 only runs when the snapshot is fresh (no prior local state) — the case where Layer 2 produces false-positive puts at scale, e.g. on a CI runner that has just cloned the repo. With an existing snapshot, Layer 2 is precise; paying for a bulk remote list would be wasted work. The remote SHA list uses /api/2.0/workspace/list-repo with the return_wsfs_metadata=true flag, exposed via a new RemoteFileMetadata type and ListWithSHAs method on WorkspaceFilesClient. Errors and missing remote state degrade gracefully: the filter returns the unmodified diff and the worst case is the existing behavior (re-upload). Verified end-to-end against bundle-dev: a sync that deletes the local snapshot and re-runs produces zero uploads when contents are unchanged, and uploads only the edited file when one file changes. Notebooks (.py with the magic header and .ipynb) preserve raw uploaded bytes server-side, so their SHAs match local SHAs verbatim — no notebook-specific carve-out needed. Snapshot schema is unchanged; this is purely additive. Co-authored-by: Isaac --- libs/filer/workspace_files_client.go | 66 +++++++++++ libs/fileset/file.go | 6 + libs/sync/remote_filter.go | 133 +++++++++++++++++++++ libs/sync/remote_filter_test.go | 169 +++++++++++++++++++++++++++ libs/sync/sync.go | 46 +++++++- 5 files changed, 416 insertions(+), 4 deletions(-) create mode 100644 libs/sync/remote_filter.go create mode 100644 libs/sync/remote_filter_test.go diff --git a/libs/filer/workspace_files_client.go b/libs/filer/workspace_files_client.go index c6c62816bbc..99b41a02a47 100644 --- a/libs/filer/workspace_files_client.go +++ b/libs/filer/workspace_files_client.go @@ -338,6 +338,72 @@ func (w *WorkspaceFilesClient) Mkdir(ctx context.Context, name string) error { }) } +// RemoteFileMetadata describes a single workspace object returned by the +// list-repo API with return_wsfs_metadata=true. +type RemoteFileMetadata struct { + // Absolute workspace path. Note that for notebooks, the extension is + // stripped (e.g. /Workspace/foo/bar.py -> /Workspace/foo/bar). + Path string + + // SHA-256 hex digest of the blob. Populated only for FILE and NOTEBOOK + // objects (not directories) when the workspace returns wsfs metadata. + ContentSHA256Hex string + + // "FILE", "NOTEBOOK", or "DIRECTORY". + ObjectType string +} + +// ListWithSHAs recursively lists all workspace objects under the given path +// and returns their content SHAs from the workspace's wsfs metadata. This uses +// /api/2.0/workspace/list-repo with the (currently undocumented in the SDK) +// return_wsfs_metadata=true flag, which causes the response to include a +// content_sha256_hex field for files and notebooks. +// +// Returns nil if the path does not exist; callers should treat that as "no +// remote state to merge" rather than an error. 
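+//
+// The decoded payload resembles the following sketch (field set taken from
+// the response struct decoded below; values are placeholders, not a verbatim
+// server response):
+//
+//	{"objects": [
+//	  {"object_type": "FILE", "path": "/Workspace/.../foo.txt",
+//	   "content_sha256_hex": "<64 hex chars>", "has_wsfs_metadata": true},
+//	  {"object_type": "DIRECTORY", "path": "/Workspace/.../dir"}
+//	]}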
+func (w *WorkspaceFilesClient) ListWithSHAs(ctx context.Context, dirPath string) ([]RemoteFileMetadata, error) { + type listObject struct { + ObjectType string `json:"object_type"` + Path string `json:"path"` + ContentSHA256Hex string `json:"content_sha256_hex"` + HasWsfsMetadata bool `json:"has_wsfs_metadata"` + } + type listResponse struct { + Objects []listObject `json:"objects"` + } + + var resp listResponse + err := w.apiClient.Do( + ctx, + http.MethodGet, + "/api/2.0/workspace/list-repo", + w.orgIDHeaders(), + nil, + map[string]any{ + "path": dirPath, + "return_wsfs_metadata": true, + }, + &resp, + ) + if err != nil { + var aerr *apierr.APIError + if errors.As(err, &aerr) && aerr.StatusCode == http.StatusNotFound { + return nil, nil + } + return nil, err + } + + out := make([]RemoteFileMetadata, 0, len(resp.Objects)) + for _, o := range resp.Objects { + out = append(out, RemoteFileMetadata{ + Path: o.Path, + ContentSHA256Hex: o.ContentSHA256Hex, + ObjectType: o.ObjectType, + }) + } + return out, nil +} + func (w *WorkspaceFilesClient) Stat(ctx context.Context, name string) (fs.FileInfo, error) { absPath, err := w.root.Join(name) if err != nil { diff --git a/libs/fileset/file.go b/libs/fileset/file.go index 0a27b294cf7..0d0f6440ab0 100644 --- a/libs/fileset/file.go +++ b/libs/fileset/file.go @@ -67,6 +67,12 @@ func (f File) Modified() (ts time.Time) { return info.ModTime() } +// Open returns a reader for the file contents. The caller is responsible +// for closing it. +func (f File) Open() (fs.File, error) { + return f.root.Open(f.Relative) +} + func (f *File) IsNotebook() (bool, error) { if f.fileType != Unknown { return f.fileType == Notebook, nil diff --git a/libs/sync/remote_filter.go b/libs/sync/remote_filter.go new file mode 100644 index 00000000000..dcf0427c2c8 --- /dev/null +++ b/libs/sync/remote_filter.go @@ -0,0 +1,133 @@ +package sync + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "io" + "path" + + "github.com/databricks/cli/libs/filer" + "github.com/databricks/cli/libs/fileset" + "github.com/databricks/cli/libs/log" +) + +// shaLister fetches content SHAs for a workspace directory in one bulk call. +// The interface lets tests stub the API call without spinning up a fake +// filer. +type shaLister interface { + ListWithSHAs(ctx context.Context, dirPath string) ([]filer.RemoteFileMetadata, error) +} + +// RemoteFilter is the third layer of the sync pipeline. It takes the action +// plan produced by the snapshot diff and drops puts whose remote SHA already +// matches the local SHA — files the workspace already has, byte-for-byte. +// +// The expensive work (one bulk list, one SHA per skipped put candidate) only +// pays off when the snapshot diff has produced false-positive puts at scale. +// The caller decides when to invoke Apply; today that's only on a fresh +// snapshot (no prior local state). +type RemoteFilter struct { + lister shaLister + remotePath string +} + +func NewRemoteFilter(lister shaLister, remotePath string) *RemoteFilter { + return &RemoteFilter{lister: lister, remotePath: remotePath} +} + +// Apply returns a copy of d with put entries removed for files whose local +// SHA already matches the remote SHA. Errors fetching or computing SHAs are +// logged and treated as "do not skip" — the worst case is an unnecessary +// upload, which is the existing behavior. 
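+//
+// A worked example (hypothetical paths; remotePath = "/Workspace/u/proj"):
+//
+//	d.put = ["a.py", "b.py"]
+//	remote SHA of /Workspace/u/proj/a.py == local SHA of a.py -> drop a.py
+//	remote SHA of /Workspace/u/proj/b.py != local SHA of b.py -> keep b.py
+//	returned put = ["b.py"]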
+func (f *RemoteFilter) Apply(ctx context.Context, d diff, files []fileset.File, localToRemote map[string]string) diff { + if len(d.put) == 0 || f == nil || f.lister == nil { + return d + } + + remote, err := f.lister.ListWithSHAs(ctx, f.remotePath) + if err != nil { + log.Warnf(ctx, "could not fetch remote content SHAs from %s; uploading all candidate files: %s", f.remotePath, err) + return d + } + if len(remote) == 0 { + return d + } + + remoteSHAByPath := make(map[string]string, len(remote)) + for _, e := range remote { + if e.ContentSHA256Hex == "" { + continue + } + remoteSHAByPath[e.Path] = e.ContentSHA256Hex + } + + localByRelative := make(map[string]*fileset.File, len(files)) + for i := range files { + localByRelative[files[i].Relative] = &files[i] + } + + keep := make([]string, 0, len(d.put)) + skipped := 0 + for _, p := range d.put { + if !f.canSkip(ctx, p, localByRelative, localToRemote, remoteSHAByPath) { + keep = append(keep, p) + continue + } + skipped++ + } + + if skipped > 0 { + log.Debugf(ctx, "remote-filter: skipped %d/%d uploads matching workspace SHAs", skipped, len(d.put)) + } + + return diff{ + delete: d.delete, + rmdir: d.rmdir, + mkdir: d.mkdir, + put: keep, + } +} + +// canSkip reports whether the put for relativePath can be safely dropped: +// the workspace already has a file at the corresponding remote path with the +// same SHA-256 as the local file. +func (f *RemoteFilter) canSkip( + ctx context.Context, + relativePath string, + localByRelative map[string]*fileset.File, + localToRemote map[string]string, + remoteSHAByPath map[string]string, +) bool { + local, ok := localByRelative[relativePath] + if !ok { + return false + } + remoteName, ok := localToRemote[relativePath] + if !ok { + return false + } + remoteSHA, ok := remoteSHAByPath[path.Join(f.remotePath, remoteName)] + if !ok { + return false + } + localSHA, err := computeFileSHA(local) + if err != nil { + log.Debugf(ctx, "remote-filter: hashing %s failed; will upload: %s", relativePath, err) + return false + } + return localSHA == remoteSHA +} + +func computeFileSHA(f *fileset.File) (string, error) { + rc, err := f.Open() + if err != nil { + return "", err + } + defer rc.Close() + h := sha256.New() + if _, err := io.Copy(h, rc); err != nil { + return "", err + } + return hex.EncodeToString(h.Sum(nil)), nil +} diff --git a/libs/sync/remote_filter_test.go b/libs/sync/remote_filter_test.go new file mode 100644 index 00000000000..33e74aecf5b --- /dev/null +++ b/libs/sync/remote_filter_test.go @@ -0,0 +1,169 @@ +package sync + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "errors" + "path" + "testing" + + "github.com/databricks/cli/libs/filer" + "github.com/databricks/cli/libs/fileset" + "github.com/databricks/cli/libs/vfs" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type stubLister struct { + out []filer.RemoteFileMetadata + err error +} + +func (s *stubLister) ListWithSHAs(ctx context.Context, dirPath string) ([]filer.RemoteFileMetadata, error) { + return s.out, s.err +} + +func sha256OfFile(t *testing.T, root vfs.Path, relative string) string { + t.Helper() + r, err := root.Open(relative) + require.NoError(t, err) + defer r.Close() + h := sha256.New() + _, err = h.Write(mustReadAll(t, r)) + require.NoError(t, err) + return hex.EncodeToString(h.Sum(nil)) +} + +func mustReadAll(t *testing.T, r interface{ Read(p []byte) (int, error) }) []byte { + t.Helper() + buf := make([]byte, 0, 1024) + tmp := make([]byte, 256) + for { + n, err := r.Read(tmp) + if n > 0 { 
+ buf = append(buf, tmp[:n]...) + } + if err != nil { + break + } + } + return buf +} + +const remoteRoot = "/Workspace/Users/foo@databricks.com/proj" + +// loadTestFiles returns the test fileset and the mtime-based SnapshotState +// (which provides LocalToRemoteNames). All tests share the same on-disk +// fileset under testdata/sync-fileset. +func loadTestFiles(t *testing.T) ([]fileset.File, map[string]string) { + t.Helper() + fs := fileset.New(vfs.MustNew("./testdata/sync-fileset")) + files, err := fs.Files() + require.NoError(t, err) + state, err := NewSnapshotState(files) + require.NoError(t, err) + return files, state.LocalToRemoteNames +} + +func TestRemoteFilterReturnsUnchangedWhenNoPuts(t *testing.T) { + rf := NewRemoteFilter(&stubLister{}, remoteRoot) + d := diff{delete: []string{"old"}, rmdir: []string{"olddir"}} + got := rf.Apply(t.Context(), d, nil, nil) + assert.Equal(t, d.delete, got.delete) + assert.Equal(t, d.rmdir, got.rmdir) + assert.Empty(t, got.put) +} + +func TestRemoteFilterReturnsUnchangedWhenListErrors(t *testing.T) { + files, ltr := loadTestFiles(t) + rf := NewRemoteFilter(&stubLister{err: errors.New("boom")}, remoteRoot) + in := diff{put: []string{"my-script.py"}} + got := rf.Apply(t.Context(), in, files, ltr) + assert.Equal(t, []string{"my-script.py"}, got.put) +} + +func TestRemoteFilterReturnsUnchangedWhenRemoteEmpty(t *testing.T) { + files, ltr := loadTestFiles(t) + rf := NewRemoteFilter(&stubLister{out: nil}, remoteRoot) + in := diff{put: []string{"my-script.py"}} + got := rf.Apply(t.Context(), in, files, ltr) + assert.Equal(t, []string{"my-script.py"}, got.put) +} + +func TestRemoteFilterDropsPutWhenSHAMatches(t *testing.T) { + files, ltr := loadTestFiles(t) + root := vfs.MustNew("./testdata/sync-fileset") + wantSHA := sha256OfFile(t, root, "my-script.py") + + rf := NewRemoteFilter(&stubLister{out: []filer.RemoteFileMetadata{ + {Path: path.Join(remoteRoot, "my-script.py"), ContentSHA256Hex: wantSHA, ObjectType: "FILE"}, + }}, remoteRoot) + + in := diff{put: []string{"my-script.py"}} + got := rf.Apply(t.Context(), in, files, ltr) + assert.Empty(t, got.put) +} + +func TestRemoteFilterKeepsPutWhenSHAMismatches(t *testing.T) { + files, ltr := loadTestFiles(t) + rf := NewRemoteFilter(&stubLister{out: []filer.RemoteFileMetadata{ + {Path: path.Join(remoteRoot, "my-script.py"), ContentSHA256Hex: "deadbeef", ObjectType: "FILE"}, + }}, remoteRoot) + + in := diff{put: []string{"my-script.py"}} + got := rf.Apply(t.Context(), in, files, ltr) + assert.Equal(t, []string{"my-script.py"}, got.put) +} + +func TestRemoteFilterKeepsPutWhenRemoteMissing(t *testing.T) { + files, ltr := loadTestFiles(t) + rf := NewRemoteFilter(&stubLister{out: []filer.RemoteFileMetadata{ + {Path: path.Join(remoteRoot, "other-file"), ContentSHA256Hex: "x"}, + }}, remoteRoot) + + in := diff{put: []string{"my-script.py"}} + got := rf.Apply(t.Context(), in, files, ltr) + assert.Equal(t, []string{"my-script.py"}, got.put) +} + +func TestRemoteFilterDropsPutForNotebookWhenSHAMatches(t *testing.T) { + // Notebooks are stored without their .py extension on the workspace — + // LocalToRemoteNames maps "my-nb.py" -> "my-nb". The filter must use the + // remote name when looking up the SHA. 
+	files, ltr := loadTestFiles(t)
+	root := vfs.MustNew("./testdata/sync-fileset")
+	wantSHA := sha256OfFile(t, root, "my-nb.py")
+	require.Equal(t, "my-nb", ltr["my-nb.py"])
+
+	rf := NewRemoteFilter(&stubLister{out: []filer.RemoteFileMetadata{
+		{Path: path.Join(remoteRoot, "my-nb"), ContentSHA256Hex: wantSHA, ObjectType: "NOTEBOOK"},
+	}}, remoteRoot)
+
+	in := diff{put: []string{"my-nb.py"}}
+	got := rf.Apply(t.Context(), in, files, ltr)
+	assert.Empty(t, got.put)
+}
+
+func TestRemoteFilterMixedDiff(t *testing.T) {
+	files, ltr := loadTestFiles(t)
+	root := vfs.MustNew("./testdata/sync-fileset")
+	scriptSHA := sha256OfFile(t, root, "my-script.py")
+	validNbSHA := sha256OfFile(t, root, "valid-nb.ipynb")
+
+	rf := NewRemoteFilter(&stubLister{out: []filer.RemoteFileMetadata{
+		{Path: path.Join(remoteRoot, "my-script.py"), ContentSHA256Hex: scriptSHA, ObjectType: "FILE"},
+		{Path: path.Join(remoteRoot, "my-nb"), ContentSHA256Hex: "stale", ObjectType: "NOTEBOOK"},
+		{Path: path.Join(remoteRoot, "valid-nb"), ContentSHA256Hex: validNbSHA, ObjectType: "NOTEBOOK"},
+	}}, remoteRoot)
+
+	in := diff{
+		put:    []string{"my-script.py", "my-nb.py", "valid-nb.ipynb"},
+		delete: []string{"gone"},
+		mkdir:  []string{"new"},
+	}
+	got := rf.Apply(t.Context(), in, files, ltr)
+	assert.ElementsMatch(t, []string{"my-nb.py"}, got.put, "only the SHA-mismatched notebook should be re-uploaded")
+	assert.Equal(t, []string{"gone"}, got.delete)
+	assert.Equal(t, []string{"new"}, got.mkdir)
+}
diff --git a/libs/sync/sync.go b/libs/sync/sync.go
index c65b49eb775..9a6a7e52259 100644
--- a/libs/sync/sync.go
+++ b/libs/sync/sync.go
@@ -45,6 +45,24 @@ type SyncOptions struct {
 	DryRun bool
 }
 
+// Sync runs file synchronization in three layers:
+//
+//  1. Discovery (libs/git, libs/fileset): walks the local tree and produces a
+//     list of files to consider, honoring include/exclude rules.
+//
+//  2. Snapshot diff (libs/sync/snapshot.go, libs/sync/diff.go): compares the
+//     discovered files against a local snapshot of mtimes from the previous
+//     run and produces a diff (puts, deletes, mkdirs, rmdirs) — the action
+//     plan for this iteration. If no snapshot exists, every file becomes a
+//     put.
+//
+//  3. Remote filter (libs/sync/remote_filter.go): an optional pre-flight that
+//     fetches content SHAs from the workspace and drops puts whose remote SHA
+//     already matches the local SHA. We only run it when the snapshot is
+//     fresh (no prior state), which is the only case where Layer 2 produces
+//     false-positive puts at scale (e.g. on a CI runner). When a local
+//     snapshot exists, Layer 2 is already accurate enough; paying for a
+//     bulk remote list would be wasted work.
 type Sync struct {
 	*SyncOptions
 
@@ -52,8 +70,9 @@ type Sync struct {
 	includeFileSet *fileset.FileSet
 	excludeFileSet *fileset.FileSet
 
-	snapshot *Snapshot
-	filer    filer.Filer
+	snapshot     *Snapshot
+	filer        filer.Filer
+	remoteFilter *RemoteFilter
 
 	// Synchronization progress events are sent to this event notifier.
 	notifier EventNotifier
@@ -111,11 +130,19 @@ func New(ctx context.Context, opts SyncOptions) (*Sync, error) {
 		}
 	}
 
-	filer, err := filer.NewWorkspaceFilesClient(opts.WorkspaceClient, opts.RemotePath)
+	filerImpl, err := filer.NewWorkspaceFilesClient(opts.WorkspaceClient, opts.RemotePath)
 	if err != nil {
 		return nil, err
 	}
 
+	// The remote SHA list call is not part of the Filer interface (it's a
+	// sync-only optimization, not a general filesystem op), so we type-assert
+	// the concrete client. Tests exercise the filter directly, constructing
+	// it with NewRemoteFilter and a stub shaLister.
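+	// If the assertion fails (some other Filer implementation), remoteFilter
+	// stays nil and RunOnce simply skips Layer 3.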
+ var remoteFilter *RemoteFilter + if wfc, ok := filerImpl.(*filer.WorkspaceFilesClient); ok { + remoteFilter = NewRemoteFilter(wfc, opts.RemotePath) + } + var notifier EventNotifier outputWaitGroup := &stdsync.WaitGroup{} if opts.OutputHandler != nil { @@ -135,7 +162,8 @@ func New(ctx context.Context, opts SyncOptions) (*Sync, error) { includeFileSet: includeFileSet, excludeFileSet: excludeFileSet, snapshot: snapshot, - filer: filer, + filer: filerImpl, + remoteFilter: remoteFilter, notifier: notifier, outputWaitGroup: outputWaitGroup, seq: 0, @@ -178,16 +206,26 @@ func (s *Sync) notifyComplete(ctx context.Context, d diff) { // Returns the list of files tracked (and synchronized) by the syncer during the run, // and an error if any occurred. func (s *Sync) RunOnce(ctx context.Context) ([]fileset.File, error) { + // Layer 1: discovery. files, err := s.GetFileList(ctx) if err != nil { return files, err } + // Layer 2: snapshot-driven action plan. change, err := s.snapshot.diff(ctx, files) if err != nil { return files, err } + // Layer 3: remote-state filter, only when the snapshot is fresh. + // With an existing snapshot, Layer 2 is precise; with no snapshot, every + // file is a put — so we ask the workspace what's already there and drop + // puts whose contents already match. + if s.snapshot.New && s.remoteFilter != nil { + change = s.remoteFilter.Apply(ctx, change, files, s.snapshot.LocalToRemoteNames) + } + s.notifyStart(ctx, change) if change.IsEmpty() { s.notifyComplete(ctx, change) From 087152b2758393089ccd7e221e99e79593449eb7 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Mon, 4 May 2026 08:57:58 +0000 Subject: [PATCH 2/8] testserver: handle /workspace/list-repo so Layer 3 doesn't error in acceptance Acceptance tests were failing on the direct engine path because Layer 3 of sync (the remote-SHA filter) calls /api/2.0/workspace/list-repo, but the testserver had no default handler. The CLI's filter logged a Warn line ("could not fetch remote content SHAs ... No stub found") which contaminated recorded output.txt diffs. Add WorkspaceListRepo to the FakeWorkspace: walks the in-memory files/directories maps, computes SHA-256 over each file's stored bytes, and returns objects in the same shape as the real list-repo response (path, object_type, content_sha256_hex, has_wsfs_metadata, size, language). On a workspace with no imported files, returns {"objects": []}, which causes Layer 3 to no-op cleanly. Co-authored-by: Isaac --- libs/testserver/fake_workspace.go | 52 +++++++++++++++++++++++++++++++ libs/testserver/handlers.go | 5 +++ 2 files changed, 57 insertions(+) diff --git a/libs/testserver/fake_workspace.go b/libs/testserver/fake_workspace.go index 5430c68cbcc..36596b58252 100644 --- a/libs/testserver/fake_workspace.go +++ b/libs/testserver/fake_workspace.go @@ -2,7 +2,9 @@ package testserver import ( "bytes" + "crypto/sha256" "encoding/binary" + "encoding/hex" "encoding/json" "fmt" "os" @@ -454,6 +456,56 @@ func (s *FakeWorkspace) WorkspaceFilesExportFile(path string) []byte { return s.files[path].Data } +// WorkspaceListRepo returns the recursive listing under root used by the +// /api/2.0/workspace/list-repo endpoint. The CLI calls this with +// return_wsfs_metadata=true, expecting a content_sha256_hex per file/notebook. 
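+//
+// For a fake workspace containing /root/a.py, the returned value marshals to
+// roughly (illustrative; shapes follow the listObject struct below):
+//
+//	{"objects": [
+//	  {"object_type": "DIRECTORY", "path": "/root"},
+//	  {"object_type": "FILE", "path": "/root/a.py",
+//	   "content_sha256_hex": "<sha256 of stored bytes>",
+//	   "has_wsfs_metadata": true, "size": 12}
+//	]}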
+func (s *FakeWorkspace) WorkspaceListRepo(root string) any { + if !strings.HasPrefix(root, "/") { + root = "/" + root + } + + defer s.LockUnlock()() + + type listObject struct { + ObjectType string `json:"object_type"` + Path string `json:"path"` + ContentSHA256Hex string `json:"content_sha256_hex,omitempty"` + HasWsfsMetadata bool `json:"has_wsfs_metadata,omitempty"` + Size int `json:"size,omitempty"` + Language string `json:"language,omitempty"` + } + + var objects []listObject + if _, ok := s.directories[root]; ok { + objects = append(objects, listObject{ObjectType: "DIRECTORY", Path: root}) + } + for p, dir := range s.directories { + if p == root || !strings.HasPrefix(p, root+"/") { + continue + } + objects = append(objects, listObject{ObjectType: "DIRECTORY", Path: dir.Path}) + } + for p, fe := range s.files { + if !strings.HasPrefix(p, root+"/") { + continue + } + sum := sha256.Sum256(fe.Data) + obj := listObject{ + ObjectType: string(fe.Info.ObjectType), + Path: fe.Info.Path, + ContentSHA256Hex: hex.EncodeToString(sum[:]), + HasWsfsMetadata: true, + Size: len(fe.Data), + } + if fe.Info.Language != "" { + obj.Language = string(fe.Info.Language) + } + objects = append(objects, obj) + } + + return map[string]any{"objects": objects} +} + // FileExists checks if a file exists at the given path. func (s *FakeWorkspace) FileExists(path string) bool { if !strings.HasPrefix(path, "/") { diff --git a/libs/testserver/handlers.go b/libs/testserver/handlers.go index 8bd53391841..1ebab18051f 100644 --- a/libs/testserver/handlers.go +++ b/libs/testserver/handlers.go @@ -79,6 +79,11 @@ func AddDefaultHandlers(server *Server) { return req.Workspace.WorkspaceGetStatus(path) }) + server.Handle("GET", "/api/2.0/workspace/list-repo", func(req Request) any { + path := req.URL.Query().Get("path") + return req.Workspace.WorkspaceListRepo(path) + }) + server.Handle("POST", "/api/2.0/workspace/mkdirs", func(req Request) any { var request workspace.Mkdirs if err := json.Unmarshal(req.Body, &request); err != nil { From f2597a0f8e5e164cf2640f20f3344a9d51bb09a4 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Mon, 4 May 2026 09:16:29 +0000 Subject: [PATCH 3/8] acceptance: record list-repo call from Layer 3 in user_agent fixtures Layer 3 of sync issues a GET /api/2.0/workspace/list-repo with return_wsfs_metadata=true on cold-snapshot deploys (which is what acceptance tests do). Add the new request to the recorded fixtures for both engines so the diff comparison passes. The User-Agent on the new call inherits cmd/bundle_deploy and engine/ as expected. 
Co-authored-by: Isaac --- .../simple/out.requests.deploy.direct.json | 13 +++++++++++++ .../simple/out.requests.deploy.terraform.json | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/acceptance/bundle/user_agent/simple/out.requests.deploy.direct.json b/acceptance/bundle/user_agent/simple/out.requests.deploy.direct.json index cc39aad6e9b..a84d67497c6 100644 --- a/acceptance/bundle/user_agent/simple/out.requests.deploy.direct.json +++ b/acceptance/bundle/user_agent/simple/out.requests.deploy.direct.json @@ -123,6 +123,19 @@ "return_export_info": "true" } } +{ + "headers": { + "User-Agent": [ + "cli/[DEV_VERSION] databricks-sdk-go/[SDK_VERSION] go/[GO_VERSION] os/[OS] cmd/bundle_deploy cmd-exec-id/[UUID] interactive/none engine/direct auth/pat" + ] + }, + "method": "GET", + "path": "/api/2.0/workspace/list-repo", + "q": { + "path": "/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files", + "return_wsfs_metadata": "true" + } +} { "headers": { "User-Agent": [ diff --git a/acceptance/bundle/user_agent/simple/out.requests.deploy.terraform.json b/acceptance/bundle/user_agent/simple/out.requests.deploy.terraform.json index 435b188af3b..9661fc629d1 100644 --- a/acceptance/bundle/user_agent/simple/out.requests.deploy.terraform.json +++ b/acceptance/bundle/user_agent/simple/out.requests.deploy.terraform.json @@ -123,6 +123,19 @@ "return_export_info": "true" } } +{ + "headers": { + "User-Agent": [ + "cli/[DEV_VERSION] databricks-sdk-go/[SDK_VERSION] go/[GO_VERSION] os/[OS] cmd/bundle_deploy cmd-exec-id/[UUID] interactive/none engine/terraform auth/pat" + ] + }, + "method": "GET", + "path": "/api/2.0/workspace/list-repo", + "q": { + "path": "/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files", + "return_wsfs_metadata": "true" + } +} { "headers": { "User-Agent": [ From a72cd728245ec5c37cc526438acbbead5208c455 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Mon, 4 May 2026 09:29:49 +0000 Subject: [PATCH 4/8] acceptance: add list-repo entries to bundle/user_agent output expectations Companion to the previous fixture updates. The parent user_agent test aggregates User-Agent observations from per-engine recorded request logs and writes a shared output.txt; that needs the same list-repo entries (one per engine). 
Co-authored-by: Isaac --- acceptance/bundle/user_agent/output.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/acceptance/bundle/user_agent/output.txt b/acceptance/bundle/user_agent/output.txt index bf128624271..5cda322d3d8 100644 --- a/acceptance/bundle/user_agent/output.txt +++ b/acceptance/bundle/user_agent/output.txt @@ -8,6 +8,7 @@ OK deploy.direct /api/2.0/workspace/get-status engine/direct OK deploy.direct /api/2.0/workspace/get-status engine/direct OK deploy.direct /api/2.0/workspace/get-status engine/direct OK deploy.direct /api/2.0/workspace/get-status engine/direct +OK deploy.direct /api/2.0/workspace/list-repo engine/direct OK deploy.direct /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/empty.py engine/direct OK deploy.direct /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/state/deploy.lock engine/direct OK deploy.direct /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/state/deployment.json engine/direct @@ -28,6 +29,7 @@ OK deploy.terraform /api/2.0/workspace/get-status engine/terraform OK deploy.terraform /api/2.0/workspace/get-status engine/terraform OK deploy.terraform /api/2.0/workspace/get-status engine/terraform OK deploy.terraform /api/2.0/workspace/get-status engine/terraform +OK deploy.terraform /api/2.0/workspace/list-repo engine/terraform OK deploy.terraform /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/empty.py engine/terraform OK deploy.terraform /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/state/deploy.lock engine/terraform OK deploy.terraform /api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/state/deployment.json engine/terraform From 0083c16a48c6584033c5f896dfed96e52f2a743a Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Mon, 4 May 2026 13:58:34 +0000 Subject: [PATCH 5/8] sync: add reusable workspace benchmarks for the list-repo + Layer 3 paths Behind a benchworkspace build tag so they don't run in CI. Three groups: - BenchmarkListRepoByCount: list-repo cost vs N (10/100/500/1000). - BenchmarkListRepoByContent: list-repo cost across plain files / notebooks (py/sql/ipynb) / dashboards / mixed at fixed N=200. - BenchmarkSyncRunOnceColdSnapshot: end-to-end Sync.RunOnce against a pre-warmed remote, with and without Layer 3, at N=20/100/500. The "without" variant zeroes out s.remoteFilter to bypass the new path. Each run creates and tears down a unique /Users/$USER/.tmp/sync-bench-X tree on the configured workspace. Run instructions are in the file's top comment. Required env: DATABRICKS_BENCH_PROFILE= DATABRICKS_BENCH_USER= go test -tags benchworkspace -bench=. -benchtime=5x -timeout=60m ./libs/sync/... Co-authored-by: Isaac --- libs/sync/workspace_bench_test.go | 309 ++++++++++++++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 libs/sync/workspace_bench_test.go diff --git a/libs/sync/workspace_bench_test.go b/libs/sync/workspace_bench_test.go new file mode 100644 index 00000000000..45301e43320 --- /dev/null +++ b/libs/sync/workspace_bench_test.go @@ -0,0 +1,309 @@ +//go:build benchworkspace + +// Workspace benchmarks for libs/sync. Real-network — they hit a Databricks +// workspace and import / list / delete files there. 
+// +// Quickstart: +// +// export DATABRICKS_BENCH_PROFILE=my-profile # required; profile from .databrickscfg +// export DATABRICKS_BENCH_USER=me@example.com # required; remote dirs go under /Users/$DATABRICKS_BENCH_USER/.tmp/ +// +// go test -tags benchworkspace -bench=. -benchtime=5x -timeout=60m ./libs/sync/... +// +// To run a single benchmark group: +// +// go test -tags benchworkspace -bench=BenchmarkListRepoByCount -benchtime=10x -timeout=20m ./libs/sync/... +// go test -tags benchworkspace -bench=BenchmarkListRepoByContent -benchtime=10x -timeout=20m ./libs/sync/... +// go test -tags benchworkspace -bench=BenchmarkSyncRunOnceColdSnapshot -benchtime=5x -timeout=30m ./libs/sync/... +// +// Use a scratch profile / scratch user — every run creates and deletes a +// fresh /Users/$DATABRICKS_BENCH_USER/.tmp/sync-bench-/ tree. + +package sync + +import ( + "bytes" + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "net/http" + "net/url" + "os" + "path" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "testing" + + "github.com/databricks/cli/libs/filer" + "github.com/databricks/cli/libs/vfs" + "github.com/databricks/databricks-sdk-go" + apiclient "github.com/databricks/databricks-sdk-go/client" + "github.com/databricks/databricks-sdk-go/config" + "github.com/databricks/databricks-sdk-go/service/workspace" +) + +// benchEnv groups the workspace state a benchmark needs. +type benchEnv struct { + wc *databricks.WorkspaceClient + apiClient *apiclient.DatabricksClient + profile string + username string + root string // absolute, no trailing slash +} + +func newBenchEnv(tb testing.TB) *benchEnv { + tb.Helper() + profile := os.Getenv("DATABRICKS_BENCH_PROFILE") + user := os.Getenv("DATABRICKS_BENCH_USER") + if profile == "" || user == "" { + tb.Skip("DATABRICKS_BENCH_PROFILE and DATABRICKS_BENCH_USER must be set") + } + wc, err := databricks.NewWorkspaceClient(&databricks.Config{Profile: profile}) + if err != nil { + tb.Fatalf("workspace client: %v", err) + } + c, err := apiclient.New(&config.Config{Profile: profile}) + if err != nil { + tb.Fatalf("api client: %v", err) + } + tag := make([]byte, 4) + _, _ = rand.Read(tag) + root := fmt.Sprintf("/Users/%s/.tmp/sync-bench-%s", user, hex.EncodeToString(tag)) + if err := wc.Workspace.MkdirsByPath(context.Background(), root); err != nil { + tb.Fatalf("mkdir root: %v", err) + } + tb.Cleanup(func() { + _ = wc.Workspace.Delete(context.Background(), workspace.Delete{Path: root, Recursive: true}) + }) + return &benchEnv{wc: wc, apiClient: c, profile: profile, username: user, root: root} +} + +// importRaw uploads body bytes to the given absolute workspace path via the +// legacy /workspace-files/import-file endpoint. Used during benchmark setup +// (we want population to be cheap and predictable; what we're benchmarking +// is the listing / sync, not the upload). +func importRaw(ctx context.Context, c *apiclient.DatabricksClient, absPath string, body []byte) error { + urlPath := fmt.Sprintf("/api/2.0/workspace-files/import-file/%s?overwrite=true", + url.PathEscape(strings.TrimLeft(absPath, "/"))) + return c.Do(ctx, http.MethodPost, urlPath, nil, nil, body, nil) +} + +// populate uploads the given (relative path -> body) map under remoteDir, +// using a fixed-size worker pool. Returns once all uploads finish. 
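+//
+// Usage sketch (illustrative):
+//
+//	populate(b, env, path.Join(env.root, "cell"), map[string][]byte{
+//		"file-00000.txt": []byte("plain file 0"),
+//	})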
+func populate(tb testing.TB, env *benchEnv, remoteDir string, items map[string][]byte) { + tb.Helper() + ctx := context.Background() + if err := env.wc.Workspace.MkdirsByPath(ctx, remoteDir); err != nil { + tb.Fatalf("mkdir %s: %v", remoteDir, err) + } + type job struct { + rel string + body []byte + } + jobs := make(chan job, len(items)) + for r, b := range items { + jobs <- job{r, b} + } + close(jobs) + var wg sync.WaitGroup + var failed atomic.Int64 + for i := 0; i < 16; i++ { + wg.Go(func() { + for j := range jobs { + if err := importRaw(ctx, env.apiClient, path.Join(remoteDir, j.rel), j.body); err != nil { + failed.Add(1) + } + } + }) + } + wg.Wait() + if failed.Load() > 0 { + tb.Logf("warning: %d uploads failed during populate", failed.Load()) + } +} + +// generators returns a body-by-kind table. The bodies are sized to roughly +// 200 bytes so list-repo response time isn't dominated by per-file content. +func generators() map[string]func(i int) (suffix string, body []byte) { + pad := func(s string, n int) []byte { + if n <= len(s) { + return []byte(s[:n]) + } + buf := make([]byte, n) + copy(buf, s) + for i := len(s); i < n; i++ { + buf[i] = byte('a' + (i % 26)) + } + return buf + } + return map[string]func(i int) (string, []byte){ + "file": func(i int) (string, []byte) { + return ".txt", pad(fmt.Sprintf("plain file %d\n", i), 200) + }, + "py-notebook": func(i int) (string, []byte) { + return ".py", pad(fmt.Sprintf("# Databricks notebook source\nprint('%d')\n", i), 200) + }, + "sql-notebook": func(i int) (string, []byte) { + return ".sql", pad(fmt.Sprintf("-- Databricks notebook source\nSELECT %d;\n", i), 200) + }, + "ipynb": func(i int) (string, []byte) { + return ".ipynb", pad(fmt.Sprintf(`{"cells":[{"cell_type":"code","source":["# Databricks notebook source\n","print(%d)"],"outputs":[],"execution_count":null,"metadata":{}}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"}},"nbformat":4,"nbformat_minor":4}`, i), 200) + }, + "dashboard": func(i int) (string, []byte) { + return ".lvdash.json", pad(`{"datasets":[],"pages":[{"name":"p","displayName":"P","layout":[]}]}`, 200) + }, + } +} + +// BenchmarkListRepoByCount measures /workspace/list-repo cost at varying file +// counts (all plain files), to characterize how the call scales with N. +func BenchmarkListRepoByCount(b *testing.B) { + env := newBenchEnv(b) + wfc, err := filer.NewWorkspaceFilesClient(env.wc, env.root) + if err != nil { + b.Fatalf("filer: %v", err) + } + lister := wfc.(*filer.WorkspaceFilesClient) + + gen := generators()["file"] + for _, n := range []int{10, 100, 500, 1000} { + b.Run(fmt.Sprintf("N=%d", n), func(b *testing.B) { + dir := path.Join(env.root, fmt.Sprintf("count-%d", n)) + items := make(map[string][]byte, n) + for i := 0; i < n; i++ { + suf, body := gen(i) + items[fmt.Sprintf("file-%05d%s", i, suf)] = body + } + populate(b, env, dir, items) + // Warm-up + _, _ = lister.ListWithSHAs(context.Background(), dir) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := lister.ListWithSHAs(context.Background(), dir); err != nil { + b.Fatalf("list: %v", err) + } + } + }) + } +} + +// BenchmarkListRepoByContent measures /workspace/list-repo cost when the +// directory has different mixes of object types. Each scenario has a fixed +// number of files; only the content mix varies. 
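+//
+// go test reports one sub-benchmark per scenario in the map below, e.g.
+// BenchmarkListRepoByContent/all-files and BenchmarkListRepoByContent/mixed
+// (plus the usual -GOMAXPROCS suffix).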
+func BenchmarkListRepoByContent(b *testing.B) { + const N = 200 + env := newBenchEnv(b) + wfc, err := filer.NewWorkspaceFilesClient(env.wc, env.root) + if err != nil { + b.Fatalf("filer: %v", err) + } + lister := wfc.(*filer.WorkspaceFilesClient) + + gen := generators() + scenarios := map[string]func(i int) (string, []byte){ + "all-files": gen["file"], + "all-py-notebooks": gen["py-notebook"], + "all-ipynb": gen["ipynb"], + "all-sql-notebooks": gen["sql-notebook"], + "all-dashboards": gen["dashboard"], + } + mixedKinds := []string{"file", "py-notebook", "ipynb", "dashboard", "sql-notebook"} + scenarios["mixed"] = func(i int) (string, []byte) { + return gen[mixedKinds[i%len(mixedKinds)]](i) + } + + for name, g := range scenarios { + b.Run(name, func(b *testing.B) { + dir := path.Join(env.root, "content-"+name) + items := make(map[string][]byte, N) + for i := 0; i < N; i++ { + suf, body := g(i) + items[fmt.Sprintf("item-%05d%s", i, suf)] = body + } + populate(b, env, dir, items) + _, _ = lister.ListWithSHAs(context.Background(), dir) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := lister.ListWithSHAs(context.Background(), dir); err != nil { + b.Fatalf("list: %v", err) + } + } + }) + } +} + +// BenchmarkSyncRunOnceColdSnapshot times an end-to-end Sync.RunOnce against +// a workspace that already has the bundle's files (the CI-cold-runner case). +// Sub-benchmarks compare with-Layer-3 vs without-Layer-3 to show the speedup +// the remote-SHA filter delivers when nothing has actually changed. +func BenchmarkSyncRunOnceColdSnapshot(b *testing.B) { + env := newBenchEnv(b) + user, err := env.wc.CurrentUser.Me(context.Background()) + if err != nil { + b.Fatalf("Me: %v", err) + } + + for _, n := range []int{20, 100, 500} { + b.Run(fmt.Sprintf("N=%d", n), func(b *testing.B) { + localDir := b.TempDir() + gen := generators()["file"] + for i := 0; i < n; i++ { + suf, body := gen(i) + p := filepath.Join(localDir, fmt.Sprintf("file-%05d%s", i, suf)) + if err := os.WriteFile(p, body, 0o644); err != nil { + b.Fatalf("write %s: %v", p, err) + } + } + remoteDir := path.Join(env.root, fmt.Sprintf("sync-N%d", n)) + if err := env.wc.Workspace.MkdirsByPath(context.Background(), remoteDir); err != nil { + b.Fatalf("mkdir: %v", err) + } + + runOnce := func(b *testing.B, withLayer3 bool) { + _ = os.RemoveAll(filepath.Join(localDir, ".databricks")) + localRoot := vfs.MustNew(localDir) + snapBase := b.TempDir() + s, err := New(context.Background(), SyncOptions{ + WorktreeRoot: localRoot, + LocalRoot: localRoot, + Paths: []string{"."}, + RemotePath: remoteDir, + SnapshotBasePath: snapBase, + WorkspaceClient: env.wc, + CurrentUser: user, + }) + if err != nil { + b.Fatalf("Sync.New: %v", err) + } + if !withLayer3 { + s.remoteFilter = nil + } + if _, err := s.RunOnce(context.Background()); err != nil { + b.Fatalf("RunOnce: %v", err) + } + s.Close() + } + + // Pre-warm: one initial sync so the workspace has the files. 
+			runOnce(b, true)
+			b.ResetTimer()
+
+			b.Run("with-layer3", func(b *testing.B) {
+				for i := 0; i < b.N; i++ {
+					runOnce(b, true)
+				}
+			})
+			b.Run("without-layer3", func(b *testing.B) {
+				for i := 0; i < b.N; i++ {
+					runOnce(b, false)
+				}
+			})
+		})
+	}
+}
+
+// keep tests in this file linkable even without the bench tag
+var _ = bytes.NewReader

From e90c69e9b00ac58caf233be849f5ed8d908416ee Mon Sep 17 00:00:00 2001
From: Shreyas Goenka 
Date: Mon, 4 May 2026 14:51:05 +0000
Subject: [PATCH 6/8] sync(bench): add tree-shape axis + parallel-walk runner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expand the benchworkspace benchmarks so every cell can be combined with
a tree shape: flat (no nesting), small (depth 2, branch 2), medium
(depth 4, branch 2), large (depth 6, branch 2). All four list-repo and
sync benchmarks now parameterize over (shape × N).

Add a test-only parallelWalk runner that lists everything under a path
by issuing non-recursive /api/2.0/workspace/list calls and recursing
client-side with a configurable worker pool. Add BenchmarkListWalkers
that compares list-repo vs parallel-walk (workers=8 and workers=32)
head to head across (shape × N).

The runner is deliberately not exported and not wired into Sync —
production code uses list-repo. The bench is here to make the
comparison reproducible and to back the claim that list-repo dominates
parallel-walk at any non-flat shape.

BenchmarkListRepoByContent now also varies shape, so content × shape is
fully covered.

Run all four groups with:

  go test -tags benchworkspace -bench=. -benchtime=5x -timeout=90m ./libs/sync/...

Co-authored-by: Isaac 
---
 libs/sync/workspace_bench_test.go | 458 +++++++++++++++++++++---------
 1 file changed, 331 insertions(+), 127 deletions(-)

diff --git a/libs/sync/workspace_bench_test.go b/libs/sync/workspace_bench_test.go
index 45301e43320..562989674e5 100644
--- a/libs/sync/workspace_bench_test.go
+++ b/libs/sync/workspace_bench_test.go
@@ -8,21 +8,24 @@
 // export DATABRICKS_BENCH_PROFILE=my-profile  # required; profile from .databrickscfg
 // export DATABRICKS_BENCH_USER=me@example.com # required; remote dirs go under /Users/$DATABRICKS_BENCH_USER/.tmp/
 //
-// go test -tags benchworkspace -bench=. -benchtime=5x -timeout=60m ./libs/sync/...
+// go test -tags benchworkspace -bench=. -benchtime=5x -timeout=90m ./libs/sync/...
 //
-// To run a single benchmark group:
+// Run a single group:
 //
-// go test -tags benchworkspace -bench=BenchmarkListRepoByCount -benchtime=10x -timeout=20m ./libs/sync/...
-// go test -tags benchworkspace -bench=BenchmarkListRepoByContent -benchtime=10x -timeout=20m ./libs/sync/...
-// go test -tags benchworkspace -bench=BenchmarkSyncRunOnceColdSnapshot -benchtime=5x -timeout=30m ./libs/sync/...
+// go test -tags benchworkspace -bench=BenchmarkListRepo$ -benchtime=10x -timeout=20m ./libs/sync/...
+// go test -tags benchworkspace -bench=BenchmarkListRepoByContent -benchtime=10x -timeout=20m ./libs/sync/...
+// go test -tags benchworkspace -bench=BenchmarkListWalkers -benchtime=5x -timeout=30m ./libs/sync/...
+// go test -tags benchworkspace -bench=BenchmarkSyncRunOnceColdSnapshot -benchtime=5x -timeout=30m ./libs/sync/...
 //
-// Use a scratch profile / scratch user — every run creates and deletes a
-// fresh /Users/$DATABRICKS_BENCH_USER/.tmp/sync-bench-/ tree.
+// Filter further with -bench='BenchmarkListWalkers/shape=medium/N=100'.
+// +// Each benchmark group creates and tears down a unique remote tree under +// /Users/$DATABRICKS_BENCH_USER/.tmp/sync-bench-/ on the configured +// workspace. Use a scratch profile. package sync import ( - "bytes" "context" "crypto/rand" "encoding/hex" @@ -32,9 +35,10 @@ import ( "os" "path" "path/filepath" - "strings" - "sync" + "sort" + stdsync "sync" "sync/atomic" + "strings" "testing" "github.com/databricks/cli/libs/filer" @@ -45,13 +49,69 @@ import ( "github.com/databricks/databricks-sdk-go/service/workspace" ) -// benchEnv groups the workspace state a benchmark needs. +// ----- shape: tree depth and branching used to spread files across dirs ---- + +type treeShape struct { + name string + depth int + branch int // unused when depth == 0 (flat) +} + +// shapes ranges from "flat" (no nesting) to "large" (deep tree, many dirs). +// Tweak these values if you want to explore broader/narrower mixes. +var shapes = []treeShape{ + {"flat", 0, 0}, // 1 leaf dir + {"small", 2, 2}, // 4 leaf dirs + {"medium", 4, 2}, // 16 leaf dirs + {"large", 6, 2}, // 64 leaf dirs +} + +// generatePaths returns n relative file paths arranged into a tree of the +// requested shape. Filenames are unique; no extension is added (callers add +// the right one for the content kind they're using). +func generatePaths(shape treeShape, n int) []string { + if shape.depth == 0 { + out := make([]string, n) + for i := 0; i < n; i++ { + out[i] = fmt.Sprintf("f-%05d", i) + } + return out + } + leaves := 1 + for i := 0; i < shape.depth; i++ { + leaves *= shape.branch + } + filesPerLeaf := (n + leaves - 1) / leaves + if filesPerLeaf < 1 { + filesPerLeaf = 1 + } + var paths []string + var walk func(prefix string, d int) + walk = func(prefix string, d int) { + if d == 0 { + for f := 0; f < filesPerLeaf && len(paths) < n; f++ { + paths = append(paths, path.Join(prefix, fmt.Sprintf("f-%d", f))) + } + return + } + for b := 0; b < shape.branch; b++ { + if len(paths) >= n { + return + } + walk(path.Join(prefix, fmt.Sprintf("d%d", b)), d-1) + } + } + walk("", shape.depth) + return paths +} + +// ----- environment + helpers ------------------------------------------------ + type benchEnv struct { wc *databricks.WorkspaceClient apiClient *apiclient.DatabricksClient - profile string username string - root string // absolute, no trailing slash + root string // absolute workspace path, no trailing slash } func newBenchEnv(tb testing.TB) *benchEnv { @@ -78,27 +138,41 @@ func newBenchEnv(tb testing.TB) *benchEnv { tb.Cleanup(func() { _ = wc.Workspace.Delete(context.Background(), workspace.Delete{Path: root, Recursive: true}) }) - return &benchEnv{wc: wc, apiClient: c, profile: profile, username: user, root: root} + return &benchEnv{wc: wc, apiClient: c, username: user, root: root} } -// importRaw uploads body bytes to the given absolute workspace path via the -// legacy /workspace-files/import-file endpoint. Used during benchmark setup -// (we want population to be cheap and predictable; what we're benchmarking -// is the listing / sync, not the upload). +// importRaw uploads body bytes via the legacy import-file endpoint. Used for +// setup only — what we benchmark is the listing / sync path, not the upload. 
func importRaw(ctx context.Context, c *apiclient.DatabricksClient, absPath string, body []byte) error { urlPath := fmt.Sprintf("/api/2.0/workspace-files/import-file/%s?overwrite=true", url.PathEscape(strings.TrimLeft(absPath, "/"))) return c.Do(ctx, http.MethodPost, urlPath, nil, nil, body, nil) } -// populate uploads the given (relative path -> body) map under remoteDir, -// using a fixed-size worker pool. Returns once all uploads finish. +// populate creates all parent dirs under remoteDir, then uploads (relPath → +// body) entries via a 16-worker pool. Setup helper for benchmarks. func populate(tb testing.TB, env *benchEnv, remoteDir string, items map[string][]byte) { tb.Helper() ctx := context.Background() if err := env.wc.Workspace.MkdirsByPath(ctx, remoteDir); err != nil { tb.Fatalf("mkdir %s: %v", remoteDir, err) } + parents := map[string]struct{}{} + for rel := range items { + if d := path.Dir(rel); d != "." && d != "/" { + parents[d] = struct{}{} + } + } + parentList := make([]string, 0, len(parents)) + for d := range parents { + parentList = append(parentList, d) + } + sort.Strings(parentList) + for _, d := range parentList { + if err := env.wc.Workspace.MkdirsByPath(ctx, path.Join(remoteDir, d)); err != nil { + tb.Fatalf("mkdir %s: %v", d, err) + } + } type job struct { rel string body []byte @@ -108,7 +182,7 @@ func populate(tb testing.TB, env *benchEnv, remoteDir string, items map[string][ jobs <- job{r, b} } close(jobs) - var wg sync.WaitGroup + var wg stdsync.WaitGroup var failed atomic.Int64 for i := 0; i < 16; i++ { wg.Go(func() { @@ -120,12 +194,12 @@ func populate(tb testing.TB, env *benchEnv, remoteDir string, items map[string][ }) } wg.Wait() - if failed.Load() > 0 { - tb.Logf("warning: %d uploads failed during populate", failed.Load()) + if f := failed.Load(); f > 0 { + tb.Logf("warning: %d uploads failed during populate", f) } } -// generators returns a body-by-kind table. The bodies are sized to roughly +// generators returns body generators for each content kind, sized roughly to // 200 bytes so list-repo response time isn't dominated by per-file content. func generators() map[string]func(i int) (suffix string, body []byte) { pad := func(s string, n int) []byte { @@ -158,9 +232,105 @@ func generators() map[string]func(i int) (suffix string, body []byte) { } } -// BenchmarkListRepoByCount measures /workspace/list-repo cost at varying file -// counts (all plain files), to characterize how the call scales with N. -func BenchmarkListRepoByCount(b *testing.B) { +// itemsForCell builds the (relPath → body) map for a (shape, n, kindKey) cell. +func itemsForCell(shape treeShape, n int, kindKey string) map[string][]byte { + gen := generators()[kindKey] + paths := generatePaths(shape, n) + out := make(map[string][]byte, len(paths)) + for i, rel := range paths { + suf, body := gen(i) + out[rel+suf] = body + } + return out +} + +// itemsMixed builds a map where each file is a different kind in a fixed +// rotation. Used by the content-mix benchmark. 
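+//
+// For example (illustrative), consecutive files cycle through the kinds:
+//
+//	i=0 file, i=1 py-notebook, i=2 ipynb, i=3 dashboard, i=4 sql-notebook,
+//	i=5 file, ...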
+func itemsMixed(shape treeShape, n int) map[string][]byte { + gens := generators() + kinds := []string{"file", "py-notebook", "ipynb", "dashboard", "sql-notebook"} + paths := generatePaths(shape, n) + out := make(map[string][]byte, len(paths)) + for i, rel := range paths { + suf, body := gens[kinds[i%len(kinds)]](i) + out[rel+suf] = body + } + return out +} + +// ----- parallel-walk runner (test-only alternative to list-repo) ------------ + +// parallelWalk lists everything under root by issuing non-recursive +// /api/2.0/workspace/list calls and recursing into directories from the +// client side, with up to `workers` outstanding calls in flight. It serves as +// a baseline to compare against the recursive list-repo endpoint. +// +// Test-only: we do not ship this in production. The list-repo path is +// strictly faster for any nested tree (see benchmark numbers). +func parallelWalk(ctx context.Context, c *apiclient.DatabricksClient, root string, workers int) ([]filer.RemoteFileMetadata, error) { + type listObject struct { + ObjectType string `json:"object_type"` + Path string `json:"path"` + ContentSHA256Hex string `json:"content_sha256_hex"` + HasWsfsMetadata bool `json:"has_wsfs_metadata"` + } + type listResp struct { + Objects []listObject `json:"objects"` + } + + sem := make(chan struct{}, workers) + var ( + mu stdsync.Mutex + results []filer.RemoteFileMetadata + firstErr error + wg stdsync.WaitGroup + ) + + var walk func(dir string) + walk = func(dir string) { + defer wg.Done() + sem <- struct{}{} + defer func() { <-sem }() + + var resp listResp + body := map[string]any{"path": dir, "return_wsfs_metadata": true} + if err := c.Do(ctx, http.MethodGet, "/api/2.0/workspace/list", nil, nil, body, &resp); err != nil { + mu.Lock() + if firstErr == nil { + firstErr = err + } + mu.Unlock() + return + } + for _, o := range resp.Objects { + if o.ObjectType == "DIRECTORY" { + wg.Add(1) + go walk(o.Path) + continue + } + if o.ContentSHA256Hex == "" { + continue + } + mu.Lock() + results = append(results, filer.RemoteFileMetadata{ + Path: o.Path, + ContentSHA256Hex: o.ContentSHA256Hex, + ObjectType: o.ObjectType, + }) + mu.Unlock() + } + } + wg.Add(1) + go walk(root) + wg.Wait() + return results, firstErr +} + +// ----- benchmarks ---------------------------------------------------------- + +// BenchmarkListRepo measures /workspace/list-repo cost vs (shape, N) at fixed +// content (plain files). 
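+//
+// Cells are named shape=<shape>/N=<count>, e.g.
+// BenchmarkListRepo/shape=flat/N=10 and BenchmarkListRepo/shape=large/N=500.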
+func BenchmarkListRepo(b *testing.B) { env := newBenchEnv(b) wfc, err := filer.NewWorkspaceFilesClient(env.wc, env.root) if err != nil { @@ -168,31 +338,26 @@ func BenchmarkListRepoByCount(b *testing.B) { } lister := wfc.(*filer.WorkspaceFilesClient) - gen := generators()["file"] - for _, n := range []int{10, 100, 500, 1000} { - b.Run(fmt.Sprintf("N=%d", n), func(b *testing.B) { - dir := path.Join(env.root, fmt.Sprintf("count-%d", n)) - items := make(map[string][]byte, n) - for i := 0; i < n; i++ { - suf, body := gen(i) - items[fmt.Sprintf("file-%05d%s", i, suf)] = body - } - populate(b, env, dir, items) - // Warm-up - _, _ = lister.ListWithSHAs(context.Background(), dir) - b.ResetTimer() - for i := 0; i < b.N; i++ { - if _, err := lister.ListWithSHAs(context.Background(), dir); err != nil { - b.Fatalf("list: %v", err) + counts := []int{10, 100, 500} + for _, shape := range shapes { + for _, n := range counts { + b.Run(fmt.Sprintf("shape=%s/N=%d", shape.name, n), func(b *testing.B) { + dir := path.Join(env.root, fmt.Sprintf("listrepo-%s-%d", shape.name, n)) + populate(b, env, dir, itemsForCell(shape, n, "file")) + _, _ = lister.ListWithSHAs(context.Background(), dir) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := lister.ListWithSHAs(context.Background(), dir); err != nil { + b.Fatalf("list: %v", err) + } } - } - }) + }) + } } } -// BenchmarkListRepoByContent measures /workspace/list-repo cost when the -// directory has different mixes of object types. Each scenario has a fixed -// number of files; only the content mix varies. +// BenchmarkListRepoByContent measures /workspace/list-repo cost across +// content mixes, at every shape, fixed N=200. func BenchmarkListRepoByContent(b *testing.B) { const N = 200 env := newBenchEnv(b) @@ -202,28 +367,24 @@ func BenchmarkListRepoByContent(b *testing.B) { } lister := wfc.(*filer.WorkspaceFilesClient) - gen := generators() - scenarios := map[string]func(i int) (string, []byte){ - "all-files": gen["file"], - "all-py-notebooks": gen["py-notebook"], - "all-ipynb": gen["ipynb"], - "all-sql-notebooks": gen["sql-notebook"], - "all-dashboards": gen["dashboard"], - } - mixedKinds := []string{"file", "py-notebook", "ipynb", "dashboard", "sql-notebook"} - scenarios["mixed"] = func(i int) (string, []byte) { - return gen[mixedKinds[i%len(mixedKinds)]](i) - } - - for name, g := range scenarios { - b.Run(name, func(b *testing.B) { - dir := path.Join(env.root, "content-"+name) - items := make(map[string][]byte, N) - for i := 0; i < N; i++ { - suf, body := g(i) - items[fmt.Sprintf("item-%05d%s", i, suf)] = body - } - populate(b, env, dir, items) + contents := []string{"file", "py-notebook", "ipynb", "sql-notebook", "dashboard"} + for _, shape := range shapes { + for _, kind := range contents { + b.Run(fmt.Sprintf("shape=%s/content=%s", shape.name, kind), func(b *testing.B) { + dir := path.Join(env.root, fmt.Sprintf("content-%s-%s", shape.name, kind)) + populate(b, env, dir, itemsForCell(shape, N, kind)) + _, _ = lister.ListWithSHAs(context.Background(), dir) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := lister.ListWithSHAs(context.Background(), dir); err != nil { + b.Fatalf("list: %v", err) + } + } + }) + } + b.Run(fmt.Sprintf("shape=%s/content=mixed", shape.name), func(b *testing.B) { + dir := path.Join(env.root, fmt.Sprintf("content-%s-mixed", shape.name)) + populate(b, env, dir, itemsMixed(shape, N)) _, _ = lister.ListWithSHAs(context.Background(), dir) b.ResetTimer() for i := 0; i < b.N; i++ { @@ -235,75 +396,118 @@ func 
BenchmarkListRepoByContent(b *testing.B) { } } -// BenchmarkSyncRunOnceColdSnapshot times an end-to-end Sync.RunOnce against -// a workspace that already has the bundle's files (the CI-cold-runner case). -// Sub-benchmarks compare with-Layer-3 vs without-Layer-3 to show the speedup -// the remote-SHA filter delivers when nothing has actually changed. -func BenchmarkSyncRunOnceColdSnapshot(b *testing.B) { +// BenchmarkListWalkers compares /workspace/list-repo (recursive, server-side) +// against the test-only parallel client-side walk, across (shape, N, workers). +// Plain-file content; the goal is to characterize the listing strategies +// themselves. +func BenchmarkListWalkers(b *testing.B) { env := newBenchEnv(b) - user, err := env.wc.CurrentUser.Me(context.Background()) + wfc, err := filer.NewWorkspaceFilesClient(env.wc, env.root) if err != nil { - b.Fatalf("Me: %v", err) + b.Fatalf("filer: %v", err) } + lister := wfc.(*filer.WorkspaceFilesClient) + + counts := []int{100, 500} + workerCounts := []int{8, 32} + for _, shape := range shapes { + for _, n := range counts { + dir := path.Join(env.root, fmt.Sprintf("walkers-%s-%d", shape.name, n)) + populate(b, env, dir, itemsForCell(shape, n, "file")) + _, _ = lister.ListWithSHAs(context.Background(), dir) - for _, n := range []int{20, 100, 500} { - b.Run(fmt.Sprintf("N=%d", n), func(b *testing.B) { - localDir := b.TempDir() - gen := generators()["file"] - for i := 0; i < n; i++ { - suf, body := gen(i) - p := filepath.Join(localDir, fmt.Sprintf("file-%05d%s", i, suf)) - if err := os.WriteFile(p, body, 0o644); err != nil { - b.Fatalf("write %s: %v", p, err) + b.Run(fmt.Sprintf("shape=%s/N=%d/impl=list-repo", shape.name, n), func(b *testing.B) { + for i := 0; i < b.N; i++ { + if _, err := lister.ListWithSHAs(context.Background(), dir); err != nil { + b.Fatalf("list-repo: %v", err) + } } + }) + for _, w := range workerCounts { + w := w + b.Run(fmt.Sprintf("shape=%s/N=%d/impl=parallel-w%d", shape.name, n, w), func(b *testing.B) { + for i := 0; i < b.N; i++ { + if _, err := parallelWalk(context.Background(), env.apiClient, dir, w); err != nil { + b.Fatalf("parallel-walk: %v", err) + } + } + }) } - remoteDir := path.Join(env.root, fmt.Sprintf("sync-N%d", n)) - if err := env.wc.Workspace.MkdirsByPath(context.Background(), remoteDir); err != nil { - b.Fatalf("mkdir: %v", err) - } + } + } +} - runOnce := func(b *testing.B, withLayer3 bool) { - _ = os.RemoveAll(filepath.Join(localDir, ".databricks")) - localRoot := vfs.MustNew(localDir) - snapBase := b.TempDir() - s, err := New(context.Background(), SyncOptions{ - WorktreeRoot: localRoot, - LocalRoot: localRoot, - Paths: []string{"."}, - RemotePath: remoteDir, - SnapshotBasePath: snapBase, - WorkspaceClient: env.wc, - CurrentUser: user, - }) - if err != nil { - b.Fatalf("Sync.New: %v", err) +// BenchmarkSyncRunOnceColdSnapshot times an end-to-end Sync.RunOnce against a +// pre-warmed remote (the CI-cold-runner case). Sub-benchmarks compare with- +// and without-Layer-3 across (shape, N) cells. Plain-file content. 
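+//
+// Each cell reports two sub-benchmarks, e.g.:
+//
+//	BenchmarkSyncRunOnceColdSnapshot/shape=medium/N=100/with-layer3
+//	BenchmarkSyncRunOnceColdSnapshot/shape=medium/N=100/without-layer3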
+func BenchmarkSyncRunOnceColdSnapshot(b *testing.B) {
+	env := newBenchEnv(b)
+	user, err := env.wc.CurrentUser.Me(context.Background())
+	if err != nil {
+		b.Fatalf("Me: %v", err)
+	}
+
+	counts := []int{20, 100}
+	for _, shape := range shapes {
+		for _, n := range counts {
+			b.Run(fmt.Sprintf("shape=%s/N=%d", shape.name, n), func(b *testing.B) {
+				localDir := b.TempDir()
+				gen := generators()["file"]
+				for i, rel := range generatePaths(shape, n) {
+					suf, body := gen(i)
+					p := filepath.Join(localDir, filepath.FromSlash(rel)+suf)
+					if err := os.MkdirAll(filepath.Dir(p), 0o755); err != nil {
+						b.Fatalf("mkdir: %v", err)
+					}
+					if err := os.WriteFile(p, body, 0o644); err != nil {
+						b.Fatalf("write %s: %v", p, err)
+					}
 				}
-			if !withLayer3 {
-				s.remoteFilter = nil
+				remoteDir := path.Join(env.root, fmt.Sprintf("sync-%s-%d", shape.name, n))
+				if err := env.wc.Workspace.MkdirsByPath(context.Background(), remoteDir); err != nil {
+					b.Fatalf("mkdir: %v", err)
 				}
-			if _, err := s.RunOnce(context.Background()); err != nil {
-				b.Fatalf("RunOnce: %v", err)
+
+				runOnce := func(b *testing.B, withLayer3 bool) {
+					_ = os.RemoveAll(filepath.Join(localDir, ".databricks"))
+					localRoot := vfs.MustNew(localDir)
+					snapBase := b.TempDir()
+					s, err := New(context.Background(), SyncOptions{
+						WorktreeRoot:     localRoot,
+						LocalRoot:        localRoot,
+						Paths:            []string{"."},
+						RemotePath:       remoteDir,
+						SnapshotBasePath: snapBase,
+						WorkspaceClient:  env.wc,
+						CurrentUser:      user,
+					})
+					if err != nil {
+						b.Fatalf("Sync.New: %v", err)
+					}
+					if !withLayer3 {
+						s.remoteFilter = nil
+					}
+					if _, err := s.RunOnce(context.Background()); err != nil {
+						b.Fatalf("RunOnce: %v", err)
+					}
+					s.Close()
 				}
-			s.Close()
-		}
 
-		// Pre-warm: one initial sync so the workspace has the files.
-		runOnce(b, true)
-		b.ResetTimer()
+				// Pre-warm so the workspace already has every file.
+				runOnce(b, true)
+				b.ResetTimer()
 
-		b.Run("with-layer3", func(b *testing.B) {
-			for i := 0; i < b.N; i++ {
-				runOnce(b, true)
-			}
-		})
-		b.Run("without-layer3", func(b *testing.B) {
-			for i := 0; i < b.N; i++ {
-				runOnce(b, false)
+				b.Run("with-layer3", func(b *testing.B) {
+					for i := 0; i < b.N; i++ {
+						runOnce(b, true)
+					}
+				})
+				b.Run("without-layer3", func(b *testing.B) {
+					for i := 0; i < b.N; i++ {
+						runOnce(b, false)
+					}
+				})
 			})
-		})
+		}
 	}
 }
-
-// keep tests in this file linkable even without the bench tag
-var _ = bytes.NewReader

From 4acd80d8bd4c3b4ed775f67fbcad210e18d21d3d Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Wed, 6 May 2026 10:41:22 +0000
Subject: [PATCH 7/8] sync(bench): add minimal walker comparison

BenchmarkWalkers compares /workspace/list-repo against the parallel
list-walker across four leaf counts (1, 4, 16, 64) at full occupancy
(one file per leaf). Reuses helpers from workspace_bench_test.go.

Run:

  go test -tags benchworkspace -bench=BenchmarkWalkers$ -benchtime=3x -timeout=15m ./libs/sync/

Co-authored-by: Isaac
---
 libs/sync/walkers_bench_test.go | 68 +++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 libs/sync/walkers_bench_test.go
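The parallelWalk helper exercised here is defined alongside the other bench
helpers in workspace_bench_test.go. For orientation, a minimal sketch of a
walker in this style, using the SDK's public Workspace.ListAll call rather
than the benchmarks' raw API client (the package name and the names
parallelWalkSketch/visit are illustrative assumptions, not code from this
series):

	package bench // hypothetical package, for illustration only

	import (
		"context"
		"sync"

		"github.com/databricks/databricks-sdk-go"
		"github.com/databricks/databricks-sdk-go/service/workspace"
	)

	// parallelWalkSketch lists every object under root. Each directory is one
	// non-recursive /api/2.0/workspace/list call; discovered subdirectories are
	// walked by freshly spawned goroutines, with a semaphore capping the number
	// of in-flight list calls at `workers`.
	func parallelWalkSketch(ctx context.Context, wc *databricks.WorkspaceClient, root string, workers int) ([]workspace.ObjectInfo, error) {
		var (
			mu       sync.Mutex
			out      []workspace.ObjectInfo
			wg       sync.WaitGroup
			sem      = make(chan struct{}, workers)
			errOnce  sync.Once
			firstErr error
		)
		var visit func(dir string)
		visit = func(dir string) {
			defer wg.Done()
			sem <- struct{}{} // acquire a worker slot
			objs, err := wc.Workspace.ListAll(ctx, workspace.ListWorkspaceRequest{Path: dir})
			<-sem // release before recursing, so children don't deadlock
			if err != nil {
				errOnce.Do(func() { firstErr = err })
				return
			}
			mu.Lock()
			out = append(out, objs...)
			mu.Unlock()
			for _, o := range objs {
				if o.ObjectType == workspace.ObjectTypeDirectory {
					wg.Add(1)
					go visit(o.Path)
				}
			}
		}
		wg.Add(1)
		go visit(root)
		wg.Wait()
		return out, firstErr
	}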
diff --git a/libs/sync/walkers_bench_test.go b/libs/sync/walkers_bench_test.go
new file mode 100644
index 00000000000..2fa8c1958df
--- /dev/null
+++ b/libs/sync/walkers_bench_test.go
@@ -0,0 +1,68 @@
+//go:build benchworkspace
+
+// BenchmarkWalkers — minimal head-to-head comparison of /workspace/list-repo
+// (recursive, server-side) vs a client-side parallel walker over the
+// non-recursive /workspace/list endpoint.
+//
+// One cell per leaf-count: 1, 4, 16, 64. Full occupancy — exactly one file
+// per leaf directory, so total files equals leaf count and every leaf is
+// populated. Two sub-benches per cell: list-repo vs parallel-walk (8 workers).
+//
+// Run with 3 iterations per cell:
+//
+//	export DATABRICKS_BENCH_PROFILE=
+//	export DATABRICKS_BENCH_USER=
+//	go test -tags benchworkspace -bench=BenchmarkWalkers$ -benchtime=3x -timeout=15m ./libs/sync/
+
+package sync
+
+import (
+	"context"
+	"path"
+	"testing"
+
+	"github.com/databricks/cli/libs/filer"
+)
+
+func BenchmarkWalkers(b *testing.B) {
+	env := newBenchEnv(b)
+	wfc, err := filer.NewWorkspaceFilesClient(env.wc, env.root)
+	if err != nil {
+		b.Fatalf("filer: %v", err)
+	}
+	lister := wfc.(*filer.WorkspaceFilesClient)
+
+	// Shapes are sized so every leaf dir gets exactly one file at full
+	// occupancy (files = leaves).
+	cases := []struct {
+		name   string
+		shape  treeShape
+		leaves int
+	}{
+		{"leaves=1", treeShape{"flat", 0, 0}, 1},
+		{"leaves=4", treeShape{"small", 2, 2}, 4},
+		{"leaves=16", treeShape{"medium", 4, 2}, 16},
+		{"leaves=64", treeShape{"large", 6, 2}, 64},
+	}
+
+	for _, c := range cases {
+		dir := path.Join(env.root, "walkers-"+c.name)
+		populate(b, env, dir, itemsForCell(c.shape, c.leaves, "file"))
+		_, _ = lister.ListWithSHAs(context.Background(), dir)
+
+		b.Run(c.name+"/list-repo", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				if _, err := lister.ListWithSHAs(context.Background(), dir); err != nil {
+					b.Fatalf("list-repo: %v", err)
+				}
+			}
+		})
+		b.Run(c.name+"/parallel-walk", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				if _, err := parallelWalk(context.Background(), env.apiClient, dir, 8); err != nil {
+					b.Fatalf("parallel-walk: %v", err)
+				}
+			}
+		})
+	}
+}

From 1f89453ad8b5be598f0ecfa249c6f427499ff27e Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Wed, 6 May 2026 10:49:34 +0000
Subject: [PATCH 8/8] sync(bench): round-robin walker bench with persistent
 fixture; parallelize mkdirs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BenchmarkWalkers now plants exactly 100 files in every cell (1, 4, 16,
64 leaves), distributing them round-robin: file 0 → leaf 0, file 1 →
leaf 1, …, rolling over once every leaf has one file. Total file count
is constant; only depth/breadth changes.

The scaffold path is hardcoded at
/Users/$DATABRICKS_BENCH_USER/.tmp/sync-bench-walkers-fixture and is
not torn down between runs. ensureWalkersFixture skips the scaffold
call when the per-cell directory already has the expected file count,
so a second run reuses the workspace state. Cold run is ~80 s; warm
re-run is ~45 s for the same 24 measurements.

Also parallelize the mkdir loop in populate() — the directory
pre-creation was the serial bottleneck for deep shapes (depth=6 with
64 leaves did 64 sequential mkdirs at ~150 ms each). Now uses the same
16-worker pool the upload loop already used.

Co-authored-by: Isaac
---
 libs/sync/walkers_bench_test.go   | 134 ++++++++++++++++++++++++++----
 libs/sync/workspace_bench_test.go |  37 +++++++--
 2 files changed, 144 insertions(+), 27 deletions(-)
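A quick sanity check on the per-leaf counts quoted above (derived from the
i%leaves round-robin assignment, not measured): with 100 files, leaves=4
gives exactly 25 per leaf; leaves=16 gives 7 files in the first 4 leaves and
6 in the other 12 (100 = 6×16 + 4); leaves=64 gives 2 files in the first 36
leaves and 1 in the remaining 28 (100 = 64 + 36).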
diff --git a/libs/sync/walkers_bench_test.go b/libs/sync/walkers_bench_test.go
index 2fa8c1958df..d6103587117 100644
--- a/libs/sync/walkers_bench_test.go
+++ b/libs/sync/walkers_bench_test.go
@@ -4,9 +4,21 @@
 // (recursive, server-side) vs a client-side parallel walker over the
 // non-recursive /workspace/list endpoint.
 //
-// One cell per leaf-count: 1, 4, 16, 64. Full occupancy — exactly one file
-// per leaf directory, so total files equals leaf count and every leaf is
-// populated.
+// Always 100 files, distributed round-robin across the available leaves of
+// each tree shape. Total file count is constant; only depth/breadth changes:
+//
+//	leaves=1  → 100 files in one dir (flat)
+//	leaves=4  → ~25 files per dir, depth 2
+//	leaves=16 → ~6-7 files per dir, depth 4
+//	leaves=64 → ~1-2 files per dir, depth 6
+//
+// The scaffold lives at a hardcoded persistent path:
+//
+//	/Users/$DATABRICKS_BENCH_USER/.tmp/sync-bench-walkers-fixture/
+//
+// First run populates it (~one-time cost). Subsequent runs reuse it, so
+// repeated bench iterations are fast. To force a fresh scaffold, delete the
+// fixture dir from the workspace.
 //
 // Run with 3 iterations per cell:
 //
@@ -18,37 +30,63 @@
 package sync
 
 import (
 	"context"
+	"fmt"
+	"os"
 	"path"
 	"testing"
 
 	"github.com/databricks/cli/libs/filer"
+	"github.com/databricks/databricks-sdk-go"
+	apiclient "github.com/databricks/databricks-sdk-go/client"
+	"github.com/databricks/databricks-sdk-go/config"
+)
+
+const (
+	walkersFilesPerCell = 100
+	walkersFixtureName  = "sync-bench-walkers-fixture"
 )
 
 func BenchmarkWalkers(b *testing.B) {
-	env := newBenchEnv(b)
-	wfc, err := filer.NewWorkspaceFilesClient(env.wc, env.root)
+	profile := os.Getenv("DATABRICKS_BENCH_PROFILE")
+	user := os.Getenv("DATABRICKS_BENCH_USER")
+	if profile == "" || user == "" {
+		b.Skip("DATABRICKS_BENCH_PROFILE and DATABRICKS_BENCH_USER must be set")
+	}
+	wc, err := databricks.NewWorkspaceClient(&databricks.Config{Profile: profile})
+	if err != nil {
+		b.Fatalf("workspace client: %v", err)
+	}
+	apiC, err := apiclient.New(&config.Config{Profile: profile})
+	if err != nil {
+		b.Fatalf("api client: %v", err)
+	}
+
+	fixtureRoot := fmt.Sprintf("/Users/%s/.tmp/%s", user, walkersFixtureName)
+	if err := wc.Workspace.MkdirsByPath(context.Background(), fixtureRoot); err != nil {
+		b.Fatalf("mkdir fixture root: %v", err)
+	}
+	env := &benchEnv{wc: wc, apiClient: apiC, username: user, root: fixtureRoot}
+
+	wfc, err := filer.NewWorkspaceFilesClient(wc, fixtureRoot)
 	if err != nil {
 		b.Fatalf("filer: %v", err)
 	}
 	lister := wfc.(*filer.WorkspaceFilesClient)
 
-	// Shapes are sized so every leaf dir gets exactly one file at full
-	// occupancy (files = leaves).
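+	// Leaf counts are implied by the shape itself (branch^depth leaves):
+	// {2,2} gives 4, {4,2} gives 16, {6,2} gives 64, and flat has one leaf.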
 	cases := []struct {
-		name   string
-		shape  treeShape
-		leaves int
+		name  string
+		shape treeShape
 	}{
-		{"leaves=1", treeShape{"flat", 0, 0}, 1},
-		{"leaves=4", treeShape{"small", 2, 2}, 4},
-		{"leaves=16", treeShape{"medium", 4, 2}, 16},
-		{"leaves=64", treeShape{"large", 6, 2}, 64},
+		{"leaves=1", treeShape{"flat", 0, 0}},
+		{"leaves=4", treeShape{"small", 2, 2}},
+		{"leaves=16", treeShape{"medium", 4, 2}},
+		{"leaves=64", treeShape{"large", 6, 2}},
 	}
 
 	for _, c := range cases {
-		dir := path.Join(env.root, "walkers-"+c.name)
-		populate(b, env, dir, itemsForCell(c.shape, c.leaves, "file"))
-		_, _ = lister.ListWithSHAs(context.Background(), dir)
+		dir := path.Join(fixtureRoot, c.name)
+		items := walkersItemsRoundRobin(c.shape, walkersFilesPerCell)
+		ensureWalkersFixture(b, env, lister, dir, items)
 
 		b.Run(c.name+"/list-repo", func(b *testing.B) {
 			for i := 0; i < b.N; i++ {
@@ -59,10 +97,70 @@ func BenchmarkWalkers(b *testing.B) {
 		})
 		b.Run(c.name+"/parallel-walk", func(b *testing.B) {
 			for i := 0; i < b.N; i++ {
-				if _, err := parallelWalk(context.Background(), env.apiClient, dir, 8); err != nil {
+				if _, err := parallelWalk(context.Background(), apiC, dir, 8); err != nil {
 					b.Fatalf("parallel-walk: %v", err)
 				}
 			}
 		})
 	}
 }
+
+// walkersItemsRoundRobin generates n file paths distributed round-robin
+// across the leaves of the given shape (file 0 → leaf 0, file 1 → leaf 1, …,
+// rolling over once every leaf has one file). Returns a (relPath → body)
+// map suitable for passing to populate().
+func walkersItemsRoundRobin(shape treeShape, n int) map[string][]byte {
+	gen := generators()["file"]
+	leaves := walkersLeafPaths(shape)
+	out := make(map[string][]byte, n)
+	for i := 0; i < n; i++ {
+		leaf := leaves[i%len(leaves)]
+		suf, body := gen(i)
+		rel := path.Join(leaf, fmt.Sprintf("f-%d%s", i, suf))
+		out[rel] = body
+	}
+	return out
+}
+
+// walkersLeafPaths returns every leaf-dir path for the given shape, in DFS
+// order. For shape=flat it returns [""], so files land directly under the
+// cell root.
+func walkersLeafPaths(shape treeShape) []string {
+	if shape.depth == 0 {
+		return []string{""}
+	}
+	var paths []string
+	var walk func(prefix string, d int)
+	walk = func(prefix string, d int) {
+		if d == 0 {
+			paths = append(paths, prefix)
+			return
+		}
+		for b := 0; b < shape.branch; b++ {
+			walk(path.Join(prefix, fmt.Sprintf("d%d", b)), d-1)
+		}
+	}
+	walk("", shape.depth)
+	return paths
+}
+
+// ensureWalkersFixture populates dir with items only if the existing fixture
+// doesn't already match. The existence check is by file count under dir
+// (using list-repo). This lets repeated bench runs reuse the scaffolding.
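+//
+// The check is deliberately count-only, so content drift inside an existing
+// fixture goes undetected; per the header comment, delete the fixture dir
+// from the workspace to force a clean scaffold.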
+func ensureWalkersFixture(b *testing.B, env *benchEnv, lister *filer.WorkspaceFilesClient, dir string, items map[string][]byte) {
+	existing, err := lister.ListWithSHAs(context.Background(), dir)
+	if err == nil {
+		fileCount := 0
+		for _, o := range existing {
+			if o.ContentSHA256Hex != "" {
+				fileCount++
+			}
+		}
+		if fileCount == len(items) {
+			b.Logf("fixture %s reused (%d files)", dir, fileCount)
+			return
+		}
+	}
+	b.Logf("scaffolding fixture %s (%d files)", dir, len(items))
+	populate(b, env, dir, items)
+}
diff --git a/libs/sync/workspace_bench_test.go b/libs/sync/workspace_bench_test.go
index 562989674e5..c70bf3d99d4 100644
--- a/libs/sync/workspace_bench_test.go
+++ b/libs/sync/workspace_bench_test.go
@@ -36,9 +36,9 @@ import (
 	"path"
 	"path/filepath"
 	"sort"
+	"strings"
 	stdsync "sync"
 	"sync/atomic"
-	"strings"
 	"testing"
 
 	"github.com/databricks/cli/libs/filer"
@@ -60,10 +60,10 @@ type treeShape struct {
 // shapes ranges from "flat" (no nesting) to "large" (deep tree, many dirs).
 // Tweak these values if you want to explore broader/narrower mixes.
 var shapes = []treeShape{
-	{"flat", 0, 0}, // 1 leaf dir
-	{"small", 2, 2}, // 4 leaf dirs
-	{"medium", 4, 2}, // 16 leaf dirs
-	{"large", 6, 2}, // 64 leaf dirs
+	{"flat", 0, 0},   // 1 leaf dir
+	{"small", 2, 2},  // 4 leaf dirs
+	{"medium", 4, 2}, // 16 leaf dirs
+	{"large", 6, 2},  // 64 leaf dirs
 }
 
 // generatePaths returns n relative file paths arranged into a tree of the
@@ -168,10 +168,29 @@ func populate(tb testing.TB, env *benchEnv, remoteDir string, items map[string][
 		parentList = append(parentList, d)
 	}
 	sort.Strings(parentList)
+
+	// Parallelize the mkdirs — for deep shapes the serial loop dominates
+	// fixture setup time.
+	mkdirJobs := make(chan string, len(parentList))
 	for _, d := range parentList {
-		if err := env.wc.Workspace.MkdirsByPath(ctx, path.Join(remoteDir, d)); err != nil {
-			tb.Fatalf("mkdir %s: %v", d, err)
-		}
+		mkdirJobs <- d
+	}
+	close(mkdirJobs)
+	var mkdirWG stdsync.WaitGroup
+	var mkdirErr atomic.Pointer[error]
+	for i := 0; i < 16; i++ {
+		mkdirWG.Go(func() {
+			for d := range mkdirJobs {
+				if err := env.wc.Workspace.MkdirsByPath(ctx, path.Join(remoteDir, d)); err != nil {
+					e := fmt.Errorf("mkdir %s: %w", d, err)
+					mkdirErr.CompareAndSwap(nil, &e)
+				}
+			}
+		})
+	}
+	mkdirWG.Wait()
+	if e := mkdirErr.Load(); e != nil {
+		tb.Fatalf("%v", *e)
 	}
 
 	type job struct {
 		rel string
@@ -338,7 +357,7 @@ func BenchmarkListRepo(b *testing.B) {
 	}
 	lister := wfc.(*filer.WorkspaceFilesClient)
 
-	counts := []int{10, 100, 500}
+	counts := []int{10, 50}
 	for _, shape := range shapes {
 		for _, n := range counts {
 			b.Run(fmt.Sprintf("shape=%s/N=%d", shape.name, n), func(b *testing.B) {