Skip to content

Commit 4f328b4

Browse files
Liu Yangfacebook-github-bot
authored andcommitted
InferredCopyFrom: consider basename match
Summary: This is another relatively cheap copy-detection heuristic. It roughly goes as follows: * extract the basenames of all new/changed files in the current changeset, along with the top-level directories they are in * search within those same directories for files with the same basenames * if any pair has the same `content_id`, record it as a copy. This makes this type depend on BSSM, which we query for files by basename. Differential Revision: D75554932 fbshipit-source-id: b3c40abbb6017e21d6a90a74b90e8d7b6d1c7e29
1 parent fd43673 commit 4f328b4

5 files changed

Lines changed: 200 additions & 4 deletions

File tree

eden/mononoke/derived_data/BUCK

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,9 @@ rust_library(
578578
"fbsource//third-party/rust:anyhow",
579579
"fbsource//third-party/rust:async-trait",
580580
"fbsource//third-party/rust:futures",
581+
"fbsource//third-party/rust:itertools",
582+
"fbsource//third-party/rust:vec1",
583+
":basename_suffix_skeleton_manifest_v3",
581584
":fsnodes",
582585
"//common/rust/shed/cloned:cloned",
583586
"//eden/mononoke/blobstore:blobstore",

eden/mononoke/derived_data/inferred_copy_from/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,18 @@ path = "lib.rs"
1313
[dependencies]
1414
anyhow = "1.0.95"
1515
async-trait = "0.1.86"
16+
basename_suffix_skeleton_manifest_v3 = { version = "0.1.0", path = "../basename_suffix_skeleton_manifest_v3" }
1617
blobstore = { version = "0.1.0", path = "../../blobstore" }
1718
cloned = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
1819
context = { version = "0.1.0", path = "../../server/context" }
1920
derived_data_manager = { version = "0.1.0", path = "../manager" }
2021
derived_data_service_if = { version = "0.1.0", path = "../remote/if" }
2122
fsnodes = { version = "0.1.0", path = "../fsnodes" }
2223
futures = { version = "0.3.30", features = ["async-await", "compat"] }
24+
itertools = "0.14.0"
2325
manifest = { version = "0.1.0", path = "../../manifest" }
2426
mononoke_types = { version = "0.1.0", path = "../../mononoke_types" }
27+
vec1 = { version = "1", features = ["serde"] }
2528

2629
[dev-dependencies]
2730
facet = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }

eden/mononoke/derived_data/inferred_copy_from/derive.rs

Lines changed: 121 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,21 @@
66
*/
77

88
use std::collections::HashMap;
9+
use std::collections::HashSet;
910

1011
use anyhow::Context;
12+
use anyhow::Error;
1113
use anyhow::Result;
14+
use basename_suffix_skeleton_manifest_v3::RootBssmV3DirectoryId;
1215
use cloned::cloned;
1316
use context::CoreContext;
1417
use derived_data_manager::DerivationContext;
1518
use fsnodes::RootFsnodeId;
19+
use futures::Stream;
20+
use futures::StreamExt;
1621
use futures::future::try_join_all;
1722
use futures::stream::TryStreamExt;
23+
use itertools::EitherOrBoth;
1824
use manifest::ManifestOps;
1925
use mononoke_types::BonsaiChangeset;
2026
use mononoke_types::ChangesetId;
@@ -23,6 +29,9 @@ use mononoke_types::MPath;
2329
use mononoke_types::NonRootMPath;
2430
use mononoke_types::inferred_copy_from::InferredCopyFrom;
2531
use mononoke_types::inferred_copy_from::InferredCopyFromEntry;
32+
use vec1::Vec1;
33+
34+
const BASENAME_MATCH_MAX_CANDIDATES: usize = 10_000;
2635

2736
// It's possible to have multiple source files that match,
2837
// pick the one with the smallest path
@@ -59,6 +68,26 @@ async fn get_content_to_paths_from_changeset(
5968
Ok(content_to_paths)
6069
}
6170

71+
async fn get_matched_paths_by_basenames_from_changeset(
72+
ctx: &CoreContext,
73+
derivation_ctx: &DerivationContext,
74+
cs_id: ChangesetId,
75+
basenames: Vec<String>,
76+
path_prefixes: Vec<MPath>,
77+
) -> Result<impl Stream<Item = Result<MPath, Error>>> {
78+
derivation_ctx
79+
.fetch_dependency::<RootBssmV3DirectoryId>(ctx, cs_id)
80+
.await?
81+
.find_files_filter_basenames(
82+
ctx,
83+
derivation_ctx.blobstore().clone(),
84+
path_prefixes,
85+
EitherOrBoth::Left(Vec1::try_from_vec(basenames)?),
86+
None,
87+
)
88+
.await
89+
}
90+
6291
// Find exact renames by comparing the content of deleted vs new/changed files
6392
// in the current changeset. If they have the same content, the path pair is
6493
// a rename.
@@ -123,15 +152,104 @@ async fn find_exact_renames(
123152
Ok(renames)
124153
}
125154

155+
// Infer copies by matching basenames between new/changed files in the
156+
// current changeset and other files in the same repo (with some constraints).
157+
// If the basenames match and the content are the same, the path pair is a copy.
158+
async fn find_basename_matched_copies(
159+
ctx: &CoreContext,
160+
derivation_ctx: &DerivationContext,
161+
bonsai: &BonsaiChangeset,
162+
paths_to_ignore: &HashSet<MPath>,
163+
) -> Result<Vec<(MPath, InferredCopyFromEntry)>> {
164+
let mut content_to_paths = HashMap::new();
165+
let mut basenames = HashSet::new();
166+
let mut path_prefixes = HashSet::new();
167+
for (path, file_change) in bonsai.simplified_file_changes() {
168+
if !paths_to_ignore.contains(path.into()) {
169+
if let Some(fc) = file_change {
170+
content_to_paths
171+
.entry(fc.content_id())
172+
.or_insert(vec![])
173+
.push(path.clone());
174+
175+
basenames.insert(path.basename().to_string());
176+
// Restrict search to any of the touched top-level directory
177+
if let Some(path_prefix) = path.take_prefix_components(1)? {
178+
path_prefixes.insert(MPath::from(path_prefix));
179+
}
180+
}
181+
}
182+
}
183+
if basenames.is_empty() {
184+
return Ok(vec![]);
185+
}
186+
187+
let basenames_vec = basenames.into_iter().collect::<Vec<_>>();
188+
let path_prefixes_vec = path_prefixes.into_iter().collect::<Vec<_>>();
189+
let mut content_to_matched_paths = HashMap::new();
190+
191+
for parent_cs_id in bonsai.parents() {
192+
content_to_matched_paths.extend(
193+
get_matched_paths_by_basenames_from_changeset(
194+
ctx,
195+
derivation_ctx,
196+
parent_cs_id,
197+
basenames_vec.clone(),
198+
path_prefixes_vec.clone(),
199+
)
200+
.await?
201+
.try_filter_map(async move |path| Ok(path.into_optional_non_root_path()))
202+
.take(BASENAME_MATCH_MAX_CANDIDATES)
203+
.try_chunks(100)
204+
.try_fold(HashMap::new(), |mut acc, paths| async move {
205+
let hashmap =
206+
get_content_to_paths_from_changeset(ctx, derivation_ctx, parent_cs_id, paths)
207+
.await;
208+
if let Ok(hashmap) = hashmap {
209+
acc.extend(hashmap.into_iter());
210+
}
211+
Ok(acc)
212+
})
213+
.await?
214+
.into_iter(),
215+
);
216+
}
217+
218+
let mut copies = vec![];
219+
for (content_id, paths) in content_to_paths {
220+
if let Some(matched_paths) = content_to_matched_paths.get(&content_id) {
221+
let from = pick_source_from_candidates(matched_paths).unwrap();
222+
for path in paths {
223+
copies.push((
224+
MPath::from(path),
225+
InferredCopyFromEntry {
226+
from_csid: from.0,
227+
from_path: from.1.clone(),
228+
},
229+
));
230+
}
231+
}
232+
}
233+
Ok(copies)
234+
}
235+
236+
// TODO: add more cases
237+
// Ref: https://github.com/git/git/blob/master/diffcore-rename.c
126238
pub(crate) async fn derive_impl(
127239
ctx: &CoreContext,
128240
derivation_ctx: &DerivationContext,
129241
bonsai: &BonsaiChangeset,
130242
) -> Result<Option<InferredCopyFrom>> {
131-
// TODO: add more cases
132-
// Ref: https://github.com/git/git/blob/master/diffcore-rename.c
133-
let entries = find_exact_renames(ctx, derivation_ctx, bonsai).await?;
243+
let mut resolved_paths = HashSet::new();
244+
245+
let exact_renames = find_exact_renames(ctx, derivation_ctx, bonsai).await?;
246+
resolved_paths.extend(exact_renames.iter().map(|(path, _)| path.clone()));
247+
248+
let basename_matched_copies =
249+
find_basename_matched_copies(ctx, derivation_ctx, bonsai, &resolved_paths).await?;
250+
resolved_paths.extend(basename_matched_copies.iter().map(|(path, _)| path.clone()));
134251

252+
let entries = [exact_renames, basename_matched_copies].concat();
135253
if entries.is_empty() {
136254
Ok(None)
137255
} else {

eden/mononoke/derived_data/inferred_copy_from/mapping.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use anyhow::Error;
1212
use anyhow::Result;
1313
use anyhow::anyhow;
1414
use async_trait::async_trait;
15+
use basename_suffix_skeleton_manifest_v3::RootBssmV3DirectoryId;
1516
use blobstore::BlobstoreGetData;
1617
use blobstore::Storable;
1718
use context::CoreContext;
@@ -73,7 +74,7 @@ impl RootInferredCopyFromId {
7374
impl BonsaiDerivable for RootInferredCopyFromId {
7475
const VARIANT: DerivableType = DerivableType::InferredCopyFrom;
7576

76-
type Dependencies = dependencies![RootFsnodeId];
77+
type Dependencies = dependencies![RootFsnodeId, RootBssmV3DirectoryId];
7778
type PredecessorDependencies = dependencies![];
7879

7980
async fn derive_single(

eden/mononoke/derived_data/inferred_copy_from/tests.rs

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,39 @@ async fn init_repo(ctx: &CoreContext) -> Result<(Repo, HashMap<&'static str, Cha
7171
.await?,
7272
);
7373

74+
changesets.insert(
75+
"d",
76+
CreateCommitContext::new(ctx, &repo, vec![changesets["c"]])
77+
.add_file("path/to/basename1", "aabbcc\n")
78+
.add_file("path/to/basename2", "ddeeff\n")
79+
.set_author_date(DateTime::from_timestamp(1000, 0)?)
80+
.commit()
81+
.await?,
82+
);
83+
changesets.insert(
84+
"e",
85+
CreateCommitContext::new(ctx, &repo, vec![changesets["d"]])
86+
// Inferred copies:
87+
// d:path/to/basename1 -> path/basename1
88+
// d:path/to/basename2 -> path/basename2
89+
// d:path/to/basename2 -> another/path/basename2
90+
.add_file("path/basename1", "aabbcc\n")
91+
.add_file("path/basename2", "ddeeff\n")
92+
.add_file("another/path/basename2", "ddeeff\n")
93+
.set_author_date(DateTime::from_timestamp(1000, 0)?)
94+
.commit()
95+
.await?,
96+
);
97+
changesets.insert(
98+
"f",
99+
CreateCommitContext::new(ctx, &repo, vec![changesets["d"]])
100+
// Not detected due to the directory constraint.
101+
.add_file("another/path/basename2", "ddeeff\n")
102+
.set_author_date(DateTime::from_timestamp(1000, 0)?)
103+
.commit()
104+
.await?,
105+
);
106+
74107
Ok((repo, changesets))
75108
}
76109

@@ -126,6 +159,44 @@ async fn derive_single_test(fb: FacebookInit) -> Result<()> {
126159
)
127160
.await?;
128161

162+
assert_entries(
163+
&ctx,
164+
&repo,
165+
repo_ctx.changeset(changesets["e"]).await?.unwrap().id(),
166+
&[
167+
(
168+
MPath::new("another/path/basename2")?,
169+
InferredCopyFromEntry {
170+
from_csid: changesets["d"],
171+
from_path: MPath::new("path/to/basename2")?,
172+
},
173+
),
174+
(
175+
MPath::new("path/basename1")?,
176+
InferredCopyFromEntry {
177+
from_csid: changesets["d"],
178+
from_path: MPath::new("path/to/basename1")?,
179+
},
180+
),
181+
(
182+
MPath::new("path/basename2")?,
183+
InferredCopyFromEntry {
184+
from_csid: changesets["d"],
185+
from_path: MPath::new("path/to/basename2")?,
186+
},
187+
),
188+
],
189+
)
190+
.await?;
191+
192+
assert_entries(
193+
&ctx,
194+
&repo,
195+
repo_ctx.changeset(changesets["f"]).await?.unwrap().id(),
196+
&[],
197+
)
198+
.await?;
199+
129200
Ok(())
130201
}
131202

0 commit comments

Comments
 (0)