66 */
77
88use std:: collections:: HashMap ;
9+ use std:: collections:: HashSet ;
910
1011use anyhow:: Context ;
12+ use anyhow:: Error ;
1113use anyhow:: Result ;
14+ use basename_suffix_skeleton_manifest_v3:: RootBssmV3DirectoryId ;
1215use cloned:: cloned;
1316use context:: CoreContext ;
1417use derived_data_manager:: DerivationContext ;
1518use fsnodes:: RootFsnodeId ;
19+ use futures:: Stream ;
20+ use futures:: StreamExt ;
1621use futures:: future:: try_join_all;
1722use futures:: stream:: TryStreamExt ;
23+ use itertools:: EitherOrBoth ;
1824use manifest:: ManifestOps ;
1925use mononoke_types:: BonsaiChangeset ;
2026use mononoke_types:: ChangesetId ;
@@ -23,6 +29,9 @@ use mononoke_types::MPath;
2329use mononoke_types:: NonRootMPath ;
2430use mononoke_types:: inferred_copy_from:: InferredCopyFrom ;
2531use mononoke_types:: inferred_copy_from:: InferredCopyFromEntry ;
32+ use vec1:: Vec1 ;
33+
34+ const BASENAME_MATCH_MAX_CANDIDATES : usize = 10_000 ;
2635
2736// It's possible to have multiple source files that match,
2837// pick the one with the smallest path
@@ -59,6 +68,26 @@ async fn get_content_to_paths_from_changeset(
5968 Ok ( content_to_paths)
6069}
6170
71+ async fn get_matched_paths_by_basenames_from_changeset (
72+ ctx : & CoreContext ,
73+ derivation_ctx : & DerivationContext ,
74+ cs_id : ChangesetId ,
75+ basenames : Vec < String > ,
76+ path_prefixes : Vec < MPath > ,
77+ ) -> Result < impl Stream < Item = Result < MPath , Error > > > {
78+ derivation_ctx
79+ . fetch_dependency :: < RootBssmV3DirectoryId > ( ctx, cs_id)
80+ . await ?
81+ . find_files_filter_basenames (
82+ ctx,
83+ derivation_ctx. blobstore ( ) . clone ( ) ,
84+ path_prefixes,
85+ EitherOrBoth :: Left ( Vec1 :: try_from_vec ( basenames) ?) ,
86+ None ,
87+ )
88+ . await
89+ }
90+
6291// Find exact renames by comparing the content of deleted vs new/changed files
6392// in the current changeset. If they have the same content, the path pair is
6493// a rename.
@@ -123,15 +152,104 @@ async fn find_exact_renames(
123152 Ok ( renames)
124153}
125154
155+ // Infer copies by matching basenames between new/changed files in the
156+ // current changeset and other files in the same repo (with some constraints).
157+ // If the basenames match and the content are the same, the path pair is a copy.
158+ async fn find_basename_matched_copies (
159+ ctx : & CoreContext ,
160+ derivation_ctx : & DerivationContext ,
161+ bonsai : & BonsaiChangeset ,
162+ paths_to_ignore : & HashSet < MPath > ,
163+ ) -> Result < Vec < ( MPath , InferredCopyFromEntry ) > > {
164+ let mut content_to_paths = HashMap :: new ( ) ;
165+ let mut basenames = HashSet :: new ( ) ;
166+ let mut path_prefixes = HashSet :: new ( ) ;
167+ for ( path, file_change) in bonsai. simplified_file_changes ( ) {
168+ if !paths_to_ignore. contains ( path. into ( ) ) {
169+ if let Some ( fc) = file_change {
170+ content_to_paths
171+ . entry ( fc. content_id ( ) )
172+ . or_insert ( vec ! [ ] )
173+ . push ( path. clone ( ) ) ;
174+
175+ basenames. insert ( path. basename ( ) . to_string ( ) ) ;
176+ // Restrict search to any of the touched top-level directory
177+ if let Some ( path_prefix) = path. take_prefix_components ( 1 ) ? {
178+ path_prefixes. insert ( MPath :: from ( path_prefix) ) ;
179+ }
180+ }
181+ }
182+ }
183+ if basenames. is_empty ( ) {
184+ return Ok ( vec ! [ ] ) ;
185+ }
186+
187+ let basenames_vec = basenames. into_iter ( ) . collect :: < Vec < _ > > ( ) ;
188+ let path_prefixes_vec = path_prefixes. into_iter ( ) . collect :: < Vec < _ > > ( ) ;
189+ let mut content_to_matched_paths = HashMap :: new ( ) ;
190+
191+ for parent_cs_id in bonsai. parents ( ) {
192+ content_to_matched_paths. extend (
193+ get_matched_paths_by_basenames_from_changeset (
194+ ctx,
195+ derivation_ctx,
196+ parent_cs_id,
197+ basenames_vec. clone ( ) ,
198+ path_prefixes_vec. clone ( ) ,
199+ )
200+ . await ?
201+ . try_filter_map ( async move |path| Ok ( path. into_optional_non_root_path ( ) ) )
202+ . take ( BASENAME_MATCH_MAX_CANDIDATES )
203+ . try_chunks ( 100 )
204+ . try_fold ( HashMap :: new ( ) , |mut acc, paths| async move {
205+ let hashmap =
206+ get_content_to_paths_from_changeset ( ctx, derivation_ctx, parent_cs_id, paths)
207+ . await ;
208+ if let Ok ( hashmap) = hashmap {
209+ acc. extend ( hashmap. into_iter ( ) ) ;
210+ }
211+ Ok ( acc)
212+ } )
213+ . await ?
214+ . into_iter ( ) ,
215+ ) ;
216+ }
217+
218+ let mut copies = vec ! [ ] ;
219+ for ( content_id, paths) in content_to_paths {
220+ if let Some ( matched_paths) = content_to_matched_paths. get ( & content_id) {
221+ let from = pick_source_from_candidates ( matched_paths) . unwrap ( ) ;
222+ for path in paths {
223+ copies. push ( (
224+ MPath :: from ( path) ,
225+ InferredCopyFromEntry {
226+ from_csid : from. 0 ,
227+ from_path : from. 1 . clone ( ) ,
228+ } ,
229+ ) ) ;
230+ }
231+ }
232+ }
233+ Ok ( copies)
234+ }
235+
236+ // TODO: add more cases
237+ // Ref: https://github.com/git/git/blob/master/diffcore-rename.c
126238pub ( crate ) async fn derive_impl (
127239 ctx : & CoreContext ,
128240 derivation_ctx : & DerivationContext ,
129241 bonsai : & BonsaiChangeset ,
130242) -> Result < Option < InferredCopyFrom > > {
131- // TODO: add more cases
132- // Ref: https://github.com/git/git/blob/master/diffcore-rename.c
133- let entries = find_exact_renames ( ctx, derivation_ctx, bonsai) . await ?;
243+ let mut resolved_paths = HashSet :: new ( ) ;
244+
245+ let exact_renames = find_exact_renames ( ctx, derivation_ctx, bonsai) . await ?;
246+ resolved_paths. extend ( exact_renames. iter ( ) . map ( |( path, _) | path. clone ( ) ) ) ;
247+
248+ let basename_matched_copies =
249+ find_basename_matched_copies ( ctx, derivation_ctx, bonsai, & resolved_paths) . await ?;
250+ resolved_paths. extend ( basename_matched_copies. iter ( ) . map ( |( path, _) | path. clone ( ) ) ) ;
134251
252+ let entries = [ exact_renames, basename_matched_copies] . concat ( ) ;
135253 if entries. is_empty ( ) {
136254 Ok ( None )
137255 } else {
0 commit comments