|
| 1 | +// Copyright (c) .NET Foundation. All rights reserved. |
| 2 | +// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information. |
| 3 | + |
| 4 | +using System; |
| 5 | +using System.Collections.Generic; |
| 6 | +using System.IO; |
| 7 | +using System.Linq; |
| 8 | +using System.Security.Cryptography; |
| 9 | +using Microsoft.Build.Framework; |
| 10 | + |
| 11 | +namespace NuGet.Services.Build |
| 12 | +{ |
/// <summary>
/// MSBuild task that partitions the existing files in <see cref="Files"/> into unique files and duplicates.
/// Two files are treated as duplicates when they have the same length and the same SHA-256 hash.
/// Candidates are narrowed progressively (file size, then a hash of the leading bytes, then a hash of the
/// whole file) so that the whole-file hash is computed only for files that still collide.
/// </summary>
public class FindDuplicateFiles : Microsoft.Build.Utilities.Task
{
    /// <summary>
    /// Number of leading bytes hashed to cheaply separate files that have the same size.
    /// </summary>
    private const int LeadingByteCount = 1024;

    /// <summary>
    /// The items to examine. Items whose "FullPath" metadata does not point at an existing file, or repeats a
    /// path already seen (ignoring case), are skipped with a warning.
    /// </summary>
    [Required]
    public ITaskItem[] Files { get; set; }

    /// <summary>
    /// One item per distinct file content: the first item encountered in each group of identical files.
    /// </summary>
    [Output]
    public ITaskItem[] UniqueFiles { get; set; }

    /// <summary>
    /// The remaining items of each group. Each one has its "DuplicateOf" metadata set to the full path of the
    /// group's item in <see cref="UniqueFiles"/>.
    /// </summary>
    [Output]
    public ITaskItem[] DuplicateFiles { get; set; }

    /// <summary>
    /// Runs the duplicate detection and populates <see cref="UniqueFiles"/> and <see cref="DuplicateFiles"/>.
    /// </summary>
    /// <returns>Always <c>true</c> when the method returns normally.</returns>
    public override bool Execute()
    {
        var infos = GetUniqueTaskItemInfo();
        Log.LogMessage(
            MessageImportance.High,
            "Of the {0} items provided, {1} are unique file paths that exist. {2} items will be ignored.",
            Files.Length,
            infos.Count,
            Files.Length - infos.Count);

        // Returning early also keeps the percentage computation below from dividing by zero.
        if (infos.Count == 0)
        {
            UniqueFiles = Array.Empty<ITaskItem>();
            DuplicateFiles = Array.Empty<ITaskItem>();
            return true;
        }

        var filePathToDuplicates = FindDuplicates(infos);

        var fileCount = filePathToDuplicates.Sum(x => x.Value.Count);
        var uniqueCount = filePathToDuplicates.Count;
        var duplicateCount = fileCount - uniqueCount;
        Log.LogMessage(
            MessageImportance.High,
            "Of the {0} unique file paths provided, {1} ({2:P}) are unique and {3} ({4:P}) are duplicate.",
            fileCount,
            uniqueCount,
            (float)uniqueCount / fileCount,
            duplicateCount,
            (float)duplicateCount / fileCount);

        var uniqueFiles = new List<ITaskItem>();
        var duplicateFiles = new List<ITaskItem>();
        foreach (var pair in filePathToDuplicates)
        {
            foreach (var duplicate in pair.Value)
            {
                // The dictionary key is the full path of the group's representative file.
                if (pair.Key == duplicate.FullPath)
                {
                    uniqueFiles.Add(duplicate.Item);
                }
                else
                {
                    duplicate.Item.SetMetadata("DuplicateOf", pair.Key);
                    duplicateFiles.Add(duplicate.Item);
                }
            }
        }

        UniqueFiles = uniqueFiles.ToArray();
        DuplicateFiles = duplicateFiles.ToArray();

        return true;
    }

    /// <summary>
    /// Builds one <see cref="TaskItemInfo"/> per input item that refers to an existing file, keeping only the
    /// first item for any given full path. Skipped items are reported as warnings.
    /// </summary>
    private List<TaskItemInfo> GetUniqueTaskItemInfo()
    {
        // Compare paths in a case insensitive manner.
        var uniqueFullPaths = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        var infos = new List<TaskItemInfo>();
        foreach (var item in Files)
        {
            var fullPath = item.GetMetadata("FullPath");
            if (string.IsNullOrEmpty(fullPath) || !File.Exists(fullPath))
            {
                Log.LogWarning("File '{0}' does not exist.", item.ItemSpec);
                continue;
            }

            if (!uniqueFullPaths.Add(fullPath))
            {
                Log.LogWarning("File path '{0}' is associated with multiple items. Only the first will be used.", item.ItemSpec);
                continue;
            }

            infos.Add(new TaskItemInfo(item, fullPath));
        }

        return infos;
    }

    /// <summary>
    /// Groups the provided files by content.
    /// </summary>
    /// <returns>
    /// A map from the full path of each group's first file to every file in that group (including the first).
    /// A file without duplicates maps to a single-element list.
    /// </returns>
    private Dictionary<string, List<TaskItemInfo>> FindDuplicates(List<TaskItemInfo> infos)
    {
        var fileSizeToInfos = GetFileSizeToInfos(infos);

        var filePathToDuplicates = new Dictionary<string, List<TaskItemInfo>>();

        // Inspired by https://stackoverflow.com/a/36113168.
        // The buffer is allocated once and reused for every leading-hash read.
        var buffer = new byte[LeadingByteCount];
        foreach (var fileSizePair in fileSizeToInfos)
        {
            // If the file size is unique, the file is unique.
            if (TryAddUnique("file size", filePathToDuplicates, fileSizePair))
            {
                continue;
            }

            BreakTiesWithLeadingHash(filePathToDuplicates, buffer, fileSizePair);
        }

        return filePathToDuplicates;
    }

    /// <summary>
    /// Records the length of every file on its <see cref="TaskItemInfo"/> and groups the files by that length.
    /// </summary>
    private Dictionary<long, List<TaskItemInfo>> GetFileSizeToInfos(
        IEnumerable<TaskItemInfo> infos)
    {
        var fileSizeToInfos = new Dictionary<long, List<TaskItemInfo>>();
        foreach (var info in infos)
        {
            info.FileSize = new FileInfo(info.FullPath).Length;
            AddToGroup(fileSizeToInfos, info.FileSize, info);
        }

        return fileSizeToInfos;
    }

    /// <summary>
    /// Splits a group of same-sized files by the hash of their leading bytes. Files whose leading hash is
    /// unique are recorded as unique; the rest are compared by whole-file hash.
    /// </summary>
    private void BreakTiesWithLeadingHash(
        Dictionary<string, List<TaskItemInfo>> filePathToDuplicates,
        byte[] buffer,
        KeyValuePair<long, List<TaskItemInfo>> fileSizePair)
    {
        var leadingHashToInfos = GetLeadingHashToInfos(fileSizePair.Value, buffer);

        foreach (var leadingHashPair in leadingHashToInfos)
        {
            // If the leading hash is unique, the file is unique.
            if (TryAddUnique("leading hash", filePathToDuplicates, leadingHashPair))
            {
                continue;
            }

            BreakTiesWithHash(filePathToDuplicates, leadingHashPair);
        }
    }

    /// <summary>
    /// Hashes up to <c>buffer.Length</c> leading bytes of every file, stores the result in
    /// <see cref="TaskItemInfo.HeaderHash"/>, and groups the files by that hash.
    /// </summary>
    /// <param name="infos">The files to hash.</param>
    /// <param name="buffer">Scratch buffer that receives the leading bytes; its contents are overwritten.</param>
    private static Dictionary<string, List<TaskItemInfo>> GetLeadingHashToInfos(
        IEnumerable<TaskItemInfo> infos,
        byte[] buffer)
    {
        var leadingHashToInfos = new Dictionary<string, List<TaskItemInfo>>();

        // A single hash algorithm instance serves every file in the group.
        using (var hashAlgorithm = SHA256.Create())
        {
            foreach (var info in infos)
            {
                var totalRead = ReadLeadingBytes(info.FullPath, buffer);

                // Only the bytes actually read are hashed; files shorter than the buffer are hashed whole.
                var hash = GetHashString(hashAlgorithm.ComputeHash(buffer, 0, totalRead));
                info.HeaderHash = hash;

                AddToGroup(leadingHashToInfos, hash, info);
            }
        }

        return leadingHashToInfos;
    }

    /// <summary>
    /// Fills <paramref name="buffer"/> from the start of the file at <paramref name="path"/>.
    /// </summary>
    /// <returns>The number of bytes read, which is less than the buffer length for shorter files.</returns>
    private static int ReadLeadingBytes(string path, byte[] buffer)
    {
        // Open read-only: the file is only hashed, and asking for write access fails on read-only files.
        using (var fileStream = File.OpenRead(path))
        {
            var totalRead = 0;
            while (totalRead < buffer.Length)
            {
                // Read may return fewer bytes than requested, so keep reading until full or end of file.
                var read = fileStream.Read(buffer, totalRead, buffer.Length - totalRead);
                if (read == 0)
                {
                    break;
                }

                totalRead += read;
            }

            return totalRead;
        }
    }

    /// <summary>
    /// Splits a group of files with the same size and leading hash by the hash of their entire content.
    /// Files sharing a whole-file hash are recorded as one group of duplicates, keyed by the first file's path.
    /// </summary>
    private void BreakTiesWithHash(
        Dictionary<string, List<TaskItemInfo>> filePathToDuplicates,
        KeyValuePair<string, List<TaskItemInfo>> leadingHashPair)
    {
        var hashToInfos = GetHashToInfos(leadingHashPair.Value);

        foreach (var hashPair in hashToInfos)
        {
            // If the hash is unique, the file is unique.
            if (TryAddUnique("hash", filePathToDuplicates, hashPair))
            {
                continue;
            }

            // If multiple files have the same hash, they are duplicates.
            filePathToDuplicates.Add(hashPair.Value[0].FullPath, hashPair.Value);

            Log.LogMessage(
                "File with {0} duplicates: {1}{2}{3}",
                hashPair.Value.Count - 1,
                hashPair.Value[0].FullPath,
                string.Concat(hashPair.Value.Skip(1).Select(x => Environment.NewLine + " - " + x.FullPath)),
                Environment.NewLine);
        }
    }

    /// <summary>
    /// Records the group's only file as unique when the group has exactly one member.
    /// </summary>
    /// <param name="keyName">Human-readable name of the grouping key, used in the log message.</param>
    /// <param name="filePathToDuplicates">The result map to add the unique file to.</param>
    /// <param name="pair">The grouping key and the files that share it.</param>
    /// <returns><c>true</c> if the group had a single file and it was recorded; otherwise <c>false</c>.</returns>
    private bool TryAddUnique<T>(
        string keyName,
        Dictionary<string, List<TaskItemInfo>> filePathToDuplicates,
        KeyValuePair<T, List<TaskItemInfo>> pair)
    {
        if (pair.Value.Count != 1)
        {
            return false;
        }

        var info = pair.Value[0];
        Log.LogMessage(
            "Unique file by {0} {1}: {2}{3}",
            keyName,
            pair.Key,
            info.FullPath,
            Environment.NewLine);

        filePathToDuplicates.Add(info.FullPath, new List<TaskItemInfo> { info });
        return true;
    }

    /// <summary>
    /// Hashes the entire content of every file, stores the result in <see cref="TaskItemInfo.Hash"/>, and
    /// groups the files by that hash.
    /// </summary>
    private static Dictionary<string, List<TaskItemInfo>> GetHashToInfos(
        IEnumerable<TaskItemInfo> infos)
    {
        var hashToInfos = new Dictionary<string, List<TaskItemInfo>>();

        // A single hash algorithm instance serves every file in the group.
        using (var hashAlgorithm = SHA256.Create())
        {
            foreach (var info in infos)
            {
                string hash;

                // Open read-only: the file is only hashed, and asking for write access fails on read-only files.
                using (var fileStream = File.OpenRead(info.FullPath))
                {
                    hash = GetHashString(hashAlgorithm.ComputeHash(fileStream));
                }

                info.Hash = hash;
                AddToGroup(hashToInfos, hash, info);
            }
        }

        return hashToInfos;
    }

    /// <summary>
    /// Appends <paramref name="info"/> to the list stored under <paramref name="key"/>, creating the list the
    /// first time the key is seen.
    /// </summary>
    private static void AddToGroup<TKey>(
        Dictionary<TKey, List<TaskItemInfo>> groups,
        TKey key,
        TaskItemInfo info)
    {
        List<TaskItemInfo> group;
        if (!groups.TryGetValue(key, out group))
        {
            group = new List<TaskItemInfo>();
            groups.Add(key, group);
        }

        group.Add(info);
    }

    /// <summary>
    /// Formats hash bytes as a lowercase hexadecimal string without separators.
    /// </summary>
    private static string GetHashString(byte[] hashBytes)
    {
        return BitConverter.ToString(hashBytes).Replace("-", string.Empty).ToLowerInvariant();
    }

    /// <summary>
    /// An input item together with the file facts gathered about it while searching for duplicates.
    /// </summary>
    private class TaskItemInfo
    {
        public TaskItemInfo(ITaskItem item, string fullPath)
        {
            Item = item;
            FullPath = fullPath;
        }

        /// <summary>The original task item.</summary>
        public ITaskItem Item { get; private set; }

        /// <summary>The value of the item's "FullPath" metadata.</summary>
        public string FullPath { get; private set; }

        /// <summary>The file length in bytes; set while grouping by file size.</summary>
        public long FileSize { get; set; }

        /// <summary>Hash of the file's leading bytes; set only if another file has the same size.</summary>
        public string HeaderHash { get; set; }

        /// <summary>Hash of the whole file; set only if another file has the same size and leading hash.</summary>
        public string Hash { get; set; }
    }
}
| 311 | +} |
0 commit comments