Skip to content
This repository was archived by the owner on Jan 12, 2024. It is now read-only.

Commit 9557076

Browse files
committed
Make de-duplication more robust, add tests
Consider easy scenarios like assuming same package path and source full path, same package path, different source file but same length and file size (i.e. same source file added as Content and copied to each project output and added from there). But also add de-duplication of files where even if the last write is different, they still have the same contents, which could be the case for generated files whose source is shared across projects (i.e. in a shared project, or generated via MSBuild, etc.). This covers all de-duplication scenarios we could think of, and only reports the true duplicates that can't be disambiguated by any of the above means, and are truly authoring errors.
1 parent 2ca03ac commit 9557076

8 files changed

Lines changed: 173 additions & 38 deletions

File tree

src/Build/NuGet.Build.Packaging.Tasks/CreatePackage.cs

Lines changed: 76 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
using NuGet.Packaging;
1111
using System.Collections.Generic;
1212
using static NuGet.Build.Packaging.Properties.Strings;
13+
using System.Security.Cryptography;
1314

1415
namespace NuGet.Build.Packaging.Tasks
1516
{
@@ -40,7 +41,7 @@ public override bool Execute()
4041

4142
OutputPackage = new TaskItem(TargetPath);
4243
Manifest.CopyMetadataTo(OutputPackage);
43-
44+
4445
return !Log.HasLoggedErrors;
4546
}
4647
catch (Exception ex)
@@ -106,7 +107,7 @@ public Manifest CreateManifest()
106107
void AddDependencies(Manifest manifest)
107108
{
108109
var dependencies = from item in Contents
109-
where item.GetMetadata(MetadataName.Kind) == PackageItemKind.Dependency &&
110+
where item.GetMetadata(MetadataName.Kind) == PackageItemKind.Dependency &&
110111
!"all".Equals(item.GetMetadata(MetadataName.PrivateAssets), StringComparison.OrdinalIgnoreCase)
111112
select new Dependency
112113
{
@@ -145,22 +146,75 @@ select item.GetNuGetTargetFramework()))
145146

146147
void AddFiles(Manifest manifest)
147148
{
148-
// Remove duplicate files
149-
var contents = Contents
150-
.Where(item => !string.IsNullOrEmpty(item.GetMetadata(MetadataName.PackagePath)))
151-
.GroupBy(item => item.ItemSpec)
152-
.Select(x => x.First()).ToArray();
149+
var contents = new List<ITaskItem>();
153150

154-
//Find files with conflicting package path
155-
var duplicates = contents.GroupBy(item => item.GetMetadata(MetadataName.PackagePath))
156-
.Where(x => x.Count() > 1)
157-
.Select(x => x.Key);
158-
159-
foreach (var duplicate in duplicates)
151+
var groupedByPackagePath = Contents
152+
.Where(item => !string.IsNullOrEmpty(item.GetMetadata(MetadataName.PackagePath)))
153+
.GroupBy(item => item.GetMetadata(MetadataName.PackagePath))
154+
// Iterate only once for this grouping.
155+
.ToDictionary(item => item.Key, item => item.ToArray());
156+
157+
// Add the ones we already determined as unique by package path.
158+
contents.AddRange(groupedByPackagePath
159+
.Where(group => group.Value.Length == 1)
160+
.Select(group => group.Value.First()));
161+
162+
var groupedByLastWriteAndLength = groupedByPackagePath
163+
.Where(group => group.Value.Length > 1)
164+
.SelectMany(group => group.Value)
165+
// Tuple provides structural comparison and hashing already, so leverage that.
166+
.GroupBy(item => Tuple.Create(
167+
item.GetMetadata(MetadataName.PackagePath),
168+
item.GetMetadata("Filename"),
169+
item.GetMetadata("Extension"),
170+
File.GetLastWriteTime(item.GetMetadata("FullPath")),
171+
new FileInfo(item.GetMetadata("FullPath")).Length))
172+
.ToDictionary(item => item.Key, item => item.ToArray());
173+
174+
// Add the ones we already determined to be duplicates that can safely be
175+
// unified by package path, file name, last write time and file length.
176+
contents.AddRange(groupedByLastWriteAndLength
177+
.Where(group => group.Value.Length > 1)
178+
.Select(group => group.Value.First()));
179+
180+
var md5 = new Lazy<HashAlgorithm>(() => MD5.Create());
181+
string hash(ITaskItem item)
160182
{
161-
Log.LogErrorCode(nameof(ErrorCode.NG0012), ErrorCode.NG0012(duplicate));
183+
using (var file = File.OpenRead(item.GetMetadata("FullPath")))
184+
{
185+
return string.Concat(md5.Value.ComputeHash(file).Select(x => x.ToString("x2")));
186+
}
162187
}
163188

189+
// Last remaining attempt at de-duplication is costly, but by now, we should
190+
// have successfully removed all obvious cases.
191+
// This deals with case where the files are modified at different times
192+
// (maybe a generated file?) but their actual contents are the same.
193+
var groupedByContentHash = groupedByLastWriteAndLength
194+
.Where(group => group.Value.Length == 1)
195+
.SelectMany(group => group.Value)
196+
.GroupBy(item => Tuple.Create(
197+
item.GetMetadata(MetadataName.PackagePath),
198+
hash(item)))
199+
.ToDictionary(item => item.Key, item => item.ToArray());
200+
201+
// Add the ones we determined to be duplicates that can safely be
202+
// unified by package path and MD5 hash
203+
contents.AddRange(groupedByContentHash
204+
.Where(group => group.Value.Length > 1)
205+
.Select(group => group.Value.First()));
206+
207+
// At this point, we're 100% certain these are duplicate package path
208+
// files that have distinct sources and would result in one overwriting
209+
// the other or an invalid package.
210+
var duplicates = string.Join(Environment.NewLine, groupedByContentHash
211+
.Where(group => group.Value.Length == 1)
212+
.SelectMany(group => group.Value)
213+
.Select(item => $"'{item.GetMetadata("FullPath")}' > '{item.GetMetadata(MetadataName.PackagePath)}'"));
214+
215+
if (duplicates.Length > 0)
216+
Log.LogErrorCode(nameof(ErrorCode.NG0012), ErrorCode.NG0012(duplicates));
217+
164218
// All files need to be added so they are included in the nupkg
165219
manifest.Files.AddRange(contents
166220
.Select(item => new ManifestFile
@@ -184,12 +238,12 @@ void AddFiles(Manifest manifest)
184238
void AddFrameworkAssemblies(Manifest manifest)
185239
{
186240
var frameworkReferences = (from item in Contents
187-
where item.GetMetadata(MetadataName.Kind) == PackageItemKind.FrameworkReference
188-
select new FrameworkAssemblyReference
189-
(
190-
item.ItemSpec,
191-
new[] { NuGetFramework.Parse(item.GetTargetFrameworkMoniker().FullName) }
192-
)).Distinct(FrameworkAssemblyReferenceComparer.Default);
241+
where item.GetMetadata(MetadataName.Kind) == PackageItemKind.FrameworkReference
242+
select new FrameworkAssemblyReference
243+
(
244+
item.ItemSpec,
245+
new[] { NuGetFramework.Parse(item.GetTargetFrameworkMoniker().FullName) }
246+
)).Distinct(FrameworkAssemblyReferenceComparer.Default);
193247

194248
manifest.Metadata.FrameworkReferences = frameworkReferences;
195249
}
@@ -203,9 +257,9 @@ void BuildPackage(Stream output)
203257
// We don't use PopulateFiles because that performs search expansion, base path
204258
// extraction and the like, which messes with our determined files to include.
205259
// TBD: do we support wilcard-based include/exclude?
206-
builder.Files.AddRange(manifest.Files.Select(file =>
260+
builder.Files.AddRange(manifest.Files.Select(file =>
207261
new PhysicalPackageFile { SourcePath = file.Source, TargetPath = file.Target }));
208-
262+
209263
builder.Save(output);
210264

211265
if (!string.IsNullOrEmpty(NuspecFile))

src/Build/NuGet.Build.Packaging.Tasks/Properties/Resources.Designer.cs

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/Build/NuGet.Build.Packaging.Tasks/Properties/Resources.resx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@
124124
<value>Project references to include in package must be nugetized.</value>
125125
</data>
126126
<data name="ErrorCode_NG0012" xml:space="preserve">
127-
<value>Duplicate package source files detected for target package file path '{0}'.</value>
127+
<value>Duplicate package source files with distinct content detected. Duplicates are not allowed in the package. Please remove the conflict between these files:
128+
{0}</value>
128129
</data>
129130
<data name="ErrorCode_NG0013" xml:space="preserve">
130131
<value>Content files cannot specify the reserved 'contentFiles' relative directory. Please use the 'CodeLanguage' and 'TargetFramework' item metadata for '{0}' to control its relative path within 'contentFiles'.</value>

src/Build/NuGet.Build.Packaging.Tests/Scenarios/given_a_library_with_non_nugetized_reference/b/project.lock.json

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"version": 2,
2+
"version": 3,
33
"targets": {
44
".NETFramework,Version=v4.6": {
55
"Microsoft.CodeAnalysis/1.3.2": {
@@ -3662,10 +3662,18 @@
36623662
"projectName": "b",
36633663
"projectPath": "C:\\Code\\Xamarin\\NuGet.Build.Packaging\\src\\Build\\NuGet.Build.Packaging.Tests\\Scenarios\\given_a_library_with_non_nugetized_reference\\b\\b.csproj",
36643664
"projectJsonPath": "C:\\Code\\Xamarin\\NuGet.Build.Packaging\\src\\Build\\NuGet.Build.Packaging.Tests\\Scenarios\\given_a_library_with_non_nugetized_reference\\b\\project.json",
3665+
"packagesPath": "C:\\Users\\kzu\\.nuget\\packages\\",
3666+
"outputPath": "C:\\Code\\Xamarin\\NuGet.Build.Packaging\\src\\Build\\NuGet.Build.Packaging.Tests\\Scenarios\\given_a_library_with_non_nugetized_reference\\b\\obj\\",
36653667
"projectStyle": "ProjectJson",
36663668
"validateRuntimeAssets": true,
3667-
"files": {
3668-
"lib/net46/b.dll": "C:\\Code\\Xamarin\\NuGet.Build.Packaging\\src\\Build\\NuGet.Build.Packaging.Tests\\Scenarios\\given_a_library_with_non_nugetized_reference\\b\\bin\\placeholder\\net46\\b.dll"
3669+
"configFilePaths": [
3670+
"C:\\Code\\Xamarin\\NuGet.Build.Packaging\\src\\NuGet.Config",
3671+
"C:\\Users\\kzu\\AppData\\Roaming\\NuGet\\NuGet.Config",
3672+
"C:\\Program Files (x86)\\NuGet\\Config\\Microsoft.VisualStudio.Offline.config"
3673+
],
3674+
"sources": {
3675+
"https://api.nuget.org/v3/index.json": {},
3676+
"https://dotnet.myget.org/F/nuget-volatile/api/v3/index.json": {}
36693677
},
36703678
"frameworks": {
36713679
"net46": {

src/Build/NuGet.Build.Packaging.Tests/Scenarios/given_a_packaging_project_with_netstandard/b/b.csproj

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@
1919
</PackageReference>
2020
</ItemGroup>
2121

22-
<Import Project="$(NuGetTargetsPath)\NuGet.Build.Packaging.targets" Condition="Exists('$(NuGetTargetsPath)\NuGet.Build.Packaging.targets')" />
22+
<Import Project="$(NuGetTargetsPath)\buildMultiTargeting\NuGet.Build.Packaging.targets" Condition="Exists('$(NuGetTargetsPath)\buildMultiTargeting\NuGet.Build.Packaging.targets')" />
23+
<Import Project="$(NuGetTargetsPath)\NuGet.Build.Packaging.targets" Condition="Exists('$(NuGetTargetsPath)\NuGet.Build.Packaging.targets')" />
2324
</Project>

src/Build/NuGet.Build.Packaging.Tests/Scenarios/given_duplicate_package_files/a.csproj

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
<PropertyGroup>
55
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
66
<PackageId>A</PackageId>
7+
<ReadmeA Condition="'$(ReadmeA)' == ''">This is a readme</ReadmeA>
78
</PropertyGroup>
89
<ItemGroup>
910
<ProjectReference Include="b.csproj">
@@ -18,6 +19,20 @@
1819
<PackageFile Include="content\content.txt">
1920
<PackagePath>content\content.txt</PackagePath>
2021
</PackageFile>
21-
</ItemGroup>
22+
<PackageFile Include="content\content.txt">
23+
<PackagePath>content\a\1\content.txt</PackagePath>
24+
</PackageFile>
25+
<PackageFile Include="content\content.txt">
26+
<PackagePath>content\a\2\content.txt</PackagePath>
27+
</PackageFile>
28+
</ItemGroup>
29+
<Target Name="GenerateReadme" BeforeTargets="GetPackageContents">
30+
<WriteLinesToFile File="$(OutputPath)\Readme.txt" Lines="$(ReadmeA)" Overwrite="true" />
31+
<ItemGroup>
32+
<PackageFile Include="$(OutputPath)\Readme.txt">
33+
<PackagePath>Readme.txt</PackagePath>
34+
</PackageFile>
35+
</ItemGroup>
36+
</Target>
2237
<Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), Scenario.targets))\Scenario.targets" />
2338
</Project>

src/Build/NuGet.Build.Packaging.Tests/Scenarios/given_duplicate_package_files/b.csproj

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
<PropertyGroup>
55
<ProjectGuid>$(GuidB)</ProjectGuid>
66
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
7+
<ReadmeB Condition="'$(ReadmeB)' == ''">This is a readme</ReadmeB>
78
</PropertyGroup>
89
<ItemGroup>
910
<Content Include="content\web\**\*.*">
@@ -12,6 +13,20 @@
1213
<PackageFile Include="content\content.txt">
1314
<PackagePath>content\content.txt</PackagePath>
1415
</PackageFile>
15-
</ItemGroup>
16-
<Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), Scenario.targets))\Scenario.targets" />
16+
<PackageFile Include="content\content.txt">
17+
<PackagePath>content\b\1\content.txt</PackagePath>
18+
</PackageFile>
19+
<PackageFile Include="content\content.txt">
20+
<PackagePath>content\b\2\content.txt</PackagePath>
21+
</PackageFile>
22+
</ItemGroup>
23+
<Target Name="GenerateReadme" BeforeTargets="GetPackageContents">
24+
<WriteLinesToFile File="$(OutputPath)\Readme.txt" Lines="$(ReadmeB)" Overwrite="true" />
25+
<ItemGroup>
26+
<PackageFile Include="$(OutputPath)\Readme.txt">
27+
<PackagePath>Readme.txt</PackagePath>
28+
</PackageFile>
29+
</ItemGroup>
30+
</Target>
31+
<Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), Scenario.targets))\Scenario.targets" />
1732
</Project>

src/Build/NuGet.Build.Packaging.Tests/given_duplicate_package_files.cs

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,57 @@ public given_duplicate_package_files(ITestOutputHelper output)
1515
}
1616

1717
[Fact]
18-
public void build_fails()
18+
public void exact_duplicates_are_removed()
1919
{
2020
var result = Builder.BuildScenario(nameof(given_duplicate_package_files), target: "Pack");
2121

22-
Assert.Equal(TargetResultCode.Failure, result.ResultCode);
22+
Assert.Equal(TargetResultCode.Success, result.ResultCode);
23+
}
2324

24-
// content.txt is marked as PackageFile and the path to the file is exacly the same in both project a and b.
25-
Assert.False(result.Logger.Errors.Any(e => e.Message.Contains("content.txt")));
25+
[Fact]
26+
public void same_source_different_target_are_preserved()
27+
{
28+
var result = Builder.BuildScenario(nameof(given_duplicate_package_files), target: "GetPackageContents");
29+
Assert.Equal(TargetResultCode.Success, result.ResultCode);
30+
31+
Assert.Contains(result.Items, item => item.Matches(new
32+
{
33+
Kind = PackageItemKind.None,
34+
PackagePath = @"content\a\1\content.txt"
35+
}));
36+
Assert.Contains(result.Items, item => item.Matches(new
37+
{
38+
Kind = PackageItemKind.None,
39+
PackagePath = @"content\a\2\content.txt"
40+
}));
41+
Assert.Contains(result.Items, item => item.Matches(new
42+
{
43+
Kind = PackageItemKind.None,
44+
PackagePath = @"content\b\1\content.txt"
45+
}));
46+
Assert.Contains(result.Items, item => item.Matches(new
47+
{
48+
Kind = PackageItemKind.None,
49+
PackagePath = @"content\b\2\content.txt"
50+
}));
51+
52+
result = Builder.BuildScenario(nameof(given_duplicate_package_files), target: "Pack");
53+
Assert.Equal(TargetResultCode.Success, result.ResultCode);
54+
}
55+
56+
57+
[Fact]
58+
public void real_duplicates_fail()
59+
{
60+
var result = Builder.BuildScenario(nameof(given_duplicate_package_files),
61+
new { ReadmeA = "First Readme", ReadmeB = "Second Readme" },
62+
target: "Pack");
63+
64+
Assert.Equal(TargetResultCode.Failure, result.ResultCode);
2665

27-
// nuget.js is marked as content and the path to the file is "bin\{projectname}\content\web\js\nuget.js" so to the package they are two conflicting files.
28-
Assert.True(result.Logger.Errors.Any(e => e.Message.Contains("nuget.js")));
66+
// A distinct Readme.txt is generated by a.csproj and b.csproj but both have the
67+
// same PackagePath so we can't disambiguate after hashing the contents.
68+
Assert.True(result.Logger.Errors.Any(e => e.Message.Contains("Readme.txt")));
2969
}
3070
}
31-
}
71+
}

0 commit comments

Comments
 (0)