Skip to content

Commit 5882f3e

Browse files
authored
Use new TFM logic for package ingestion in backfill command (#8431)
New TFM logic for package ingestion and backfill command
1 parent 21348d6 commit 5882f3e

10 files changed

Lines changed: 233 additions & 159 deletions

File tree

src/GalleryTools/Commands/BackfillCommand.cs

Lines changed: 105 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,25 @@
55
using CsvHelper;
66
using CsvHelper.Configuration;
77
using CsvHelper.TypeConversion;
8+
using Knapcode.MiniZip;
89
using Microsoft.Extensions.CommandLineUtils;
910
using NuGet.Packaging;
1011
using NuGet.Services.Entities;
1112
using NuGetGallery;
12-
using NuGetGallery.Configuration;
1313
using System;
14+
using System.Collections.Generic;
1415
using System.Data.Entity;
16+
using System.Data.SqlClient;
1517
using System.IO;
1618
using System.Linq;
19+
using System.Linq.Expressions;
1720
using System.Net.Http;
1821
using System.Text;
1922
using System.Threading.Tasks;
2023
using System.Xml;
2124
using System.Xml.Linq;
2225
using GalleryTools.Utils;
26+
using NuGet.Services.Sql;
2327

2428
namespace GalleryTools.Commands
2529
{
@@ -35,6 +39,15 @@ public abstract class BackfillCommand<TMetadata>
3539

3640
protected virtual int UpdateBatchSize => 100;
3741

42+
protected virtual int LimitTo => 0;
43+
44+
protected virtual MetadataSourceType SourceType => MetadataSourceType.NuspecOnly;
45+
46+
protected virtual Expression<Func<Package, object>> QueryIncludes => null;
47+
48+
49+
protected IPackageService _packageService;
50+
3851
public static void Configure<TCommand>(CommandLineApplication config) where TCommand : BackfillCommand<TMetadata>, new()
3952
{
4053
config.Description = "Backfill metadata for packages in the gallery";
@@ -53,10 +66,12 @@ public abstract class BackfillCommand<TMetadata>
5366
builder.RegisterAssemblyModules(typeof(DefaultDependenciesModule).Assembly);
5467
var container = builder.Build();
5568

56-
var connectionString = container.Resolve<IAppConfiguration>().SqlConnectionString;
69+
var sqlConnectionFactory = container.Resolve<ISqlConnectionFactory>();
70+
var sqlConnection = await sqlConnectionFactory.CreateAsync();
5771
var serviceDiscoveryUriValue = new Uri(serviceDiscoveryUri.Value());
5872

5973
var command = new TCommand();
74+
command._packageService = container.Resolve<IPackageService>();
6075

6176
var metadataFileName = fileName.HasValue() ? fileName.Value() : command.MetadataFileName;
6277

@@ -75,54 +90,90 @@ public abstract class BackfillCommand<TMetadata>
7590
}
7691
}
7792

78-
await command.Collect(connectionString, serviceDiscoveryUriValue, lastCreateTime, metadataFileName);
93+
await command.Collect(sqlConnection, serviceDiscoveryUriValue, lastCreateTime, metadataFileName);
7994
}
8095

8196
if (updateDB.HasValue())
8297
{
83-
await command.Update(connectionString, metadataFileName);
98+
await command.Update(sqlConnection, metadataFileName);
8499
}
85100

86101
return 0;
87102
});
88103
}
89104

90-
public async Task Collect(string connectionString, Uri serviceDiscoveryUri, DateTime? lastCreateTime, string fileName)
105+
public async Task Collect(SqlConnection connection, Uri serviceDiscoveryUri, DateTime? lastCreateTime, string fileName)
91106
{
92-
using (var context = new EntitiesContext(connectionString, readOnly: true))
107+
using (var context = new EntitiesContext(connection, readOnly: true))
93108
using (var cursor = new FileCursor(CursorFileName))
94109
using (var logger = new Logger(ErrorsFileName))
95110
{
111+
context.SetCommandTimeout(300); // large query
112+
96113
var startTime = await cursor.Read();
97114

98115
logger.Log($"Starting metadata collection - Cursor time: {startTime:u}");
99116

100117
var repository = new EntityRepository<Package>(context);
101118

102119
var packages = repository.GetAll()
103-
.Include(p => p.PackageRegistration)
120+
.Include(p => p.PackageRegistration);
121+
if (QueryIncludes != null)
122+
{
123+
packages = packages.Include(QueryIncludes);
124+
}
125+
126+
packages = packages
104127
.Where(p => p.Created < lastCreateTime && p.Created > startTime)
105-
.Where(p => p.PackageStatusKey == PackageStatus.Available || p.PackageStatusKey == PackageStatus.Validating)
106-
.OrderBy(p => p.Created);
128+
.OrderBy(p => p.PackageRegistration.Id);
129+
if (LimitTo > 0)
130+
{
131+
packages = packages.Take(LimitTo);
132+
}
107133

108134
var flatContainerUri = await GetFlatContainerUri(serviceDiscoveryUri);
109135

110136
using (var csv = CreateCsvWriter(fileName))
111137
using (var http = new HttpClient())
112138
{
139+
// We want these downloads ignored by stats pipelines - this user agent is automatically skipped.
140+
// See https://github.com/NuGet/NuGet.Jobs/blob/262da48ed05d0366613bbf1c54f47879aad96dcd/src/Stats.ImportAzureCdnStatistics/StatisticsParser.cs#L41
141+
http.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent",
142+
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; AppInsights) Backfill Job: NuGet.Gallery GalleryTools");
143+
113144
var counter = 0;
114145
var lastCreatedDate = default(DateTime?);
115146

116147
foreach (var package in packages)
117148
{
118149
var id = package.PackageRegistration.Id;
119150
var version = package.NormalizedVersion;
120-
121-
var nuspecUri = $"{flatContainerUri}/{id.ToLowerInvariant()}/{version.ToLowerInvariant()}/{id.ToLowerInvariant()}.nuspec";
151+
var idLowered = id.ToLowerInvariant();
152+
var versionLowered = version.ToLowerInvariant();
122153

123154
try
124155
{
125-
var metadata = await FetchMetadata(http, nuspecUri);
156+
var metadata = default(TMetadata);
157+
158+
var nuspecUri =
159+
$"{flatContainerUri}/{idLowered}/{versionLowered}/{idLowered}.nuspec";
160+
using (var nuspecStream = await http.GetStreamAsync(nuspecUri))
161+
{
162+
var document = LoadDocument(nuspecStream);
163+
164+
var nuspecReader = new NuspecReader(document);
165+
166+
if (SourceType == MetadataSourceType.NuspecOnly)
167+
{
168+
metadata = ReadMetadata(nuspecReader);
169+
}
170+
else if (SourceType == MetadataSourceType.Nupkg)
171+
{
172+
var nupkgUri =
173+
$"{flatContainerUri}/{idLowered}/{versionLowered}/{idLowered}.{versionLowered}.nupkg";
174+
metadata = await FetchMetadataAsync(http, nupkgUri, nuspecReader, id, version, logger);
175+
}
176+
}
126177

127178
if (ShouldWriteMetadata(metadata))
128179
{
@@ -132,7 +183,7 @@ public async Task Collect(string connectionString, Uri serviceDiscoveryUri, Date
132183

133184
await csv.NextRecordAsync();
134185

135-
logger.LogPackage(id, version, "Metadata saved.");
186+
logger.LogPackage(id, version, $"Metadata saved");
136187
}
137188
}
138189
catch (Exception e)
@@ -163,14 +214,14 @@ public async Task Collect(string connectionString, Uri serviceDiscoveryUri, Date
163214
}
164215
}
165216

166-
public async Task Update(string connectionString, string fileName)
217+
public async Task Update(SqlConnection connection, string fileName)
167218
{
168219
if (!File.Exists(fileName))
169220
{
170221
throw new ArgumentException($"File '{fileName}' doesn't exist");
171222
}
172223

173-
using (var context = new EntitiesContext(connectionString, readOnly: false))
224+
using (var context = new EntitiesContext(connection, readOnly: false))
174225
using (var cursor = new FileCursor(CursorFileName))
175226
using (var logger = new Logger(ErrorsFileName))
176227
{
@@ -199,9 +250,7 @@ public async Task Update(string connectionString, string fileName)
199250

200251
if (package != null)
201252
{
202-
203-
UpdatePackage(package, metadata.Metadata);
204-
253+
UpdatePackage(package, metadata.Metadata, context);
205254
logger.LogPackage(metadata.Id, metadata.Version, "Metadata updated.");
206255

207256
counter++;
@@ -234,13 +283,15 @@ public async Task Update(string connectionString, string fileName)
234283
}
235284
}
236285

237-
protected abstract TMetadata ReadMetadata(NuspecReader reader);
286+
protected virtual TMetadata ReadMetadata(NuspecReader reader) => default;
287+
288+
protected virtual TMetadata ReadMetadata(IList<string> files, NuspecReader nuspecReader) => default;
238289

239290
protected abstract bool ShouldWriteMetadata(TMetadata metadata);
240291

241292
protected abstract void ConfigureClassMap(PackageMetadataClassMap map);
242293

243-
protected abstract void UpdatePackage(Package package, TMetadata metadata);
294+
protected abstract void UpdatePackage(Package package, TMetadata metadata, EntitiesContext context);
244295

245296
private static async Task<string> GetFlatContainerUri(Uri serviceDiscoveryUri)
246297
{
@@ -251,15 +302,26 @@ private static async Task<string> GetFlatContainerUri(Uri serviceDiscoveryUri)
251302
return result.First().AbsoluteUri.TrimEnd('/');
252303
}
253304

254-
private async Task<TMetadata> FetchMetadata(HttpClient httpClient, string nuspecUri)
305+
private async Task<TMetadata> FetchMetadataAsync(
306+
HttpClient httpClient, string nupkgUri, NuspecReader nuspecReader, string id, string version, Logger logger)
255307
{
256-
using (var nuspecStream = await httpClient.GetStreamAsync(nuspecUri))
257-
{
258-
var document = LoadDocument(nuspecStream);
259-
260-
var reader = new NuspecReader(document);
308+
var httpZipProvider = new HttpZipProvider(httpClient);
261309

262-
return ReadMetadata(reader);
310+
try
311+
{
312+
var zipDirectoryReader = await httpZipProvider.GetReaderAsync(new Uri(nupkgUri));
313+
var zipDirectory = await zipDirectoryReader.ReadAsync();
314+
var files = zipDirectory
315+
.Entries
316+
.Select(x => x.GetName())
317+
.ToList();
318+
319+
return ReadMetadata(files, nuspecReader);
320+
}
321+
catch (Exception e)
322+
{
323+
await logger.LogPackageError(id, version, e);
324+
return default;
263325
}
264326
}
265327

@@ -460,5 +522,21 @@ public void Dispose()
460522
Writer.Dispose();
461523
}
462524
}
525+
526+
/// <summary>
527+
/// This enum allows our logic to respond to a package's need for only a nuspec to determine metadata, or whether
528+
/// it needs access to the .nupkg for analysis of the package
529+
/// </summary>
530+
public enum MetadataSourceType
531+
{
532+
/// <summary>
533+
/// Just the nuspec will suffice for metadata extraction
534+
/// </summary>
535+
NuspecOnly,
536+
/// <summary>
537+
/// We need to dig deeper into the nupkg for the metadata
538+
/// </summary>
539+
Nupkg
540+
}
463541
}
464542
}

src/GalleryTools/Commands/BackfillDevelopmentDependencyMetadataCommand.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using Microsoft.Extensions.CommandLineUtils;
55
using NuGet.Packaging;
66
using NuGet.Services.Entities;
7+
using NuGetGallery;
78

89
namespace GalleryTools.Commands
910
{
@@ -32,7 +33,7 @@ protected override void ConfigureClassMap(PackageMetadataClassMap map)
3233
map.Map(x => x.Metadata).Index(3);
3334
}
3435

35-
protected override void UpdatePackage(Package package, bool metadata)
36+
protected override void UpdatePackage(Package package, bool metadata, EntitiesContext context)
3637
{
3738
package.DevelopmentDependency = metadata;
3839
}

src/GalleryTools/Commands/BackfillRepositoryMetadataCommand.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using NuGet.Packaging;
66
using NuGet.Packaging.Core;
77
using NuGet.Services.Entities;
8+
using NuGetGallery;
89

910
namespace GalleryTools.Commands
1011
{
@@ -38,7 +39,7 @@ protected override void ConfigureClassMap(PackageMetadataClassMap map)
3839
map.Map(x => x.Metadata.Commit).Index(6);
3940
}
4041

41-
protected override void UpdatePackage(Package package, RepositoryMetadata metadata)
42+
protected override void UpdatePackage(Package package, RepositoryMetadata metadata, EntitiesContext context)
4243
{
4344
package.RepositoryUrl = metadata.Url;
4445

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
// Copyright (c) .NET Foundation. All rights reserved.
2+
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
3+
4+
using System;
5+
using System.Collections.Generic;
6+
using System.Linq;
7+
using System.Linq.Expressions;
8+
using Microsoft.Extensions.CommandLineUtils;
9+
using NuGet.Packaging;
10+
using NuGet.Services.Entities;
11+
using NuGetGallery;
12+
13+
namespace GalleryTools.Commands
14+
{
15+
public sealed class BackfillTfmMetadataCommand : BackfillCommand<List<string>>
16+
{
17+
protected override string MetadataFileName => "tfmMetadata.txt";
18+
19+
protected override MetadataSourceType SourceType => MetadataSourceType.Nupkg;
20+
21+
protected override Expression<Func<Package, object>> QueryIncludes => p => p.SupportedFrameworks;
22+
23+
public static void Configure(CommandLineApplication config)
24+
{
25+
Configure<BackfillTfmMetadataCommand>(config);
26+
}
27+
28+
protected override List<string> ReadMetadata(IList<string> files, NuspecReader nuspecReader)
29+
{
30+
var supportedTFMs = new List<string>();
31+
var supportedFrameworks = _packageService.GetSupportedFrameworks(nuspecReader, files);
32+
foreach (var tfm in supportedFrameworks)
33+
{
34+
supportedTFMs.Add(tfm.GetShortFolderName());
35+
}
36+
37+
return supportedTFMs;
38+
}
39+
40+
protected override bool ShouldWriteMetadata(List<string> metadata) => true;
41+
42+
protected override void ConfigureClassMap(PackageMetadataClassMap map)
43+
{
44+
map.Map(x => x.Metadata).Index(3);
45+
}
46+
47+
protected override void UpdatePackage(Package package, List<string> metadata, EntitiesContext context)
48+
{
49+
var existingTFMs = package.SupportedFrameworks == null
50+
? Enumerable.Empty<string>()
51+
: package.SupportedFrameworks.Select(f => f.FrameworkName.GetShortFolderName()).OrderBy(f => f);
52+
53+
var newTFMs = metadata == null || metadata.Count == 0
54+
? Enumerable.Empty<string>()
55+
: metadata.OrderBy(f => f);
56+
57+
if (Enumerable.SequenceEqual(existingTFMs, newTFMs))
58+
{
59+
return; // nothing to change
60+
}
61+
62+
// Remove the existing frameworks from both the package and the context before adding the new ones; otherwise the old rows would be left unattached in the table.
63+
if (package.SupportedFrameworks != null)
64+
{
65+
foreach (var supportedFramework in package.SupportedFrameworks.ToList())
66+
{
67+
package.SupportedFrameworks.Remove(supportedFramework);
68+
context.PackageFrameworks.Remove(supportedFramework);
69+
}
70+
}
71+
72+
package.SupportedFrameworks = newTFMs.Select(f => new PackageFramework {Package = package, TargetFramework = f}).ToList();
73+
}
74+
}
75+
}

0 commit comments

Comments
 (0)