55using CsvHelper ;
66using CsvHelper . Configuration ;
77using CsvHelper . TypeConversion ;
8+ using Knapcode . MiniZip ;
89using Microsoft . Extensions . CommandLineUtils ;
910using NuGet . Packaging ;
1011using NuGet . Services . Entities ;
1112using NuGetGallery ;
12- using NuGetGallery . Configuration ;
1313using System ;
14+ using System . Collections . Generic ;
1415using System . Data . Entity ;
16+ using System . Data . SqlClient ;
1517using System . IO ;
1618using System . Linq ;
19+ using System . Linq . Expressions ;
1720using System . Net . Http ;
1821using System . Text ;
1922using System . Threading . Tasks ;
2023using System . Xml ;
2124using System . Xml . Linq ;
2225using GalleryTools . Utils ;
26+ using NuGet . Services . Sql ;
2327
2428namespace GalleryTools . Commands
2529{
@@ -35,6 +39,15 @@ public abstract class BackfillCommand<TMetadata>
3539
3640 protected virtual int UpdateBatchSize => 100 ;
3741
42+ protected virtual int LimitTo => 0 ;
43+
44+ protected virtual MetadataSourceType SourceType => MetadataSourceType . NuspecOnly ;
45+
46+ protected virtual Expression < Func < Package , object > > QueryIncludes => null ;
47+
48+
49+ protected IPackageService _packageService ;
50+
3851 public static void Configure < TCommand > ( CommandLineApplication config ) where TCommand : BackfillCommand < TMetadata > , new ( )
3952 {
4053 config . Description = "Backfill metadata for packages in the gallery" ;
@@ -53,10 +66,12 @@ public abstract class BackfillCommand<TMetadata>
5366 builder . RegisterAssemblyModules ( typeof ( DefaultDependenciesModule ) . Assembly ) ;
5467 var container = builder . Build ( ) ;
5568
56- var connectionString = container . Resolve < IAppConfiguration > ( ) . SqlConnectionString ;
69+ var sqlConnectionFactory = container . Resolve < ISqlConnectionFactory > ( ) ;
70+ var sqlConnection = await sqlConnectionFactory . CreateAsync ( ) ;
5771 var serviceDiscoveryUriValue = new Uri ( serviceDiscoveryUri . Value ( ) ) ;
5872
5973 var command = new TCommand ( ) ;
74+ command . _packageService = container . Resolve < IPackageService > ( ) ;
6075
6176 var metadataFileName = fileName . HasValue ( ) ? fileName . Value ( ) : command . MetadataFileName ;
6277
@@ -75,54 +90,90 @@ public abstract class BackfillCommand<TMetadata>
7590 }
7691 }
7792
78- await command . Collect ( connectionString , serviceDiscoveryUriValue , lastCreateTime , metadataFileName ) ;
93+ await command . Collect ( sqlConnection , serviceDiscoveryUriValue , lastCreateTime , metadataFileName ) ;
7994 }
8095
8196 if ( updateDB . HasValue ( ) )
8297 {
83- await command . Update ( connectionString , metadataFileName ) ;
98+ await command . Update ( sqlConnection , metadataFileName ) ;
8499 }
85100
86101 return 0 ;
87102 } ) ;
88103 }
89104
90- public async Task Collect ( string connectionString , Uri serviceDiscoveryUri , DateTime ? lastCreateTime , string fileName )
105+ public async Task Collect ( SqlConnection connection , Uri serviceDiscoveryUri , DateTime ? lastCreateTime , string fileName )
91106 {
92- using ( var context = new EntitiesContext ( connectionString , readOnly : true ) )
107+ using ( var context = new EntitiesContext ( connection , readOnly : true ) )
93108 using ( var cursor = new FileCursor ( CursorFileName ) )
94109 using ( var logger = new Logger ( ErrorsFileName ) )
95110 {
111+ context . SetCommandTimeout ( 300 ) ; // large query
112+
96113 var startTime = await cursor . Read ( ) ;
97114
98115 logger . Log ( $ "Starting metadata collection - Cursor time: { startTime : u} ") ;
99116
100117 var repository = new EntityRepository < Package > ( context ) ;
101118
102119 var packages = repository . GetAll ( )
103- . Include ( p => p . PackageRegistration )
120+ . Include ( p => p . PackageRegistration ) ;
121+ if ( QueryIncludes != null )
122+ {
123+ packages = packages . Include ( QueryIncludes ) ;
124+ }
125+
126+ packages = packages
104127 . Where ( p => p . Created < lastCreateTime && p . Created > startTime )
105- . Where ( p => p . PackageStatusKey == PackageStatus . Available || p . PackageStatusKey == PackageStatus . Validating )
106- . OrderBy ( p => p . Created ) ;
128+ . OrderBy ( p => p . PackageRegistration . Id ) ;
129+ if ( LimitTo > 0 )
130+ {
131+ packages = packages . Take ( LimitTo ) ;
132+ }
107133
108134 var flatContainerUri = await GetFlatContainerUri ( serviceDiscoveryUri ) ;
109135
110136 using ( var csv = CreateCsvWriter ( fileName ) )
111137 using ( var http = new HttpClient ( ) )
112138 {
139+ // We want these downloads ignored by stats pipelines - this user agent is automatically skipped.
140+ // See https://github.com/NuGet/NuGet.Jobs/blob/262da48ed05d0366613bbf1c54f47879aad96dcd/src/Stats.ImportAzureCdnStatistics/StatisticsParser.cs#L41
141+ http . DefaultRequestHeaders . TryAddWithoutValidation ( "User-Agent" ,
142+ "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; AppInsights) Backfill Job: NuGet.Gallery GalleryTools" ) ;
143+
113144 var counter = 0 ;
114145 var lastCreatedDate = default ( DateTime ? ) ;
115146
116147 foreach ( var package in packages )
117148 {
118149 var id = package . PackageRegistration . Id ;
119150 var version = package . NormalizedVersion ;
120-
121- var nuspecUri = $ " { flatContainerUri } / { id . ToLowerInvariant ( ) } / { version . ToLowerInvariant ( ) } / { id . ToLowerInvariant ( ) } .nuspec" ;
151+ var idLowered = id . ToLowerInvariant ( ) ;
152+ var versionLowered = version . ToLowerInvariant ( ) ;
122153
123154 try
124155 {
125- var metadata = await FetchMetadata ( http , nuspecUri ) ;
156+ var metadata = default ( TMetadata ) ;
157+
158+ var nuspecUri =
159+ $ "{ flatContainerUri } /{ idLowered } /{ versionLowered } /{ idLowered } .nuspec";
160+ using ( var nuspecStream = await http . GetStreamAsync ( nuspecUri ) )
161+ {
162+ var document = LoadDocument ( nuspecStream ) ;
163+
164+ var nuspecReader = new NuspecReader ( document ) ;
165+
166+ if ( SourceType == MetadataSourceType . NuspecOnly )
167+ {
168+ metadata = ReadMetadata ( nuspecReader ) ;
169+ }
170+ else if ( SourceType == MetadataSourceType . Nupkg )
171+ {
172+ var nupkgUri =
173+ $ "{ flatContainerUri } /{ idLowered } /{ versionLowered } /{ idLowered } .{ versionLowered } .nupkg";
174+ metadata = await FetchMetadataAsync ( http , nupkgUri , nuspecReader , id , version , logger ) ;
175+ }
176+ }
126177
127178 if ( ShouldWriteMetadata ( metadata ) )
128179 {
@@ -132,7 +183,7 @@ public async Task Collect(string connectionString, Uri serviceDiscoveryUri, Date
132183
133184 await csv . NextRecordAsync ( ) ;
134185
135- logger . LogPackage ( id , version , "Metadata saved. " ) ;
186+ logger . LogPackage ( id , version , $ "Metadata saved") ;
136187 }
137188 }
138189 catch ( Exception e )
@@ -163,14 +214,14 @@ public async Task Collect(string connectionString, Uri serviceDiscoveryUri, Date
163214 }
164215 }
165216
166- public async Task Update ( string connectionString , string fileName )
217+ public async Task Update ( SqlConnection connection , string fileName )
167218 {
168219 if ( ! File . Exists ( fileName ) )
169220 {
170221 throw new ArgumentException ( $ "File '{ fileName } ' doesn't exist") ;
171222 }
172223
173- using ( var context = new EntitiesContext ( connectionString , readOnly : false ) )
224+ using ( var context = new EntitiesContext ( connection , readOnly : false ) )
174225 using ( var cursor = new FileCursor ( CursorFileName ) )
175226 using ( var logger = new Logger ( ErrorsFileName ) )
176227 {
@@ -199,9 +250,7 @@ public async Task Update(string connectionString, string fileName)
199250
200251 if ( package != null )
201252 {
202-
203- UpdatePackage ( package , metadata . Metadata ) ;
204-
253+ UpdatePackage ( package , metadata . Metadata , context ) ;
205254 logger . LogPackage ( metadata . Id , metadata . Version , "Metadata updated." ) ;
206255
207256 counter ++ ;
@@ -234,13 +283,15 @@ public async Task Update(string connectionString, string fileName)
234283 }
235284 }
236285
237- protected abstract TMetadata ReadMetadata ( NuspecReader reader ) ;
286+ protected virtual TMetadata ReadMetadata ( NuspecReader reader ) => default ;
287+
288+ protected virtual TMetadata ReadMetadata ( IList < string > files , NuspecReader nuspecReader ) => default ;
238289
239290 protected abstract bool ShouldWriteMetadata ( TMetadata metadata ) ;
240291
241292 protected abstract void ConfigureClassMap ( PackageMetadataClassMap map ) ;
242293
243- protected abstract void UpdatePackage ( Package package , TMetadata metadata ) ;
294+ protected abstract void UpdatePackage ( Package package , TMetadata metadata , EntitiesContext context ) ;
244295
245296 private static async Task < string > GetFlatContainerUri ( Uri serviceDiscoveryUri )
246297 {
@@ -251,15 +302,26 @@ private static async Task<string> GetFlatContainerUri(Uri serviceDiscoveryUri)
251302 return result . First ( ) . AbsoluteUri . TrimEnd ( '/' ) ;
252303 }
253304
254- private async Task < TMetadata > FetchMetadata ( HttpClient httpClient , string nuspecUri )
305+ private async Task < TMetadata > FetchMetadataAsync (
306+ HttpClient httpClient , string nupkgUri , NuspecReader nuspecReader , string id , string version , Logger logger )
255307 {
256- using ( var nuspecStream = await httpClient . GetStreamAsync ( nuspecUri ) )
257- {
258- var document = LoadDocument ( nuspecStream ) ;
259-
260- var reader = new NuspecReader ( document ) ;
308+ var httpZipProvider = new HttpZipProvider ( httpClient ) ;
261309
262- return ReadMetadata ( reader ) ;
310+ try
311+ {
312+ var zipDirectoryReader = await httpZipProvider . GetReaderAsync ( new Uri ( nupkgUri ) ) ;
313+ var zipDirectory = await zipDirectoryReader . ReadAsync ( ) ;
314+ var files = zipDirectory
315+ . Entries
316+ . Select ( x => x . GetName ( ) )
317+ . ToList ( ) ;
318+
319+ return ReadMetadata ( files , nuspecReader ) ;
320+ }
321+ catch ( Exception e )
322+ {
323+ await logger . LogPackageError ( id , version , e ) ;
324+ return default ;
263325 }
264326 }
265327
@@ -460,5 +522,21 @@ public void Dispose()
460522 Writer . Dispose ( ) ;
461523 }
462524 }
525+
526+ /// <summary>
527+ /// This enum allows our logic to respond to a package's need for only a nuspec to determine metadata, or whether
528+ /// it needs access to the .nupkg for analysis of the package
529+ /// </summary>
530+ public enum MetadataSourceType
531+ {
532+ /// <summary>
533+ /// Just the nuspec will suffice for metadata extraction
534+ /// </summary>
535+ NuspecOnly ,
536+ /// <summary>
537+ /// We need to dig deeper into the nupkg for the metadata
538+ /// </summary>
539+ Nupkg
540+ }
463541 }
464542}
0 commit comments