Skip to content
This repository was archived by the owner on Jul 30, 2024. It is now read-only.

Commit 8f331cb

Browse files
committed
Reboot search instances returning 500 and 503 (#661)
Address NuGet/Engineering#1864
1 parent 6c17549 commit 8f331cb

5 files changed

Lines changed: 116 additions & 1 deletion

File tree

src/Monitoring.RebootSearchInstance/SearchInstanceRebooter.cs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System;
55
using System.Diagnostics;
66
using System.Linq;
7+
using System.Net;
78
using System.Threading;
89
using System.Threading.Tasks;
910
using Microsoft.Extensions.Logging;
@@ -220,6 +221,20 @@ private async Task<InstanceHealth> DetermineInstanceHealthAsync(
220221
{
221222
commitDateTime = await _searchServiceClient.GetCommitDateTimeAsync(instance, token);
222223
}
224+
catch (HttpResponseException ex) when (ex.StatusCode == HttpStatusCode.ServiceUnavailable
225+
|| ex.StatusCode == HttpStatusCode.InternalServerError)
226+
{
227+
_logger.LogInformation(
228+
(EventId)0,
229+
ex,
230+
"The HTTP response when hitting {DiagUrl} was {StatusCode} {ReasonPhrase}. Considering this " +
231+
"instance as an unhealthy state.",
232+
instance.DiagUrl,
233+
(int)ex.StatusCode,
234+
ex.ReasonPhrase);
235+
236+
return InstanceHealth.Unhealthy;
237+
}
223238
catch (Exception ex)
224239
{
225240
_logger.LogInformation(
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// Copyright (c) .NET Foundation. All rights reserved.
2+
// Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
3+
4+
using System;
5+
using System.Net;
6+
7+
namespace NuGet.Jobs.Montoring.PackageLag
8+
{
9+
public class HttpResponseException : Exception
10+
{
11+
public HttpResponseException(HttpStatusCode statusCode, string reasonPhrase, string message)
12+
: base(message)
13+
{
14+
StatusCode = statusCode;
15+
ReasonPhrase = reasonPhrase;
16+
}
17+
18+
public HttpStatusCode StatusCode { get; }
19+
public string ReasonPhrase { get; }
20+
}
21+
}

src/PackageLagMonitor/Monitoring.PackageLag.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
</PropertyGroup>
4949
<ItemGroup>
5050
<Compile Include="AzureManagementAPIWrapperConfiguration.cs" />
51+
<Compile Include="HttpResponseException.cs" />
5152
<Compile Include="Instance.cs" />
5253
<Compile Include="ISearchServiceClient.cs" />
5354
<Compile Include="Job.cs" />

src/PackageLagMonitor/SearchServiceClient.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,15 @@ public async Task<DateTimeOffset> GetCommitDateTimeAsync(Instance instance, Canc
4646
HttpCompletionOption.ResponseContentRead,
4747
token))
4848
{
49+
if (!diagResponse.IsSuccessStatusCode)
50+
{
51+
throw new HttpResponseException(
52+
diagResponse.StatusCode,
53+
diagResponse.ReasonPhrase,
54+
$"The HTTP response when hitting {instance.DiagUrl} was {(int)diagResponse.StatusCode} " +
55+
$"{diagResponse.ReasonPhrase}, which is not successful.");
56+
}
57+
4958
var diagContent = diagResponse.Content;
5059
var searchDiagResultRaw = await diagContent.ReadAsStringAsync();
5160
var searchDiagResultObject = JsonConvert.DeserializeObject<SearchDiagnosticResponse>(searchDiagResultRaw);

tests/Monitoring.RebootSearchInstance.Tests/SearchInstanceRebooterFacts.cs

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System;
55
using System.Collections.Generic;
66
using System.IO;
7+
using System.Net;
78
using System.Threading;
89
using System.Threading.Tasks;
910
using Microsoft.Extensions.Logging;
@@ -169,7 +170,7 @@ public async Task RestartsFirstUnhealthyInstance()
169170
}
170171

171172
[Fact]
172-
public async Task TreatsExceptionWhenGettingCommitTimestampAsUnknown()
173+
public async Task TreatsUnknownExceptionWhenGettingCommitTimestampAsUnknown()
173174
{
174175
_searchServiceClient
175176
.SetupSequence(x => x.GetCommitDateTimeAsync(It.IsAny<Instance>(), It.IsAny<CancellationToken>()))
@@ -195,6 +196,74 @@ public async Task TreatsExceptionWhenGettingCommitTimestampAsUnknown()
195196
_telemetryService.Verify(x => x.TrackInstanceCount(_region, 3), Times.Once);
196197
}
197198

199+
// Verifies that an HttpResponseException whose status code is neither 500 nor 503 does not trigger a reboot
// and is reported through the "unknown" instance count.
[Theory]
[InlineData(HttpStatusCode.BadGateway)]
[InlineData(HttpStatusCode.NotFound)]
public async Task TreatsUnknownHttpStatusCodeExceptionWhenGettingCommitTimestampAsUnknown(HttpStatusCode statusCode)
{
    // Arrange: the first commit timestamp lookup fails with the status code under test and the next two
    // lookups succeed. The reason phrase is derived from the status code so that the exception is
    // self-consistent (previously it always read "Service Unavailable", even for 404 and 502).
    var failure = new HttpResponseException(statusCode, statusCode.ToString(), "Some problem.");
    _searchServiceClient
        .SetupSequence(x => x.GetCommitDateTimeAsync(It.IsAny<Instance>(), It.IsAny<CancellationToken>()))
        .ThrowsAsync(failure)
        .ReturnsAsync(DateTimeOffset.MaxValue)
        .ReturnsAsync(DateTimeOffset.MaxValue);

    // Act
    await _target.RunAsync(_token);

    // Assert: no reboot was requested for any instance.
    _azureManagementAPIWrapper.Verify(
        x => x.RebootCloudServiceRoleInstanceAsync(
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<CancellationToken>()),
        Times.Never);

    // Assert: the failing lookup is counted as unknown, the other two as healthy.
    _telemetryService.Verify(x => x.TrackHealthyInstanceCount(_region, 2), Times.Once);
    _telemetryService.Verify(x => x.TrackUnhealthyInstanceCount(_region, 0), Times.Once);
    _telemetryService.Verify(x => x.TrackUnknownInstanceCount(_region, 1), Times.Once);
    _telemetryService.Verify(x => x.TrackInstanceCount(_region, 3), Times.Once);
}
227+
228+
// Verifies that an HttpResponseException with status code 500 or 503 is reported through the "unhealthy"
// instance count and results in exactly one reboot request.
[Theory]
[InlineData(HttpStatusCode.InternalServerError)]
[InlineData(HttpStatusCode.ServiceUnavailable)]
public async Task TreatsSome500sHttpResponseExceptionAsUnhealthy(HttpStatusCode statusCode)
{
    // Arrange: the first commit timestamp lookup fails with the status code under test and the next two
    // lookups succeed. The reason phrase is derived from the status code so that the exception is
    // self-consistent (previously it read "Service Unavailable" for the 500 case as well).
    var failure = new HttpResponseException(statusCode, statusCode.ToString(), "Some problem.");
    _searchServiceClient
        .SetupSequence(x => x.GetCommitDateTimeAsync(It.IsAny<Instance>(), It.IsAny<CancellationToken>()))
        .ThrowsAsync(failure)
        .ReturnsAsync(DateTimeOffset.MaxValue)
        .ReturnsAsync(DateTimeOffset.MaxValue);

    // Act
    await _target.RunAsync(_token);

    // Assert: a reboot was requested once with the expected subscription, resource group, service name,
    // slot and role.
    _azureManagementAPIWrapper.Verify(
        x => x.RebootCloudServiceRoleInstanceAsync(
            _subscription,
            _resourceGroup,
            _serviceName,
            "Production",
            _role,
            It.IsAny<string>(),
            It.IsAny<CancellationToken>()),
        Times.Once);

    // Assert: that was the only reboot request overall, whatever its arguments.
    _azureManagementAPIWrapper.Verify(
        x => x.RebootCloudServiceRoleInstanceAsync(
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<CancellationToken>()),
        Times.Once);

    // Assert: the failing lookup is counted as unhealthy, the other two as healthy.
    _telemetryService.Verify(x => x.TrackHealthyInstanceCount(_region, 2), Times.Once);
    _telemetryService.Verify(x => x.TrackUnhealthyInstanceCount(_region, 1), Times.Once);
    _telemetryService.Verify(x => x.TrackUnknownInstanceCount(_region, 0), Times.Once);
    _telemetryService.Verify(x => x.TrackInstanceCount(_region, 3), Times.Once);
}
266+
198267
[Fact]
199268
public async Task TreatsLagBetweenThresholdsAsUnknown()
200269
{

0 commit comments

Comments
 (0)