Skip to content

Commit e07550e

Browse files
Merge pull request #2478 from Particular/ravendb-health-reporter
2 parents a9628dc + abc73f3 commit e07550e

File tree

6 files changed

+204
-1
lines changed

6 files changed

+204
-1
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
ServiceControl Health: Audit Message Ingestion Process
2+
ServiceControl.Audit Health: Audit Database Index Lag
23
ServiceControl.Audit Health: Audit Message Ingestion
34
Storage space: ServiceControl.Audit database
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
namespace ServiceControl
2+
{
3+
using System;
4+
using System.Linq;
5+
using System.Text;
6+
using System.Threading.Tasks;
7+
using NServiceBus.CustomChecks;
8+
using NServiceBus.Logging;
9+
using Raven.Client;
10+
using Raven.Abstractions.Data;
11+
using ServiceControl.Audit.Infrastructure.Settings;
12+
13+
/// <summary>
/// Custom check registered as "Audit Database Index Lag" in the "ServiceControl.Audit Health" category.
/// It reads the RavenDB database statistics, logs index errors and per-index lag, and reports a failure
/// when at least one index is behind by more than <see cref="IndexLagThresholdWarning"/> documents.
/// </summary>
class CheckRavenDBIndexLag : CustomCheck
{
    /// <summary>
    /// Creates the check. A five minute <see cref="TimeSpan"/> is handed to the base class
    /// (presumably the interval at which the check is repeated - confirm against NServiceBus.CustomChecks).
    /// </summary>
    /// <param name="store">Document store whose database statistics are inspected.</param>
    /// <param name="settings">Logging settings; only <c>LogPath</c> is read (for the failure message). May be null.</param>
    public CheckRavenDBIndexLag(IDocumentStore store, LoggingSettings settings)
        : base("Audit Database Index Lag", "ServiceControl.Audit Health", TimeSpan.FromMinutes(5))
    {
        _store = store;
        _logPath = settings?.LogPath;
    }

    /// <summary>
    /// Fetches the database statistics, writes the diagnostic log entries and evaluates index lag.
    /// </summary>
    /// <returns>
    /// A failed result when one or more indexes exceed the warning or error lag threshold,
    /// otherwise <c>CheckResult.Pass</c>.
    /// </returns>
    public override Task<CheckResult> PerformCheck()
    {
        var statistics = _store.DatabaseCommands.GetStatistics();

        // Sorted by name so that log output is stable between runs.
        var indexes = statistics.Indexes.OrderBy(x => x.Name).ToArray();

        CreateDiagnosticsLogEntry(statistics, indexes);

        CheckAndReportIndexErrors(statistics);

        var indexCountWithTooMuchLag = CheckAndReportIndexesWithTooMuchIndexLag(indexes);

        if (indexCountWithTooMuchLag > 0)
        {
            return CheckResult.Failed($"At least one index significantly stale. Please run maintenance mode if this custom check persists to ensure index(es) can recover. See log file in `{_logPath}` for more details. Visit https://docs.particular.net/search?q=servicecontrol+troubleshooting for more information.");
        }

        return CheckResult.Pass;
    }

    /// <summary>
    /// Logs every index whose lag exceeds the warning or the error threshold.
    /// </summary>
    /// <param name="indexes">Index statistics to evaluate.</param>
    /// <returns>The number of indexes above either threshold.</returns>
    static int CheckAndReportIndexesWithTooMuchIndexLag(IndexStats[] indexes)
    {
        int indexCountWithTooMuchLag = 0;

        foreach (var indexStats in indexes)
        {
            var indexLag = GetAbsoluteIndexLag(indexStats);

            if (indexLag > IndexLagThresholdError)
            {
                indexCountWithTooMuchLag++;
                _log.Error($"Index [{indexStats.Name}] IndexingLag {indexLag:n0} is above error threshold ({IndexLagThresholdError:n0}). Launch in maintenance mode to let indexes catch up.");
            }
            else if (indexLag > IndexLagThresholdWarning)
            {
                // Warning-level lag is logged at a lower severity but still counts towards the failure.
                indexCountWithTooMuchLag++;
                _log.Warn($"Index [{indexStats.Name}] IndexingLag {indexLag:n0} is above warning threshold ({IndexLagThresholdWarning:n0}). Launch in maintenance mode to let indexes catch up.");
            }
        }

        return indexCountWithTooMuchLag;
    }

    /// <summary>
    /// Writes one error log entry for each indexing error contained in the statistics.
    /// </summary>
    static void CheckAndReportIndexErrors(DatabaseStatistics statistics)
    {
        foreach (var indexError in statistics.Errors)
        {
            _log.Error($"Index [{indexError.IndexName}] Error: {indexError.Error} (Action: {indexError.Action}, Doc: {indexError.Document}, At: {indexError.Timestamp})");
        }
    }

    /// <summary>
    /// Writes a single debug log entry with one line per index. Does nothing unless debug logging is enabled.
    /// </summary>
    static void CreateDiagnosticsLogEntry(DatabaseStatistics statistics, IndexStats[] indexes)
    {
        if (!_log.IsDebugEnabled)
        {
            return;
        }

        var report = new StringBuilder();
        report.AppendLine("Internal RavenDB index health report:");

        foreach (var indexStats in indexes)
        {
            var indexLag = GetAbsoluteIndexLag(indexStats);
            var isStale = statistics.StaleIndexes.Contains(indexStats.Name);

            // The column is labelled "Valid", so the IsInvalidIndex flag has to be negated.
            var isValid = !indexStats.IsInvalidIndex;

            report.AppendLine($"- Index [{indexStats.Name,-44}] Stale: {isStale,-5}, Lag: {indexLag,9:n0}, Valid: {isValid,-5}, LastIndexed: {indexStats.LastIndexedTimestamp:u}, LastIndexing: {indexStats.LastIndexingTime:u}");
        }

        _log.Debug(report.ToString());
    }

    /// <summary>
    /// Returns the magnitude of the index lag, treating a missing value as zero.
    /// IndexingLag is the number of documents that the index is behind, it is not a time unit.
    /// </summary>
    static int GetAbsoluteIndexLag(IndexStats indexStats)
    {
        var indexLag = indexStats.IndexingLag.GetValueOrDefault();

        // Math.Abs(int.MinValue) throws OverflowException; clamp so the health check itself cannot fail on it.
        return indexLag == int.MinValue ? int.MaxValue : Math.Abs(indexLag);
    }

    const int IndexLagThresholdWarning = 10000;
    const int IndexLagThresholdError = 100000;
    static readonly ILog _log = LogManager.GetLogger<CheckRavenDBIndexLag>();
    readonly IDocumentStore _store;
    readonly string _logPath;
}
100+
}

src/ServiceControl.Audit/Infrastructure/RavenDB/RavenBootstrapper.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
using NServiceBus.Logging;
1010
using Raven.Client.Embedded;
1111
using Raven.Client.Indexes;
12+
using ServiceControl.Infrastructure.RavenDB;
1213
using ServiceControl.SagaAudit;
1314
using Settings;
1415

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
Health: ServiceControl Primary Instance
22
Health: ServiceControl Remotes
33
ServiceControl Health: Audit Message Ingestion
4+
ServiceControl Health: Error Database Index Lag
45
ServiceControl Health: Error Message Ingestion
56
ServiceControl Health: Error Message Ingestion Process
67
Storage space: ServiceControl database

src/ServiceControl.sln

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ServiceControl.Config.Tests
9696
EndProject
9797
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ServiceControl.LicenseManagement", "ServiceControl.LicenseManagement\ServiceControl.LicenseManagement.csproj", "{FD1B9998-4B0C-4109-A3BC-0748F829F852}"
9898
EndProject
99-
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ServiceControl.Infrastructure.Metrics", "ServiceControl.Infrastructure.Metrics\ServiceControl.Infrastructure.Metrics.csproj", "{56422C80-6A26-46B4-AF5C-84AF08BAB1D1}"
99+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ServiceControl.Infrastructure.Metrics", "ServiceControl.Infrastructure.Metrics\ServiceControl.Infrastructure.Metrics.csproj", "{56422C80-6A26-46B4-AF5C-84AF08BAB1D1}"
100100
EndProject
101101
Global
102102
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
namespace ServiceControl
2+
{
3+
using System;
4+
using System.Linq;
5+
using System.Text;
6+
using System.Threading.Tasks;
7+
using NServiceBus.CustomChecks;
8+
using NServiceBus.Logging;
9+
using Raven.Client;
10+
using Raven.Abstractions.Data;
11+
using ServiceBus.Management.Infrastructure.Settings;
12+
13+
/// <summary>
/// Custom check registered as "Error Database Index Lag" in the "ServiceControl Health" category.
/// It reads the RavenDB database statistics, logs index errors and per-index lag, and reports a failure
/// when at least one index is behind by more than <see cref="IndexLagThresholdWarning"/> documents.
/// </summary>
class CheckRavenDBIndexLag : CustomCheck
{
    /// <summary>
    /// Creates the check. A five minute <see cref="TimeSpan"/> is handed to the base class
    /// (presumably the interval at which the check is repeated - confirm against NServiceBus.CustomChecks).
    /// </summary>
    /// <param name="store">Document store whose database statistics are inspected.</param>
    /// <param name="settings">Logging settings; only <c>LogPath</c> is read (for the failure message). May be null.</param>
    public CheckRavenDBIndexLag(IDocumentStore store, LoggingSettings settings)
        : base("Error Database Index Lag", "ServiceControl Health", TimeSpan.FromMinutes(5))
    {
        _store = store;
        _logPath = settings?.LogPath;
    }

    /// <summary>
    /// Fetches the database statistics, writes the diagnostic log entries and evaluates index lag.
    /// </summary>
    /// <returns>
    /// A failed result when one or more indexes exceed the warning or error lag threshold,
    /// otherwise <c>CheckResult.Pass</c>.
    /// </returns>
    public override Task<CheckResult> PerformCheck()
    {
        var statistics = _store.DatabaseCommands.GetStatistics();

        // Sorted by name so that log output is stable between runs.
        var indexes = statistics.Indexes.OrderBy(x => x.Name).ToArray();

        CreateDiagnosticsLogEntry(statistics, indexes);

        CheckAndReportIndexErrors(statistics);

        var indexCountWithTooMuchLag = CheckAndReportIndexesWithTooMuchIndexLag(indexes);

        if (indexCountWithTooMuchLag > 0)
        {
            return CheckResult.Failed($"At least one index significantly stale. Please run maintenance mode if this custom check persists to ensure index(es) can recover. See log file in `{_logPath}` for more details. Visit https://docs.particular.net/search?q=servicecontrol+troubleshooting for more information.");
        }

        return CheckResult.Pass;
    }

    /// <summary>
    /// Logs every index whose lag exceeds the warning or the error threshold.
    /// </summary>
    /// <param name="indexes">Index statistics to evaluate.</param>
    /// <returns>The number of indexes above either threshold.</returns>
    static int CheckAndReportIndexesWithTooMuchIndexLag(IndexStats[] indexes)
    {
        int indexCountWithTooMuchLag = 0;

        foreach (var indexStats in indexes)
        {
            var indexLag = GetAbsoluteIndexLag(indexStats);

            if (indexLag > IndexLagThresholdError)
            {
                indexCountWithTooMuchLag++;
                _log.Error($"Index [{indexStats.Name}] IndexingLag {indexLag:n0} is above error threshold ({IndexLagThresholdError:n0}). Launch in maintenance mode to let indexes catch up.");
            }
            else if (indexLag > IndexLagThresholdWarning)
            {
                // Warning-level lag is logged at a lower severity but still counts towards the failure.
                indexCountWithTooMuchLag++;
                _log.Warn($"Index [{indexStats.Name}] IndexingLag {indexLag:n0} is above warning threshold ({IndexLagThresholdWarning:n0}). Launch in maintenance mode to let indexes catch up.");
            }
        }

        return indexCountWithTooMuchLag;
    }

    /// <summary>
    /// Writes one error log entry for each indexing error contained in the statistics.
    /// </summary>
    static void CheckAndReportIndexErrors(DatabaseStatistics statistics)
    {
        foreach (var indexError in statistics.Errors)
        {
            _log.Error($"Index [{indexError.IndexName}] Error: {indexError.Error} (Action: {indexError.Action}, Doc: {indexError.Document}, At: {indexError.Timestamp})");
        }
    }

    /// <summary>
    /// Writes a single debug log entry with one line per index. Does nothing unless debug logging is enabled.
    /// </summary>
    static void CreateDiagnosticsLogEntry(DatabaseStatistics statistics, IndexStats[] indexes)
    {
        if (!_log.IsDebugEnabled)
        {
            return;
        }

        var report = new StringBuilder();
        report.AppendLine("Internal RavenDB index health report:");

        foreach (var indexStats in indexes)
        {
            var indexLag = GetAbsoluteIndexLag(indexStats);
            var isStale = statistics.StaleIndexes.Contains(indexStats.Name);

            // The column is labelled "Valid", so the IsInvalidIndex flag has to be negated.
            var isValid = !indexStats.IsInvalidIndex;

            report.AppendLine($"- Index [{indexStats.Name,-44}] Stale: {isStale,-5}, Lag: {indexLag,9:n0}, Valid: {isValid,-5}, LastIndexed: {indexStats.LastIndexedTimestamp:u}, LastIndexing: {indexStats.LastIndexingTime:u}");
        }

        _log.Debug(report.ToString());
    }

    /// <summary>
    /// Returns the magnitude of the index lag, treating a missing value as zero.
    /// IndexingLag is the number of documents that the index is behind, it is not a time unit.
    /// </summary>
    static int GetAbsoluteIndexLag(IndexStats indexStats)
    {
        var indexLag = indexStats.IndexingLag.GetValueOrDefault();

        // Math.Abs(int.MinValue) throws OverflowException; clamp so the health check itself cannot fail on it.
        return indexLag == int.MinValue ? int.MaxValue : Math.Abs(indexLag);
    }

    const int IndexLagThresholdWarning = 10000;
    const int IndexLagThresholdError = 100000;
    static readonly ILog _log = LogManager.GetLogger<CheckRavenDBIndexLag>();
    readonly IDocumentStore _store;
    readonly string _logPath;
}
100+
}

0 commit comments

Comments
 (0)