From 73b176b74abbe4d81567f185d28615c12e8d50b6 Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Fri, 6 Mar 2026 06:31:55 -0600 Subject: [PATCH] Fix poison wait false positives and alert log parsing (#445) Cherry-pick of 564bab0 (accidentally merged to main via PR #447). The Lite poison wait query had no time filter, so stale data from days/weeks ago with high avg waits kept triggering alerts indefinitely. Added a 10-minute window matching Dashboard's existing filter. Also fixed alert history logging: non-numeric display strings (poison wait, LRQ, TempDB, job alerts) failed double.TryParse and logged as 0/0. Added optional numeric parameters to TrySendAlertEmailAsync so call sites can pass actual values for the DuckDB alert log while keeping display strings for emails. Co-Authored-By: Claude Opus 4.6 --- Lite/MainWindow.xaml.cs | 16 ++++++++++++---- Lite/Services/EmailAlertService.cs | 12 ++++++++---- Lite/Services/LocalDataService.WaitStats.cs | 1 + 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/Lite/MainWindow.xaml.cs b/Lite/MainWindow.xaml.cs index 7e16095a..f52087be 100644 --- a/Lite/MainWindow.xaml.cs +++ b/Lite/MainWindow.xaml.cs @@ -1099,7 +1099,9 @@ await _emailAlertService.TrySendAlertEmailAsync( allWaitNames, $"{App.AlertPoisonWaitThresholdMs}ms avg", summary.ServerId, - poisonContext); + poisonContext, + numericCurrentValue: worst.AvgMsPerWait, + numericThresholdValue: App.AlertPoisonWaitThresholdMs); } } else if (_activePoisonWaitAlert.TryGetValue(key, out var wasPoisonWait) && wasPoisonWait) @@ -1147,7 +1149,9 @@ await _emailAlertService.TrySendAlertEmailAsync( $"{longRunning.Count} query(s), longest {elapsedMinutes}m", $"{App.AlertLongRunningQueryThresholdMinutes}m", summary.ServerId, - lrqContext); + lrqContext, + numericCurrentValue: elapsedMinutes, + numericThresholdValue: App.AlertLongRunningQueryThresholdMinutes); } } else if (_activeLongRunningQueryAlert.TryGetValue(key, out var wasLongRunning) && wasLongRunning) @@ -1191,7 +1195,9 @@ await _emailAlertService.TrySendAlertEmailAsync( $"{tempDb.UsedPercent:F0}% used ({tempDb.TotalReservedMb:F0} MB)", $"{App.AlertTempDbSpaceThresholdPercent}%", summary.ServerId, - tempDbContext); + tempDbContext, + numericCurrentValue: tempDb.UsedPercent, + numericThresholdValue: App.AlertTempDbSpaceThresholdPercent); } } else if (_activeTempDbSpaceAlert.TryGetValue(key, out var wasTempDb) && wasTempDb) @@ -1240,7 +1246,9 @@ await _emailAlertService.TrySendAlertEmailAsync( $"{anomalousJobs.Count} job(s) exceeding {App.AlertLongRunningJobMultiplier}x average", $"{App.AlertLongRunningJobMultiplier}x historical avg", summary.ServerId, - jobContext); + jobContext, + numericCurrentValue: (double)worst.PercentOfAverage, + numericThresholdValue: App.AlertLongRunningJobMultiplier * 100); } } else if (_activeLongRunningJobAlert.TryGetValue(key, out var wasJob) && wasJob) diff --git a/Lite/Services/EmailAlertService.cs b/Lite/Services/EmailAlertService.cs index f837221e..743a93fb 100644 --- a/Lite/Services/EmailAlertService.cs +++ b/Lite/Services/EmailAlertService.cs @@ -48,7 +48,9 @@ public async Task TrySendAlertEmailAsync( string currentValue, string thresholdValue, int serverId = 0, - AlertContext? context = null) + AlertContext? context = null, + double? numericCurrentValue = null, + double? numericThresholdValue = null) { try { @@ -108,10 +110,12 @@ public async Task TrySendAlertEmailAsync( } /* Always log the alert to DuckDB, regardless of email status */ + var logCurrent = numericCurrentValue + ?? (double.TryParse(currentValue.TrimEnd('%'), out var cv) ? cv : 0); + var logThreshold = numericThresholdValue + ?? (double.TryParse(thresholdValue.TrimEnd('%'), out var tv) ? tv : 0); await LogAlertAsync(serverId, serverName, metricName, - double.TryParse(currentValue.TrimEnd('%'), out var cv) ? cv : 0, - double.TryParse(thresholdValue.TrimEnd('%'), out var tv) ? tv : 0, - sent, notificationType, sendError); + logCurrent, logThreshold, sent, notificationType, sendError); } catch (Exception ex) { diff --git a/Lite/Services/LocalDataService.WaitStats.cs b/Lite/Services/LocalDataService.WaitStats.cs index d00a2364..7175358e 100644 --- a/Lite/Services/LocalDataService.WaitStats.cs +++ b/Lite/Services/LocalDataService.WaitStats.cs @@ -170,6 +170,7 @@ FROM v_wait_stats WHERE server_id = $1 AND wait_type IN ('THREADPOOL', 'RESOURCE_SEMAPHORE', 'RESOURCE_SEMAPHORE_QUERY_COMPILE') AND delta_waiting_tasks > 0 +AND collection_time >= NOW() - INTERVAL '10 minutes' ORDER BY collection_time DESC LIMIT 3";