From 20696a55c2e21df1b1f5b31208d51eb4b74b5eef Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Fri, 22 May 2026 08:31:51 -0400 Subject: [PATCH] Persist the Lite alert email cooldown across restarts (#981) EmailAlertService deduplicated alert emails with an in-memory ConcurrentDictionary cooldown keyed {serverId}:{metricName}. It was never persisted, so restarting Lite cleared every cooldown and an alert email sent minutes before the restart could be sent again immediately. The first time each alert key is evaluated, the cooldown is now seeded from config_alert_log via GetLastEmailSentUtcAsync - the most recent alert_time for that server/metric where an email actually succeeded (notification_type IN ('email','email+webhook') AND send_error IS NULL, mirroring exactly when _cooldowns is updated after SendEmailAsync). The in-memory dictionary stays authoritative once seeded. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 + Lite/Services/EmailAlertService.cs | 64 ++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9b60fa2..abc74ba3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Lite UI no longer freezes during archival** ([#979]) — archival held DuckDB's exclusive write lock across the entire export-to-Parquet step, blocking every UI query (tab switches showed the spinning wheel, worse with more monitored servers). Export-to-Parquet only reads the database, so it now runs under a shared read lock concurrently with the UI; only the brief `DELETE` takes the exclusive write lock - **Lite FinOps no longer recommends an edition downgrade on an Availability Group secondary** ([#980]) — the licensing recommendations suggested "downgrade to Standard to save $X/mo" for any Enterprise instance, with no AG awareness. 
On a secondary replica that advice is misleading — every replica in an AG must run the same edition. FinOps now detects the AG replica role and, on a secondary, shows an informational note instead of the downgrade/savings estimate +- **Lite alert emails no longer re-fire after an app restart** ([#981]) — the per-metric email cooldown lived only in memory, so restarting Lite cleared it and an alert sent minutes earlier could be sent again immediately. The cooldown is now seeded from `config_alert_log` (the most recent successful send for that server/metric) the first time each alert is evaluated, so it survives restarts - **Data Retention job no longer fails with `xp_delete_file` error 22049** ([#972]) — the trace-file cleanup added in v2.11.0 passed a wildcard path to `xp_delete_file`, raising an uncatchable `Msg 22049` that failed the entire `PerformanceMonitor - Data Retention` Agent job on every run once any `Monitor_LongQueries_*.trc` files existed. `xp_delete_file` also cannot delete `.trc` files at all — it only accepts SQL Server backup files and Maintenance Plan report files — so that cleanup step has been removed from `config.data_retention` ### Changed @@ -24,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 [#972]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/972 [#979]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/979 [#980]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/980 +[#981]: https://github.com/erikdarlingdata/PerformanceMonitor/issues/981 ## [2.11.0] - 2026-05-19 diff --git a/Lite/Services/EmailAlertService.cs b/Lite/Services/EmailAlertService.cs index c895b9f8..cba44035 100644 --- a/Lite/Services/EmailAlertService.cs +++ b/Lite/Services/EmailAlertService.cs @@ -67,6 +67,20 @@ public async Task TrySendAlertEmailAsync( !string.IsNullOrWhiteSpace(App.SmtpRecipients)) { var cooldownKey = $"{serverId}:{metricName}"; + + /* Seed the in-memory cooldown from 
config_alert_log the first + time this key is seen, so an alert email sent shortly before + an app restart is not immediately re-sent afterward (#981). + The in-memory dictionary is authoritative once seeded. */ + if (!_cooldowns.ContainsKey(cooldownKey)) + { + var lastPersistedSend = await GetLastEmailSentUtcAsync(serverId, metricName); + if (lastPersistedSend.HasValue) + { + _cooldowns.TryAdd(cooldownKey, lastPersistedSend.Value); + } + } + var withinCooldown = _cooldowns.TryGetValue(cooldownKey, out var lastSent) && DateTime.UtcNow - lastSent < TimeSpan.FromMinutes(App.EmailCooldownMinutes); @@ -140,6 +154,56 @@ await LogAlertAsync(serverId, serverName, metricName, } } + /// <summary> + /// Returns the UTC time the most recent alert email was successfully sent + /// for this server/metric, read from config_alert_log — or null if none. + /// Used to seed the in-memory cooldown after an app restart (#981). + /// </summary> + private async Task<DateTime?> GetLastEmailSentUtcAsync(int serverId, string metricName) + { + try + { + /* Use injected initializer, fall back to creating one from App.DatabasePath */ + var duckDb = _duckDb; + if (duckDb == null) + { + var dbPath = App.DatabasePath; + if (string.IsNullOrEmpty(dbPath)) return null; + duckDb = new DuckDbInitializer(dbPath); + } + + using var readLock = duckDb.AcquireReadLock(); + using var connection = duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var command = connection.CreateCommand(); + /* A successful email send is logged with a notification_type of + 'email' / 'email+webhook' and a null send_error — that mirrors + exactly when _cooldowns is updated after SendEmailAsync. 
*/ + command.CommandText = @" +SELECT MAX(alert_time) +FROM config_alert_log +WHERE server_id = $1 +AND metric_name = $2 +AND notification_type IN ('email', 'email+webhook') +AND send_error IS NULL"; + command.Parameters.Add(new DuckDB.NET.Data.DuckDBParameter { Value = serverId }); + command.Parameters.Add(new DuckDB.NET.Data.DuckDBParameter { Value = metricName }); + + var result = await command.ExecuteScalarAsync(); + if (result == null || result == DBNull.Value) return null; + + /* alert_time is written as DateTime.UtcNow; tag it UTC so the kind + is explicit (the cooldown subtraction is tick math regardless). */ + return DateTime.SpecifyKind(Convert.ToDateTime(result), DateTimeKind.Utc); + } + catch (Exception ex) + { + AppLogger.Error("EmailAlert", $"Could not read persisted alert cooldown: {ex.Message}"); + return null; + } + } + /// <summary> /// Sends a test email to verify SMTP configuration. /// Returns null on success, or the error message on failure.