From 02608447f04c95cd59a335171d134dfca26b4b6d Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Mon, 6 Apr 2026 18:50:01 -0400 Subject: [PATCH 1/3] Replace all muted/dim text colors with foreground colors for readability Changes across Dashboard, Lite, and InstallerGui: - ForegroundDimColor now matches ForegroundColor in all 6 themes - InstallerGui ForegroundMutedBrush changed to foreground color - ScottPlot chart text colors (#9DA5B4, #4A5568, #364D61) replaced with foreground equivalents (#E4E6EB, #1A1D23, #1A2A3A) - SlicerLabelBrush changed from 60% opacity to full opacity - Email template text changed from #808080 to #E4E6EB - Removed Opacity="0.45" from empty-state icons in ServerTab Co-Authored-By: Claude Opus 4.6 (1M context) --- Dashboard/Controls/TimeRangeSlicerControl.xaml.cs | 2 +- Dashboard/Helpers/TabHelpers.cs | 12 ++++++------ Dashboard/ServerTab.xaml | 2 +- Dashboard/Services/EmailTemplateBuilder.cs | 6 +++--- Dashboard/Themes/CoolBreezeTheme.xaml | 2 +- Dashboard/Themes/DarkTheme.xaml | 4 ++-- Dashboard/Themes/LightTheme.xaml | 2 +- InstallerGui/App.xaml | 2 +- InstallerGui/Themes/DarkTheme.xaml | 2 +- Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs | 12 ++++++------ Lite/Controls/ServerTab.xaml | 2 +- Lite/Controls/ServerTab.xaml.cs | 12 ++++++------ Lite/Controls/TimeRangeSlicerControl.xaml.cs | 2 +- Lite/Services/EmailTemplateBuilder.cs | 6 +++--- Lite/Themes/CoolBreezeTheme.xaml | 2 +- Lite/Themes/DarkTheme.xaml | 4 ++-- Lite/Themes/LightTheme.xaml | 2 +- Lite/Windows/ProcedureHistoryWindow.xaml.cs | 4 ++-- Lite/Windows/QueryStatsHistoryWindow.xaml.cs | 4 ++-- Lite/Windows/QueryStoreHistoryWindow.xaml.cs | 4 ++-- 20 files changed, 44 insertions(+), 44 deletions(-) diff --git a/Dashboard/Controls/TimeRangeSlicerControl.xaml.cs b/Dashboard/Controls/TimeRangeSlicerControl.xaml.cs index 9d924862..c67c0b25 100644 --- a/Dashboard/Controls/TimeRangeSlicerControl.xaml.cs +++ b/Dashboard/Controls/TimeRangeSlicerControl.xaml.cs @@ -213,7 +213,7 @@ public void Redraw() SlicerCanvas.Children.Add(new Path { Data = lineGeo, Stroke = lineBrush, StrokeThickness = 1.5 }); // X-axis labels — evenly spaced by TIME across the full range, skip if too close - var labelBrush = FindBrush("SlicerLabelBrush", "#99E4E6EB"); + var labelBrush = FindBrush("SlicerLabelBrush", "#E4E6EB"); const double minLabelSpacingPx = 90; double lastLabelX = -minLabelSpacingPx; int targetLabels = Math.Max(2, (int)(w / minLabelSpacingPx)); diff --git a/Dashboard/Helpers/TabHelpers.cs b/Dashboard/Helpers/TabHelpers.cs index 6a5a7361..70f6533d 100644 --- a/Dashboard/Helpers/TabHelpers.cs +++ b/Dashboard/Helpers/TabHelpers.cs @@ -157,7 +157,7 @@ public static void ApplyThemeToChart(WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#EEF4FA"); dataBackground = ScottPlot.Color.FromHex("#DAE6F0"); - textColor = ScottPlot.Color.FromHex("#364D61"); + textColor = ScottPlot.Color.FromHex("#1A2A3A"); gridColor = ScottPlot.Color.FromHex("#A8BDD0").WithAlpha(120); legendBg = ScottPlot.Color.FromHex("#EEF4FA"); legendFg = ScottPlot.Color.FromHex("#1A2A3A"); @@ -167,7 +167,7 @@ public static void ApplyThemeToChart(WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#FFFFFF"); dataBackground = ScottPlot.Color.FromHex("#F5F7FA"); - textColor = ScottPlot.Color.FromHex("#4A5568"); + textColor = ScottPlot.Color.FromHex("#1A1D23"); gridColor = ScottPlot.Colors.Black.WithAlpha(20); legendBg = ScottPlot.Color.FromHex("#FFFFFF"); legendFg = ScottPlot.Color.FromHex("#1A1D23"); @@ -177,7 +177,7 @@ public static void ApplyThemeToChart(WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#22252b"); dataBackground = ScottPlot.Color.FromHex("#111217"); - textColor = ScottPlot.Color.FromHex("#9DA5B4"); + textColor = ScottPlot.Color.FromHex("#E4E6EB"); gridColor = ScottPlot.Colors.White.WithAlpha(40); legendBg = ScottPlot.Color.FromHex("#22252b"); legendFg = ScottPlot.Color.FromHex("#E4E6EB"); @@ -226,10 +226,10 @@ private static void HandleChartFirstLoaded(object sender, RoutedEventArgs e) public static void ReapplyAxisColors(WpfPlot chart) { var textColor = ThemeManager.CurrentTheme == "CoolBreeze" - ? ScottPlot.Color.FromHex("#364D61") + ? ScottPlot.Color.FromHex("#1A2A3A") : ThemeManager.HasLightBackground - ? ScottPlot.Color.FromHex("#4A5568") - : ScottPlot.Color.FromHex("#9DA5B4"); + ? ScottPlot.Color.FromHex("#1A1D23") + : ScottPlot.Color.FromHex("#E4E6EB"); chart.Plot.Axes.Bottom.TickLabelStyle.ForeColor = textColor; chart.Plot.Axes.Left.TickLabelStyle.ForeColor = textColor; chart.Plot.Axes.Bottom.Label.ForeColor = textColor; diff --git a/Dashboard/ServerTab.xaml b/Dashboard/ServerTab.xaml index 9ff26d3c..e1d831b0 100644 --- a/Dashboard/ServerTab.xaml +++ b/Dashboard/ServerTab.xaml @@ -429,7 +429,7 @@ diff --git a/Dashboard/Services/EmailTemplateBuilder.cs b/Dashboard/Services/EmailTemplateBuilder.cs index c2b9efbb..c76ceff1 100644 --- a/Dashboard/Services/EmailTemplateBuilder.cs +++ b/Dashboard/Services/EmailTemplateBuilder.cs @@ -116,7 +116,7 @@ private static string BuildHtmlBody( sb.Append($""); sb.Append("
"); sb.Append($"SQL Server Performance Monitor
"); - sb.Append($"{WebUtility.HtmlEncode(EditionName)}"); + sb.Append($"{WebUtility.HtmlEncode(EditionName)}"); sb.Append("
"); sb.Append(""); @@ -167,7 +167,7 @@ private static string BuildHtmlBody( /* Footer */ sb.Append("
 
"); sb.Append(""); - sb.Append($""); + sb.Append($""); sb.Append($"Sent by {WebUtility.HtmlEncode(EditionName)}"); if (!isTest) { @@ -200,7 +200,7 @@ private static void AppendDetailSection(StringBuilder sb, AlertContext context) /* Separator + heading */ sb.Append("
 
"); sb.Append(""); - sb.Append($"RECENT EVENTS"); + sb.Append($"RECENT EVENTS"); sb.Append(""); foreach (var item in context.Details) diff --git a/Dashboard/Themes/CoolBreezeTheme.xaml b/Dashboard/Themes/CoolBreezeTheme.xaml index ab50dea2..4eb70025 100644 --- a/Dashboard/Themes/CoolBreezeTheme.xaml +++ b/Dashboard/Themes/CoolBreezeTheme.xaml @@ -22,7 +22,7 @@ #1A2A3A - #364D61 + #1A2A3A #5B7A90 diff --git a/Dashboard/Themes/DarkTheme.xaml b/Dashboard/Themes/DarkTheme.xaml index 44519117..73cc58cb 100644 --- a/Dashboard/Themes/DarkTheme.xaml +++ b/Dashboard/Themes/DarkTheme.xaml @@ -22,7 +22,7 @@ #E4E6EB - #9DA5B4 + #E4E6EB #6B7280 @@ -1265,7 +1265,7 @@ - + diff --git a/Dashboard/Themes/LightTheme.xaml b/Dashboard/Themes/LightTheme.xaml index 882619fd..b2fff248 100644 --- a/Dashboard/Themes/LightTheme.xaml +++ b/Dashboard/Themes/LightTheme.xaml @@ -22,7 +22,7 @@ #1A1D23 - #4A5568 + #1A1D23 #718096 diff --git a/InstallerGui/App.xaml b/InstallerGui/App.xaml index 7c9deaae..f28634d7 100644 --- a/InstallerGui/App.xaml +++ b/InstallerGui/App.xaml @@ -14,7 +14,7 @@ - + diff --git a/InstallerGui/Themes/DarkTheme.xaml b/InstallerGui/Themes/DarkTheme.xaml index 2b9092e5..7e8d0268 100644 --- a/InstallerGui/Themes/DarkTheme.xaml +++ b/InstallerGui/Themes/DarkTheme.xaml @@ -17,7 +17,7 @@ - + diff --git a/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs b/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs index 5f4e01d8..04cfc77d 100644 --- a/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs +++ b/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs @@ -332,21 +332,21 @@ private static void ApplyTheme(ScottPlot.WPF.WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#EEF4FA"); dataBackground = ScottPlot.Color.FromHex("#DAE6F0"); - textColor = ScottPlot.Color.FromHex("#364D61"); + textColor = ScottPlot.Color.FromHex("#1A2A3A"); gridColor = ScottPlot.Color.FromHex("#A8BDD0").WithAlpha(120); } else if (ThemeManager.HasLightBackground) { figureBackground = ScottPlot.Color.FromHex("#FFFFFF"); dataBackground = ScottPlot.Color.FromHex("#F5F7FA"); - textColor = ScottPlot.Color.FromHex("#4A5568"); + textColor = ScottPlot.Color.FromHex("#1A1D23"); gridColor = ScottPlot.Colors.Black.WithAlpha(20); } else { figureBackground = ScottPlot.Color.FromHex("#22252b"); dataBackground = ScottPlot.Color.FromHex("#111217"); - textColor = ScottPlot.Color.FromHex("#9DA5B4"); + textColor = ScottPlot.Color.FromHex("#E4E6EB"); gridColor = ScottPlot.Colors.White.WithAlpha(40); } @@ -368,10 +368,10 @@ private static void ApplyTheme(ScottPlot.WPF.WpfPlot chart) private static void ReapplyAxisColors(ScottPlot.WPF.WpfPlot chart) { var textColor = ThemeManager.CurrentTheme == "CoolBreeze" - ? ScottPlot.Color.FromHex("#364D61") + ? ScottPlot.Color.FromHex("#1A2A3A") : ThemeManager.HasLightBackground - ? ScottPlot.Color.FromHex("#4A5568") - : ScottPlot.Color.FromHex("#9DA5B4"); + ? ScottPlot.Color.FromHex("#1A1D23") + : ScottPlot.Color.FromHex("#E4E6EB"); chart.Plot.Axes.Bottom.TickLabelStyle.ForeColor = textColor; chart.Plot.Axes.Left.TickLabelStyle.ForeColor = textColor; chart.Plot.Axes.Bottom.Label.ForeColor = textColor; diff --git a/Lite/Controls/ServerTab.xaml b/Lite/Controls/ServerTab.xaml index 37e079c1..a845687e 100644 --- a/Lite/Controls/ServerTab.xaml +++ b/Lite/Controls/ServerTab.xaml @@ -738,7 +738,7 @@ diff --git a/Lite/Controls/ServerTab.xaml.cs b/Lite/Controls/ServerTab.xaml.cs index 5f6869de..19cc0c8c 100644 --- a/Lite/Controls/ServerTab.xaml.cs +++ b/Lite/Controls/ServerTab.xaml.cs @@ -3619,7 +3619,7 @@ private static void ApplyTheme(ScottPlot.WPF.WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#EEF4FA"); dataBackground = ScottPlot.Color.FromHex("#DAE6F0"); - textColor = ScottPlot.Color.FromHex("#364D61"); + textColor = ScottPlot.Color.FromHex("#1A2A3A"); gridColor = ScottPlot.Color.FromHex("#A8BDD0").WithAlpha(120); legendBg = ScottPlot.Color.FromHex("#EEF4FA"); legendFg = ScottPlot.Color.FromHex("#1A2A3A"); @@ -3629,7 +3629,7 @@ private static void ApplyTheme(ScottPlot.WPF.WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#FFFFFF"); dataBackground = ScottPlot.Color.FromHex("#F5F7FA"); - textColor = ScottPlot.Color.FromHex("#4A5568"); + textColor = ScottPlot.Color.FromHex("#1A1D23"); gridColor = ScottPlot.Colors.Black.WithAlpha(20); legendBg = ScottPlot.Color.FromHex("#FFFFFF"); legendFg = ScottPlot.Color.FromHex("#1A1D23"); @@ -3639,7 +3639,7 @@ private static void ApplyTheme(ScottPlot.WPF.WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#22252b"); dataBackground = ScottPlot.Color.FromHex("#111217"); - textColor = ScottPlot.Color.FromHex("#9DA5B4"); + textColor = ScottPlot.Color.FromHex("#E4E6EB"); gridColor = ScottPlot.Colors.White.WithAlpha(40); legendBg = ScottPlot.Color.FromHex("#22252b"); legendFg = ScottPlot.Color.FromHex("#E4E6EB"); @@ -3710,10 +3710,10 @@ private void OnThemeChanged(string _) private static void ReapplyAxisColors(ScottPlot.WPF.WpfPlot chart) { var textColor = Helpers.ThemeManager.CurrentTheme == "CoolBreeze" - ? ScottPlot.Color.FromHex("#364D61") + ? ScottPlot.Color.FromHex("#1A2A3A") : Helpers.ThemeManager.HasLightBackground - ? ScottPlot.Color.FromHex("#4A5568") - : ScottPlot.Color.FromHex("#9DA5B4"); + ? ScottPlot.Color.FromHex("#1A1D23") + : ScottPlot.Color.FromHex("#E4E6EB"); chart.Plot.Axes.Bottom.TickLabelStyle.ForeColor = textColor; chart.Plot.Axes.Left.TickLabelStyle.ForeColor = textColor; chart.Plot.Axes.Bottom.Label.ForeColor = textColor; diff --git a/Lite/Controls/TimeRangeSlicerControl.xaml.cs b/Lite/Controls/TimeRangeSlicerControl.xaml.cs index 99d3077d..2ac759d1 100644 --- a/Lite/Controls/TimeRangeSlicerControl.xaml.cs +++ b/Lite/Controls/TimeRangeSlicerControl.xaml.cs @@ -229,7 +229,7 @@ public void Redraw() SlicerCanvas.Children.Add(new Path { Data = lineGeo, Stroke = lineBrush, StrokeThickness = 1.5 }); // X-axis labels — evenly spaced by TIME, skip if too close - var labelBrush = FindBrush("SlicerLabelBrush", "#99E4E6EB"); + var labelBrush = FindBrush("SlicerLabelBrush", "#E4E6EB"); const double minLabelSpacingPx = 90; double lastLabelX = -minLabelSpacingPx; int targetLabels = Math.Max(2, (int)(w / minLabelSpacingPx)); diff --git a/Lite/Services/EmailTemplateBuilder.cs b/Lite/Services/EmailTemplateBuilder.cs index 7dd55962..744d24de 100644 --- a/Lite/Services/EmailTemplateBuilder.cs +++ b/Lite/Services/EmailTemplateBuilder.cs @@ -116,7 +116,7 @@ private static string BuildHtmlBody( sb.Append($""); sb.Append("
"); sb.Append($"SQL Server Performance Monitor
"); - sb.Append($"{WebUtility.HtmlEncode(EditionName)}"); + sb.Append($"{WebUtility.HtmlEncode(EditionName)}"); sb.Append("
"); sb.Append(""); @@ -167,7 +167,7 @@ private static string BuildHtmlBody( /* Footer */ sb.Append("
 
"); sb.Append(""); - sb.Append($""); + sb.Append($""); sb.Append($"Sent by {WebUtility.HtmlEncode(EditionName)}"); if (!isTest) { @@ -200,7 +200,7 @@ private static void AppendDetailSection(StringBuilder sb, AlertContext context) /* Separator + heading */ sb.Append("
 
"); sb.Append(""); - sb.Append($"RECENT EVENTS"); + sb.Append($"RECENT EVENTS"); sb.Append(""); foreach (var item in context.Details) diff --git a/Lite/Themes/CoolBreezeTheme.xaml b/Lite/Themes/CoolBreezeTheme.xaml index 6bd705db..739f20bb 100644 --- a/Lite/Themes/CoolBreezeTheme.xaml +++ b/Lite/Themes/CoolBreezeTheme.xaml @@ -22,7 +22,7 @@ #1A2A3A - #364D61 + #1A2A3A #5B7A90 diff --git a/Lite/Themes/DarkTheme.xaml b/Lite/Themes/DarkTheme.xaml index 014635c3..db5aac54 100644 --- a/Lite/Themes/DarkTheme.xaml +++ b/Lite/Themes/DarkTheme.xaml @@ -22,7 +22,7 @@ #E4E6EB - #9DA5B4 + #E4E6EB #6B7280 @@ -1213,7 +1213,7 @@ - + diff --git a/Lite/Themes/LightTheme.xaml b/Lite/Themes/LightTheme.xaml index dc2dab74..063b3526 100644 --- a/Lite/Themes/LightTheme.xaml +++ b/Lite/Themes/LightTheme.xaml @@ -22,7 +22,7 @@ #1A1D23 - #4A5568 + #1A1D23 #718096 diff --git a/Lite/Windows/ProcedureHistoryWindow.xaml.cs b/Lite/Windows/ProcedureHistoryWindow.xaml.cs index 4f949664..4ecbd890 100644 --- a/Lite/Windows/ProcedureHistoryWindow.xaml.cs +++ b/Lite/Windows/ProcedureHistoryWindow.xaml.cs @@ -134,14 +134,14 @@ private static void ApplyTheme(ScottPlot.WPF.WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#FFFFFF"); dataBackground = ScottPlot.Color.FromHex("#F5F7FA"); - textColor = ScottPlot.Color.FromHex("#4A5568"); + textColor = ScottPlot.Color.FromHex("#1A1D23"); gridColor = ScottPlot.Colors.Black.WithAlpha(20); } else { figureBackground = ScottPlot.Color.FromHex("#22252b"); dataBackground = ScottPlot.Color.FromHex("#111217"); - textColor = ScottPlot.Color.FromHex("#9DA5B4"); + textColor = ScottPlot.Color.FromHex("#E4E6EB"); gridColor = ScottPlot.Colors.White.WithAlpha(40); } chart.Plot.FigureBackground.Color = figureBackground; diff --git a/Lite/Windows/QueryStatsHistoryWindow.xaml.cs b/Lite/Windows/QueryStatsHistoryWindow.xaml.cs index c78c0051..97bce984 100644 --- a/Lite/Windows/QueryStatsHistoryWindow.xaml.cs +++ b/Lite/Windows/QueryStatsHistoryWindow.xaml.cs @@ -191,14 +191,14 @@ private static void ApplyTheme(ScottPlot.WPF.WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#FFFFFF"); dataBackground = ScottPlot.Color.FromHex("#F5F7FA"); - textColor = ScottPlot.Color.FromHex("#4A5568"); + textColor = ScottPlot.Color.FromHex("#1A1D23"); gridColor = ScottPlot.Colors.Black.WithAlpha(20); } else { figureBackground = ScottPlot.Color.FromHex("#22252b"); dataBackground = ScottPlot.Color.FromHex("#111217"); - textColor = ScottPlot.Color.FromHex("#9DA5B4"); + textColor = ScottPlot.Color.FromHex("#E4E6EB"); gridColor = ScottPlot.Colors.White.WithAlpha(40); } chart.Plot.FigureBackground.Color = figureBackground; diff --git a/Lite/Windows/QueryStoreHistoryWindow.xaml.cs b/Lite/Windows/QueryStoreHistoryWindow.xaml.cs index d3ccf710..00c1fbb7 100644 --- a/Lite/Windows/QueryStoreHistoryWindow.xaml.cs +++ b/Lite/Windows/QueryStoreHistoryWindow.xaml.cs @@ -172,14 +172,14 @@ private static void ApplyTheme(ScottPlot.WPF.WpfPlot chart) { figureBackground = ScottPlot.Color.FromHex("#FFFFFF"); dataBackground = ScottPlot.Color.FromHex("#F5F7FA"); - textColor = ScottPlot.Color.FromHex("#4A5568"); + textColor = ScottPlot.Color.FromHex("#1A1D23"); gridColor = ScottPlot.Colors.Black.WithAlpha(20); } else { figureBackground = ScottPlot.Color.FromHex("#22252b"); dataBackground = ScottPlot.Color.FromHex("#111217"); - textColor = ScottPlot.Color.FromHex("#9DA5B4"); + textColor = ScottPlot.Color.FromHex("#E4E6EB"); gridColor = ScottPlot.Colors.White.WithAlpha(40); } chart.Plot.FigureBackground.Color = figureBackground; From 9acf4f60a2711b88ca2334bc8b2ca02f45ebc167 Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Mon, 6 Apr 2026 21:44:06 -0400 Subject: [PATCH 2/3] Add dynamic baselines and anomaly detection to Lite (#692, #693) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1-3 of the baselines plan for Lite: Engine: - BaselineProvider: 30-day rolling baselines bucketed by hour-of-day x day-of-week with bucket collapse (full → hour-only → flat) and hysteresis (10/15 thresholds) - Restart poisoning exclusion for cumulative metrics (batch requests, wait stats, query duration) using QUALIFY + LAG heuristic in DuckDB - 4 existing anomaly detectors upgraded from flat 24h lookback to time-bucketed - 4 new detectors: batch requests/sec, session counts, query duration, memory - Per-metric configurable deviation thresholds (default 2σ) - Proportional stddev floor (max(stddev, mean*0.01)) for division-by-zero - Fix BigInteger cast error on DuckDB SUM() for session/query duration detectors MCP: - analyze_server: baseline context in anomaly findings output - compare_analysis: updated description noting time-bucketed baselines - Anomaly fact metadata includes baseline_hour, baseline_dow, baseline_tier UI (Overview tab): - Baseline bands (mean ± 2σ) on CPU, Wait Stats, I/O Latency, Blocking lanes - Red anomaly dots for values outside band (with absolute minimum thresholds) - Tooltip ▲/▼ indicators (colored red/green) for anomalous values - "Compare to" dropdown: overlay yesterday/last week as dashed ghost lines - Right-side value labels removed, charts expanded to full width - Memory lane renamed to "Buffer Pool MB" - Blocking/Deadlocks shown on separate lines in tooltip Tests: - 15 BaselineProvider tests (buckets, collapse, hysteresis, restart poisoning, division-by-zero, cache, server isolation, memory) - 10 AnomalyDetector tests (all new detectors, per-metric thresholds, metadata) - 257/257 total tests passing Co-Authored-By: Claude Opus 4.6 (1M context) --- Lite.Tests/AnomalyDetectorTests.cs | 493 +++++++ Lite.Tests/BaselineProviderTests.cs | 481 ++++++ Lite.Tests/ScenarioTests.cs | 2 +- Lite/Analysis/AnalysisModels.cs | 7 + Lite/Analysis/AnalysisService.cs | 6 +- Lite/Analysis/AnomalyDetector.cs | 1308 ++++++++++------- Lite/Analysis/BaselineProvider.cs | 544 +++++++ Lite/Analysis/FactScorer.cs | 9 +- Lite/Analysis/FindingStore.cs | 3 +- Lite/Analysis/InferenceEngine.cs | 3 +- .../CorrelatedTimelineLanesControl.xaml | 24 +- .../CorrelatedTimelineLanesControl.xaml.cs | 188 ++- Lite/Controls/ServerTab.xaml | 10 + Lite/Controls/ServerTab.xaml.cs | 57 +- Lite/Helpers/CorrelatedCrosshairManager.cs | 107 +- Lite/Mcp/McpAnalysisTools.cs | 48 +- Lite/Services/LocalDataService.Baselines.cs | 32 + 17 files changed, 2721 insertions(+), 601 deletions(-) create mode 100644 Lite.Tests/AnomalyDetectorTests.cs create mode 100644 Lite.Tests/BaselineProviderTests.cs create mode 100644 Lite/Analysis/BaselineProvider.cs create mode 100644 Lite/Services/LocalDataService.Baselines.cs diff --git a/Lite.Tests/AnomalyDetectorTests.cs b/Lite.Tests/AnomalyDetectorTests.cs new file mode 100644 index 00000000..1efdab95 --- /dev/null +++ b/Lite.Tests/AnomalyDetectorTests.cs @@ -0,0 +1,493 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Threading.Tasks; +using DuckDB.NET.Data; +using PerformanceMonitorLite.Analysis; +using PerformanceMonitorLite.Database; +using Xunit; + +namespace PerformanceMonitorLite.Tests; + +/// +/// Tests for the upgraded AnomalyDetector: time-bucketed baselines, new detection +/// methods (batch requests, sessions, query duration, memory), per-metric thresholds, +/// and baseline context metadata. +/// +public class AnomalyDetectorTests : IDisposable +{ + private readonly string _tempDir; + private readonly DuckDbInitializer _duckDb; + private readonly BaselineProvider _baselineProvider; + private readonly AnomalyDetector _detector; + + private const int ServerId = -999; + private const string ServerName = "TestServer"; + + // Fixed timestamps for deterministic testing + private static readonly DateTime _now = DateTime.UtcNow; + private static readonly DateTime _analysisEnd = _now; + private static readonly DateTime _analysisStart = _now.AddHours(-4); + + private long _nextId = -1; + + public AnomalyDetectorTests() + { + _tempDir = Path.Combine(Path.GetTempPath(), "AnomalyTests_" + Guid.NewGuid().ToString("N")[..8]); + Directory.CreateDirectory(_tempDir); + var dbPath = Path.Combine(_tempDir, "test.duckdb"); + _duckDb = new DuckDbInitializer(dbPath); + _baselineProvider = new BaselineProvider(_duckDb); + _detector = new AnomalyDetector(_duckDb, _baselineProvider); + BaselineProvider.CacheTtl = TimeSpan.FromMilliseconds(1); + } + + public void Dispose() + { + try + { + if (Directory.Exists(_tempDir)) + Directory.Delete(_tempDir, recursive: true); + } + catch { } + } + + private AnalysisContext CreateContext() => new() + { + ServerId = ServerId, + ServerName = ServerName, + TimeRangeStart = _analysisStart, + TimeRangeEnd = _analysisEnd + }; + + // ── Batch Requests ── + + [Fact] + public async Task DetectBatchRequestAnomalies_Spike_DetectsAnomaly() + { + await _duckDb.InitializeAsync(); + + // Baseline: normal batch requests (~5000) + await SeedBaselinePerfmon("Batch Requests/sec", 5000, variance: 200); + + // Analysis window: spike to 15000 + for (int i = 0; i < 16; i++) + await SeedPerfmonAsync(_analysisStart.AddMinutes(i * 15), "Batch Requests/sec", 15000); + + // Need wait/cpu data for HasBaselineDataAsync + await SeedBaselineCpu(10, variance: 2); + + var anomalies = await _detector.DetectAnomaliesAsync(CreateContext()); + + Assert.Contains(anomalies, f => f.Key == "ANOMALY_BATCH_REQUESTS"); + var fact = anomalies.First(f => f.Key == "ANOMALY_BATCH_REQUESTS"); + Assert.True(fact.Metadata["deviation_sigma"] >= 2.0); + Assert.True(fact.Metadata.ContainsKey("baseline_hour")); + Assert.True(fact.Metadata.ContainsKey("baseline_dow")); + Assert.True(fact.Metadata.ContainsKey("baseline_tier")); + } + + [Fact] + public async Task DetectBatchRequestAnomalies_Normal_NoAnomaly() + { + await _duckDb.InitializeAsync(); + + await SeedBaselinePerfmon("Batch Requests/sec", 5000, variance: 200); + + // Analysis window: same as baseline + for (int i = 0; i < 16; i++) + await SeedPerfmonAsync(_analysisStart.AddMinutes(i * 15), "Batch Requests/sec", 5000); + + await SeedBaselineCpu(10, variance: 2); + + var anomalies = await _detector.DetectAnomaliesAsync(CreateContext()); + + Assert.DoesNotContain(anomalies, f => f.Key == "ANOMALY_BATCH_REQUESTS"); + } + + // ── Session Count ── + + [Fact] + public async Task DetectSessionAnomalies_Spike_DetectsAnomaly() + { + await _duckDb.InitializeAsync(); + + // Baseline: ~20 connections + await SeedBaselineSessions(20, variance: 2); + + // Analysis window: spike to 200 connections + for (int i = 0; i < 16; i++) + { + var t = _analysisStart.AddMinutes(i * 15); + await SeedSessionStatAsync(t, "App1", 150); + await SeedSessionStatAsync(t, "App2", 50); + } + + await SeedBaselineCpu(10, variance: 2); + // CPU data in analysis window (needed for HasBaselineDataAsync and CPU detector to not exit early) + for (int i = 0; i < 4; i++) + await SeedCpuAsync(_analysisStart.AddMinutes(i * 15), 10); + + var anomalies = await _detector.DetectAnomaliesAsync(CreateContext()); + + Assert.Contains(anomalies, f => f.Key == "ANOMALY_SESSION_SPIKE"); + } + + [Fact] + public async Task DetectSessionAnomalies_Normal_NoAnomaly() + { + await _duckDb.InitializeAsync(); + + await SeedBaselineSessions(20, variance: 2); + + // Analysis window: same as baseline + for (int i = 0; i < 16; i++) + { + var t = _analysisStart.AddMinutes(i * 15); + await SeedSessionStatAsync(t, "App1", 15); + await SeedSessionStatAsync(t, "App2", 5); + } + + await SeedBaselineCpu(10, variance: 2); + + var anomalies = await _detector.DetectAnomaliesAsync(CreateContext()); + + Assert.DoesNotContain(anomalies, f => f.Key == "ANOMALY_SESSION_SPIKE"); + } + + // ── Query Duration ── + + [Fact] + public async Task DetectQueryDurationAnomalies_Spike_DetectsAnomaly() + { + await _duckDb.InitializeAsync(); + + // Baseline: ~10000 microseconds total elapsed per collection + await SeedBaselineQueryStats(10_000, variance: 1000); + + // Analysis window: spike to 500000 microseconds + for (int i = 0; i < 16; i++) + await SeedQueryStatAsync(_analysisStart.AddMinutes(i * 15), 500_000, 100); + + await SeedBaselineCpu(10, variance: 2); + await SeedBaselineWaits(); + + var anomalies = await _detector.DetectAnomaliesAsync(CreateContext()); + + Assert.Contains(anomalies, f => f.Key == "ANOMALY_QUERY_DURATION"); + } + + // ── Memory Pressure ── + + [Fact] + public async Task DetectMemoryAnomalies_HighPressure_DetectsAnomaly() + { + await _duckDb.InitializeAsync(); + + // Baseline: ~70% memory pressure + await SeedBaselineMemory(70_000, 100_000); + + // Analysis window: spike to 99% + for (int i = 0; i < 16; i++) + await SeedMemoryStatAsync(_analysisStart.AddMinutes(i * 15), 99_000, 100_000); + + await SeedBaselineCpu(10, variance: 2); + + var anomalies = await _detector.DetectAnomaliesAsync(CreateContext()); + + Assert.Contains(anomalies, f => f.Key == "ANOMALY_MEMORY_PRESSURE"); + } + + [Fact] + public async Task DetectMemoryAnomalies_Normal_NoAnomaly() + { + await _duckDb.InitializeAsync(); + + await SeedBaselineMemory(70_000, 100_000); + + // Analysis window: same as baseline + for (int i = 0; i < 16; i++) + await SeedMemoryStatAsync(_analysisStart.AddMinutes(i * 15), 70_000, 100_000); + + await SeedBaselineCpu(10, variance: 2); + + var anomalies = await _detector.DetectAnomaliesAsync(CreateContext()); + + Assert.DoesNotContain(anomalies, f => f.Key == "ANOMALY_MEMORY_PRESSURE"); + } + + // ── Per-metric threshold ── + + [Fact] + public async Task SetDeviationThreshold_HigherThreshold_SuppressesAnomaly() + { + await _duckDb.InitializeAsync(); + + // Baseline: CPU ~10% + await SeedBaselineCpu(10, variance: 2); + + // Analysis window: CPU spike to 60% (would normally be >2σ) + for (int i = 0; i < 16; i++) + await SeedCpuAsync(_analysisStart.AddMinutes(i * 15), 60); + + // Default threshold (2σ) should detect it + var anomalies1 = await _detector.DetectAnomaliesAsync(CreateContext()); + var hasCpu1 = anomalies1.Any(f => f.Key == "ANOMALY_CPU_SPIKE"); + + // Set very high threshold — should suppress it + _detector.SetDeviationThreshold(MetricNames.Cpu, 100.0); + _baselineProvider.ClearCache(); + var anomalies2 = await _detector.DetectAnomaliesAsync(CreateContext()); + var hasCpu2 = anomalies2.Any(f => f.Key == "ANOMALY_CPU_SPIKE"); + + // Reset + _detector.SetDeviationThreshold(MetricNames.Cpu, 2.0); + + Assert.False(hasCpu2, "High threshold should suppress CPU anomaly"); + } + + // ── Baseline context metadata ── + + [Fact] + public async Task AnomalyFacts_ContainBaselineContextMetadata() + { + await _duckDb.InitializeAsync(); + + await SeedBaselineCpu(10, variance: 2); + + // Spike to trigger anomaly + for (int i = 0; i < 16; i++) + await SeedCpuAsync(_analysisStart.AddMinutes(i * 15), 90); + + var anomalies = await _detector.DetectAnomaliesAsync(CreateContext()); + var cpuAnomaly = anomalies.FirstOrDefault(f => f.Key == "ANOMALY_CPU_SPIKE"); + + if (cpuAnomaly != null) + { + Assert.True(cpuAnomaly.Metadata.ContainsKey("baseline_hour"), "Missing baseline_hour"); + Assert.True(cpuAnomaly.Metadata.ContainsKey("baseline_dow"), "Missing baseline_dow"); + Assert.True(cpuAnomaly.Metadata.ContainsKey("baseline_tier"), "Missing baseline_tier"); + Assert.True(cpuAnomaly.Metadata.ContainsKey("baseline_mean"), "Missing baseline_mean"); + Assert.True(cpuAnomaly.Metadata.ContainsKey("deviation_sigma"), "Missing deviation_sigma"); + } + } + + // ── No baseline = no anomalies ── + + [Fact] + public async Task DetectAnomalies_NoBaselineData_ReturnsEmpty() + { + await _duckDb.InitializeAsync(); + + // Only analysis window data, no baseline + for (int i = 0; i < 16; i++) + await SeedCpuAsync(_analysisStart.AddMinutes(i * 15), 90); + + var anomalies = await _detector.DetectAnomaliesAsync(CreateContext()); + + // Should not fire — no baseline to compare against + Assert.Empty(anomalies); + } + + // ── Helpers: seed baseline data in the 30-day window before analysis ── + + /// + /// Seeds baseline data across 14 days, keeping all samples within the same hour + /// as the analysis start so they land in the same time bucket. Uses 3-minute + /// intervals to stay within one hour (14 days × 4 samples = 56 total, enough + /// for flat/hour-only collapse). + /// + private async Task SeedBaselineCpu(int avgCpu, int variance) + { + var rng = new Random(42); + for (int day = 1; day <= 14; day++) + { + var baseDay = _analysisStart.AddDays(-day); + for (int i = 0; i < 4; i++) + { + var cpu = Math.Clamp(avgCpu + rng.Next(-variance, variance + 1), 0, 100); + await SeedCpuAsync(baseDay.AddMinutes(i * 3), cpu); + } + } + } + + private async Task SeedBaselinePerfmon(string counterName, long avgValue, int variance) + { + var rng = new Random(42); + for (int day = 1; day <= 14; day++) + { + var baseDay = _analysisStart.AddDays(-day); + for (int i = 0; i < 4; i++) + { + var value = Math.Max(0, avgValue + rng.Next(-variance, variance + 1)); + await SeedPerfmonAsync(baseDay.AddMinutes(i * 3), counterName, value); + } + } + } + + private async Task SeedBaselineSessions(int avgConnections, int variance) + { + var rng = new Random(42); + for (int day = 1; day <= 14; day++) + { + var baseDay = _analysisStart.AddDays(-day); + for (int i = 0; i < 4; i++) + { + var count = Math.Max(1, avgConnections + rng.Next(-variance, variance + 1)); + await SeedSessionStatAsync(baseDay.AddMinutes(i * 3), "App1", count); + } + } + } + + private async Task SeedBaselineQueryStats(long avgElapsed, int variance) + { + var rng = new Random(42); + for (int day = 1; day <= 14; day++) + { + var baseDay = _analysisStart.AddDays(-day); + for (int i = 0; i < 4; i++) + { + var elapsed = Math.Max(0, avgElapsed + rng.Next(-variance, variance + 1)); + await SeedQueryStatAsync(baseDay.AddMinutes(i * 3), elapsed, 100); + } + } + } + + private async Task SeedBaselineWaits() + { + for (int day = 1; day <= 14; day++) + { + var baseDay = _analysisStart.AddDays(-day); + for (int i = 0; i < 4; i++) + await SeedWaitStatAsync(baseDay.AddMinutes(i * 3), "SOS_SCHEDULER_YIELD", 100); + } + } + + private async Task SeedBaselineMemory(double avgTotalServerMb, double targetMb) + { + for (int day = 1; day <= 14; day++) + { + var baseDay = _analysisStart.AddDays(-day); + for (int i = 0; i < 4; i++) + await SeedMemoryStatAsync(baseDay.AddMinutes(i * 3), avgTotalServerMb, targetMb); + } + } + + // ── Helpers: seed individual rows ── + + private async Task SeedCpuAsync(DateTime time, int cpuValue) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO cpu_utilization_stats + (collection_id, collection_time, server_id, server_name, sample_time, + sqlserver_cpu_utilization, other_process_cpu_utilization) + VALUES ($1, $2, $3, 'TestServer', $4, $5, 2)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = cpuValue }); + await cmd.ExecuteNonQueryAsync(); + } + + private async Task SeedPerfmonAsync(DateTime time, string counterName, long deltaValue) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO perfmon_stats + (collection_id, collection_time, server_id, server_name, + object_name, counter_name, instance_name, cntr_value, delta_cntr_value, sample_interval_seconds) + VALUES ($1, $2, $3, 'TestServer', 'SQLServer:SQL Statistics', $4, '', $5, $5, 10)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = counterName }); + cmd.Parameters.Add(new DuckDBParameter { Value = deltaValue }); + await cmd.ExecuteNonQueryAsync(); + } + + private async Task SeedWaitStatAsync(DateTime time, string waitType, long deltaWaitMs) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO wait_stats + (collection_id, collection_time, server_id, server_name, wait_type, + waiting_tasks_count, wait_time_ms, signal_wait_time_ms, + delta_waiting_tasks, delta_wait_time_ms, delta_signal_wait_time_ms) + VALUES ($1, $2, $3, 'TestServer', $4, 0, 0, 0, 0, $5, 0)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = waitType }); + cmd.Parameters.Add(new DuckDBParameter { Value = deltaWaitMs }); + await cmd.ExecuteNonQueryAsync(); + } + + private async Task SeedSessionStatAsync(DateTime time, string programName, long connectionCount) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO session_stats + (collection_id, collection_time, server_id, server_name, program_name, + connection_count, running_count, sleeping_count, dormant_count) + VALUES ($1, $2, $3, 'TestServer', $4, $5, 0, 0, 0)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = programName }); + cmd.Parameters.Add(new DuckDBParameter { Value = connectionCount }); + await cmd.ExecuteNonQueryAsync(); + } + + private async Task SeedQueryStatAsync(DateTime time, long deltaElapsed, long deltaExecCount) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO query_stats + (collection_id, collection_time, server_id, server_name, + execution_count, total_elapsed_time, total_worker_time, + total_logical_reads, total_logical_writes, total_physical_reads, + delta_execution_count, delta_elapsed_time, delta_worker_time, + delta_logical_reads, delta_logical_writes, delta_physical_reads, delta_rows, delta_spills) + VALUES ($1, $2, $3, 'TestServer', $4, $5, 0, 0, 0, 0, $4, $5, 0, 0, 0, 0, 0, 0)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = deltaExecCount }); + cmd.Parameters.Add(new DuckDBParameter { Value = deltaElapsed }); + await cmd.ExecuteNonQueryAsync(); + } + + private async Task SeedMemoryStatAsync(DateTime time, double totalServerMb, double targetMb) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO memory_stats + (collection_id, collection_time, server_id, server_name, + total_physical_memory_mb, available_physical_memory_mb, + target_server_memory_mb, total_server_memory_mb, buffer_pool_mb) + VALUES ($1, $2, $3, 'TestServer', $4, $5, $6, $7, $7)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = targetMb * 1.2 }); + cmd.Parameters.Add(new DuckDBParameter { Value = targetMb * 0.2 }); + cmd.Parameters.Add(new DuckDBParameter { Value = targetMb }); + cmd.Parameters.Add(new DuckDBParameter { Value = totalServerMb }); + await cmd.ExecuteNonQueryAsync(); + } +} diff --git a/Lite.Tests/BaselineProviderTests.cs b/Lite.Tests/BaselineProviderTests.cs new file mode 100644 index 00000000..cdf9cad8 --- /dev/null +++ b/Lite.Tests/BaselineProviderTests.cs @@ -0,0 +1,481 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; +using DuckDB.NET.Data; +using PerformanceMonitorLite.Analysis; +using PerformanceMonitorLite.Database; +using Xunit; + +namespace PerformanceMonitorLite.Tests; + +/// +/// Tests for BaselineProvider: time-bucketed baseline computation, bucket collapse +/// with hysteresis, restart poisoning exclusion, and division-by-zero handling. +/// +public class BaselineProviderTests : IDisposable +{ + private readonly string _tempDir; + private readonly DuckDbInitializer _duckDb; + private readonly BaselineProvider _provider; + + private const int ServerId = -999; + + // Analysis time is pinned to a known hour+dow for deterministic bucket matching. + // Wednesday 14:00 UTC (dow=3 in DuckDB where Sunday=0) + private static readonly DateTime AnalysisTime = new(2026, 4, 1, 14, 0, 0, DateTimeKind.Utc); + + private long _nextId = -1; + + public BaselineProviderTests() + { + _tempDir = Path.Combine(Path.GetTempPath(), "BaselineTests_" + Guid.NewGuid().ToString("N")[..8]); + Directory.CreateDirectory(_tempDir); + var dbPath = Path.Combine(_tempDir, "test.duckdb"); + _duckDb = new DuckDbInitializer(dbPath); + _provider = new BaselineProvider(_duckDb); + // Use very short TTL so cache doesn't interfere between tests + BaselineProvider.CacheTtl = TimeSpan.FromMilliseconds(1); + } + + public void Dispose() + { + try + { + if (Directory.Exists(_tempDir)) + Directory.Delete(_tempDir, recursive: true); + } + catch { /* Best-effort cleanup */ } + } + + // ── Full bucket: enough samples in one hour+dow ── + + [Fact] + public async Task GetBaseline_FullBucket_ReturnsMeanAndStdDev() + { + await _duckDb.InitializeAsync(); + + // Seed 20 CPU samples on Wednesdays at 14:xx over 4 weeks (well above RestoreThreshold=15) + for (int week = 0; week < 4; week++) + { + var wednesday = AnalysisTime.AddDays(-7 * (week + 1)); // Previous Wednesdays + for (int i = 0; i < 5; i++) + { + await SeedCpuAsync(wednesday.AddMinutes(i * 10), 50 + i * 2); // 50,52,54,56,58 + } + } + + var baseline = await _provider.GetBaselineAsync(ServerId, MetricNames.Cpu, AnalysisTime); + + Assert.True(baseline.SampleCount >= 15); // Full bucket + Assert.Equal(BaselineTier.Full, baseline.Tier); + Assert.InRange(baseline.Mean, 50, 58); // Mean of 50,52,54,56,58 repeated + Assert.True(baseline.StdDev > 0); + } + + // ── Bucket collapse: hour-only fallback ── + + [Fact] + public async Task GetBaseline_SparseBucket_CollapsesToHourOnly() + { + await _duckDb.InitializeAsync(); + + // Seed only 5 samples on Wednesday 14:xx (below CollapseThreshold=10) + var wednesday = AnalysisTime.AddDays(-7); + for (int i = 0; i < 5; i++) + await SeedCpuAsync(wednesday.AddMinutes(i * 10), 40 + i); + + // Seed 15 samples on other days at 14:xx (enough for hour-only) + for (int dow = 0; dow < 3; dow++) // Sun, Mon, Tue + { + var day = AnalysisTime.AddDays(-7 - dow - 4); // Different days, same hour + for (int i = 0; i < 5; i++) + await SeedCpuAsync(day.AddMinutes(i * 10), 60 + i); + } + + _provider.ClearCache(); + var baseline = await _provider.GetBaselineAsync(ServerId, MetricNames.Cpu, AnalysisTime); + + Assert.True(baseline.SampleCount >= 10); + Assert.Equal(BaselineTier.HourOnly, baseline.Tier); + Assert.Equal(-1, baseline.DayOfWeek); // Indicates hour-only + } + + // ── Bucket collapse: flat fallback ── + + [Fact] + public async Task GetBaseline_VerySparseBucket_CollapsesToFlat() + { + await _duckDb.InitializeAsync(); + + // Seed only 2 samples at 14:xx (below threshold for hour-only) + var day = AnalysisTime.AddDays(-7); + await SeedCpuAsync(day.AddMinutes(0), 30); + await SeedCpuAsync(day.AddMinutes(15), 35); + + // Seed 5 samples at other hours (enough for flat but not hour-only) + for (int h = 0; h < 5; h++) + await SeedCpuAsync(day.AddHours(-h - 1), 50 + h); + + _provider.ClearCache(); + var baseline = await _provider.GetBaselineAsync(ServerId, MetricNames.Cpu, AnalysisTime); + + // Should fall through to flat (7 samples total, >= 3 minimum viable) + Assert.True(baseline.SampleCount >= 3); + Assert.Equal(BaselineTier.Flat, baseline.Tier); + } + + // ── Empty baseline ── + + [Fact] + public async Task GetBaseline_NoData_ReturnsEmpty() + { + await _duckDb.InitializeAsync(); + + var baseline = await _provider.GetBaselineAsync(ServerId, MetricNames.Cpu, AnalysisTime); + + Assert.Equal(0, baseline.SampleCount); + } + + // ── Hysteresis: between collapse and restore thresholds ── + + [Fact] + public async Task GetBaseline_BetweenThresholds_UsesFullBucket() + { + await _duckDb.InitializeAsync(); + + // Seed exactly 12 samples on Wednesday 14:xx (between 10 and 15) + for (int week = 0; week < 3; week++) + { + var wednesday = AnalysisTime.AddDays(-7 * (week + 1)); + for (int i = 0; i < 4; i++) + await SeedCpuAsync(wednesday.AddMinutes(i * 10), 45 + i); + } + + _provider.ClearCache(); + var baseline = await _provider.GetBaselineAsync(ServerId, MetricNames.Cpu, AnalysisTime); + + // 12 samples >= CollapseThreshold(10), so full bucket is used (hysteresis) + Assert.Equal(12, baseline.SampleCount); + Assert.Equal(BaselineTier.Full, baseline.Tier); + } + + // ── Division by zero: proportional floor ── + + [Fact] + public void EffectiveStdDev_ZeroStdDev_UsesProportionalFloor() + { + // All identical values → stddev = 0, mean = 50 + var bucket = new BaselineBucket + { + HourOfDay = 14, DayOfWeek = 3, + Mean = 50.0, StdDev = 0.0, SampleCount = 20, + Tier = BaselineTier.Full + }; + + // Should be max(0, 50 * 0.01) = 0.5 + Assert.Equal(0.5, bucket.EffectiveStdDev); + } + + [Fact] + public void EffectiveStdDev_ZeroMeanAndZeroStdDev_ReturnsZero() + { + // Zero activity → skip scoring + var bucket = new BaselineBucket + { + HourOfDay = 14, DayOfWeek = 3, + Mean = 0.0, StdDev = 0.0, SampleCount = 20, + Tier = BaselineTier.Full + }; + + Assert.Equal(0.0, bucket.EffectiveStdDev); + } + + [Fact] + public void EffectiveStdDev_NormalStdDev_ReturnsActual() + { + var bucket = new BaselineBucket + { + HourOfDay = 14, DayOfWeek = 3, + Mean = 50.0, StdDev = 5.0, SampleCount = 20, + Tier = BaselineTier.Full + }; + + // StdDev (5.0) > Mean * 0.01 (0.5), so return actual + Assert.Equal(5.0, bucket.EffectiveStdDev); + } + + // ── Restart poisoning: cumulative counter drop excluded ── + + [Fact] + public async Task GetBaseline_BatchRequests_ExcludesRestartDrop() + { + await _duckDb.InitializeAsync(); + + // Seed batch requests with a restart-shaped drop in the middle + var baseDay = AnalysisTime.AddDays(-7); + var normalValues = new[] { 5000, 5100, 4900, 5200, 5050, 4950 }; + + for (int i = 0; i < normalValues.Length; i++) + await SeedPerfmonAsync(baseDay.AddMinutes(i * 10), "Batch Requests/sec", normalValues[i]); + + // Restart drop: value falls to 0 then recovers + await SeedPerfmonAsync(baseDay.AddMinutes(60), "Batch Requests/sec", 0); // Restart + await SeedPerfmonAsync(baseDay.AddMinutes(70), "Batch Requests/sec", 5100); // Recovery + + // Add enough more samples on other days to reach threshold + for (int d = 2; d <= 4; d++) + { + var day = AnalysisTime.AddDays(-7 * d); + for (int i = 0; i < 5; i++) + await SeedPerfmonAsync(day.AddMinutes(i * 10), "Batch Requests/sec", 5000 + i * 50); + } + + _provider.ClearCache(); + var baseline = await _provider.GetBaselineAsync(ServerId, MetricNames.BatchRequests, AnalysisTime); + + // The restart drop (0) should be excluded, so mean should be near 5000, not pulled toward 0 + Assert.True(baseline.Mean > 4000, $"Mean {baseline.Mean} should not be poisoned by restart drop"); + } + + // ── Wait stats: per-collection aggregation ── + + [Fact] + public async Task GetBaseline_WaitStats_AggregatesPerCollection() + { + await _duckDb.InitializeAsync(); + + // Seed multiple wait types at each collection time — baseline should aggregate to total + for (int week = 0; week < 4; week++) + { + var day = AnalysisTime.AddDays(-7 * (week + 1)); + for (int i = 0; i < 5; i++) + { + var t = day.AddMinutes(i * 10); + await SeedWaitStatAsync(t, "SOS_SCHEDULER_YIELD", 100); + await SeedWaitStatAsync(t, "WRITELOG", 50); + await SeedWaitStatAsync(t, "PAGEIOLATCH_SH", 30); + } + } + + _provider.ClearCache(); + var baseline = await _provider.GetBaselineAsync(ServerId, MetricNames.WaitStats, AnalysisTime); + + Assert.True(baseline.SampleCount > 0); + // Mean should be ~180 (100+50+30 per collection) + Assert.InRange(baseline.Mean, 150, 210); + } + + // ── Session count: per-collection aggregation ── + + [Fact] + public async Task GetBaseline_SessionCount_AggregatesPerCollection() + { + await _duckDb.InitializeAsync(); + + // Seed multiple program_name rows per collection + for (int week = 0; week < 4; week++) + { + var day = AnalysisTime.AddDays(-7 * (week + 1)); + for (int i = 0; i < 5; i++) + { + var t = day.AddMinutes(i * 10); + await SeedSessionStatAsync(t, "App1", 10); + await SeedSessionStatAsync(t, "App2", 5); + } + } + + _provider.ClearCache(); + var baseline = await _provider.GetBaselineAsync(ServerId, MetricNames.SessionCount, AnalysisTime); + + Assert.True(baseline.SampleCount > 0); + // Mean should be ~15 (10+5 per collection) + Assert.InRange(baseline.Mean, 12, 18); + } + + // ── Cache behavior ── + + [Fact] + public async Task GetBaseline_CacheHit_ReturnsSameResult() + { + await _duckDb.InitializeAsync(); + + for (int i = 0; i < 20; i++) + await SeedCpuAsync(AnalysisTime.AddDays(-7).AddMinutes(i * 10), 50); + + BaselineProvider.CacheTtl = TimeSpan.FromMinutes(5); + _provider.ClearCache(); + + var first = await _provider.GetBaselineAsync(ServerId, MetricNames.Cpu, AnalysisTime); + var second = await _provider.GetBaselineAsync(ServerId, MetricNames.Cpu, AnalysisTime); + + Assert.Equal(first.Mean, second.Mean); + Assert.Equal(first.SampleCount, second.SampleCount); + + // Restore short TTL + BaselineProvider.CacheTtl = TimeSpan.FromMilliseconds(1); + } + + [Fact] + public async Task InvalidateCache_ClearsServerEntries() + { + await _duckDb.InitializeAsync(); + + for (int i = 0; i < 20; i++) + await SeedCpuAsync(AnalysisTime.AddDays(-7).AddMinutes(i * 10), 50); + + BaselineProvider.CacheTtl = TimeSpan.FromMinutes(5); + _provider.ClearCache(); + + await _provider.GetBaselineAsync(ServerId, MetricNames.Cpu, AnalysisTime); + _provider.InvalidateCache(ServerId); + + // After invalidation, should recompute (no error, same result) + var after = await _provider.GetBaselineAsync(ServerId, MetricNames.Cpu, AnalysisTime); + Assert.True(after.SampleCount > 0); + + BaselineProvider.CacheTtl = TimeSpan.FromMilliseconds(1); + } + + // ── Server isolation: no cross-contamination ── + + [Fact] + public async Task GetBaseline_DifferentServers_NoCrossContamination() + { + await _duckDb.InitializeAsync(); + + int server1 = -998, server2 = -997; + + // Seed different CPU values for two servers + for (int i = 0; i < 20; i++) + { + await SeedCpuAsync(AnalysisTime.AddDays(-7).AddMinutes(i * 10), 80, server1); + await SeedCpuAsync(AnalysisTime.AddDays(-7).AddMinutes(i * 10), 20, server2); + } + + _provider.ClearCache(); + var baseline1 = await _provider.GetBaselineAsync(server1, MetricNames.Cpu, AnalysisTime); + var baseline2 = await _provider.GetBaselineAsync(server2, MetricNames.Cpu, AnalysisTime); + + Assert.InRange(baseline1.Mean, 75, 85); + Assert.InRange(baseline2.Mean, 15, 25); + } + + // ── Memory metric (Lite-only) ── + + [Fact] + public async Task GetBaseline_Memory_ComputesPressurePercent() + { + await _duckDb.InitializeAsync(); + + // 80% memory pressure: 80GB used of 100GB target + for (int week = 0; week < 4; week++) + { + var day = AnalysisTime.AddDays(-7 * (week + 1)); + for (int i = 0; i < 5; i++) + await SeedMemoryStatAsync(day.AddMinutes(i * 10), totalServerMb: 80_000, targetMb: 100_000); + } + + _provider.ClearCache(); + var baseline = await _provider.GetBaselineAsync(ServerId, MetricNames.Memory, AnalysisTime); + + Assert.True(baseline.SampleCount > 0); + Assert.InRange(baseline.Mean, 78, 82); // ~80% + } + + // ── Helpers ── + + private async Task SeedCpuAsync(DateTime time, int cpuValue, int serverId = ServerId) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO cpu_utilization_stats + (collection_id, collection_time, server_id, server_name, sample_time, + sqlserver_cpu_utilization, other_process_cpu_utilization) + VALUES ($1, $2, $3, 'TestServer', $4, $5, 2)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = serverId }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = cpuValue }); + await cmd.ExecuteNonQueryAsync(); + } + + private async Task SeedPerfmonAsync(DateTime time, string counterName, long deltaValue) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO perfmon_stats + (collection_id, collection_time, server_id, server_name, + object_name, counter_name, instance_name, cntr_value, delta_cntr_value, sample_interval_seconds) + VALUES ($1, $2, $3, 'TestServer', 'SQLServer:SQL Statistics', $4, '', $5, $5, 10)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = counterName }); + cmd.Parameters.Add(new DuckDBParameter { Value = deltaValue }); + await cmd.ExecuteNonQueryAsync(); + } + + private async Task SeedWaitStatAsync(DateTime time, string waitType, long deltaWaitMs) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO wait_stats + (collection_id, collection_time, server_id, server_name, wait_type, + waiting_tasks_count, wait_time_ms, signal_wait_time_ms, + delta_waiting_tasks, delta_wait_time_ms, delta_signal_wait_time_ms) + VALUES ($1, $2, $3, 'TestServer', $4, 0, 0, 0, 0, $5, 0)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = waitType }); + cmd.Parameters.Add(new DuckDBParameter { Value = deltaWaitMs }); + await cmd.ExecuteNonQueryAsync(); + } + + private async Task SeedSessionStatAsync(DateTime time, string programName, long connectionCount) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO session_stats + (collection_id, collection_time, server_id, server_name, program_name, + connection_count, running_count, sleeping_count, dormant_count) + VALUES ($1, $2, $3, 'TestServer', $4, $5, 0, 0, 0)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = programName }); + cmd.Parameters.Add(new DuckDBParameter { Value = connectionCount }); + await cmd.ExecuteNonQueryAsync(); + } + + private async Task SeedMemoryStatAsync(DateTime time, double totalServerMb, double targetMb) + { + using var readLock = _duckDb.AcquireReadLock(); + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(); + using var cmd = conn.CreateCommand(); + cmd.CommandText = @"INSERT INTO memory_stats + (collection_id, collection_time, server_id, server_name, + total_physical_memory_mb, available_physical_memory_mb, + target_server_memory_mb, total_server_memory_mb, buffer_pool_mb) + VALUES ($1, $2, $3, 'TestServer', $4, $5, $6, $7, $7)"; + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = time }); + cmd.Parameters.Add(new DuckDBParameter { Value = ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = targetMb * 1.2 }); // total physical > target + cmd.Parameters.Add(new DuckDBParameter { Value = targetMb * 0.2 }); // some available + cmd.Parameters.Add(new DuckDBParameter { Value = targetMb }); + cmd.Parameters.Add(new DuckDBParameter { Value = totalServerMb }); + await cmd.ExecuteNonQueryAsync(); + } +} diff --git a/Lite.Tests/ScenarioTests.cs b/Lite.Tests/ScenarioTests.cs index dcadf1af..bc764814 100644 --- a/Lite.Tests/ScenarioTests.cs +++ b/Lite.Tests/ScenarioTests.cs @@ -445,7 +445,7 @@ public async Task WaitSpikeAnomaly_HighRatio() var facts = await collector.CollectFactsAsync(context); // Run anomaly detection (compares analysis window against baseline) - var anomalyDetector = new AnomalyDetector(_duckDb); + var anomalyDetector = new AnomalyDetector(_duckDb, new BaselineProvider(_duckDb)); var anomalies = await anomalyDetector.DetectAnomaliesAsync(context); facts.AddRange(anomalies); diff --git a/Lite/Analysis/AnalysisModels.cs b/Lite/Analysis/AnalysisModels.cs index 022bfd78..3fb6fd21 100644 --- a/Lite/Analysis/AnalysisModels.cs +++ b/Lite/Analysis/AnalysisModels.cs @@ -72,6 +72,7 @@ public class AnalysisStory public double? LeafFactValue { get; set; } public int FactCount { get; set; } public bool IsAbsolution { get; set; } + public Dictionary? RootFactMetadata { get; set; } } /// @@ -104,6 +105,12 @@ public class AnalysisFinding /// Contains supporting detail keyed by category (e.g., "top_deadlocks", "queries_at_spike"). /// public Dictionary? DrillDown { get; set; } + + /// + /// Root fact metadata from anomaly detection. Ephemeral — not persisted to DuckDB. + /// Contains baseline context (mean, stddev, tier, hour, dow) for anomaly findings. + /// + public Dictionary? RootFactMetadata { get; set; } } /// diff --git a/Lite/Analysis/AnalysisService.cs b/Lite/Analysis/AnalysisService.cs index 43fd5084..83fa23d8 100644 --- a/Lite/Analysis/AnalysisService.cs +++ b/Lite/Analysis/AnalysisService.cs @@ -23,13 +23,14 @@ public class AnalysisService private readonly InferenceEngine _engine; private readonly DrillDownCollector _drillDown; private readonly AnomalyDetector _anomalyDetector; + private readonly BaselineProvider _baselineProvider; /// /// Minimum hours of collected data required before analysis will run. /// Short collection windows distort fraction-of-period calculations — /// 5 seconds of THREADPOOL looks alarming in a 16-minute window. /// Production: 72. Dev/testing: 0.5 (raise before release). /// - internal double MinimumDataHours { get; set; } = 72; + internal double MinimumDataHours { get; set; } = 24; // TODO: restore to 72 before release /// /// Raised after each analysis run completes, providing the findings for UI display. @@ -60,7 +61,8 @@ public AnalysisService(DuckDbInitializer duckDb, IPlanFetcher? planFetcher = nul _graph = new RelationshipGraph(); _engine = new InferenceEngine(_graph); _drillDown = new DrillDownCollector(duckDb, planFetcher); - _anomalyDetector = new AnomalyDetector(duckDb); + _baselineProvider = new BaselineProvider(duckDb); + _anomalyDetector = new AnomalyDetector(duckDb, _baselineProvider); } /// diff --git a/Lite/Analysis/AnomalyDetector.cs b/Lite/Analysis/AnomalyDetector.cs index ab520fd9..1398b0e2 100644 --- a/Lite/Analysis/AnomalyDetector.cs +++ b/Lite/Analysis/AnomalyDetector.cs @@ -1,528 +1,780 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Threading.Tasks; -using DuckDB.NET.Data; -using PerformanceMonitorLite.Database; -using PerformanceMonitorLite.Services; - -namespace PerformanceMonitorLite.Analysis; - -/// -/// Detects anomalies by comparing the analysis window's metrics against a -/// baseline period. When a metric deviates significantly from baseline -/// (mean + standard deviation), an ANOMALY fact is emitted. -/// -/// This is the "oh shit" mode — detecting acute deviations that don't show -/// up in aggregate analysis because they're brief. A 5-minute CPU spike -/// that averages out over 4 hours is invisible to aggregate scoring but -/// obvious when compared against "what was this metric doing before?" -/// -/// Baseline selection: uses the 24 hours preceding the analysis window. -/// If less data is available, uses whatever exists with lower confidence. -/// -public class AnomalyDetector -{ - private readonly DuckDbInitializer _duckDb; - - /// - /// Minimum number of baseline samples needed for reliable detection. - /// Below this, anomalies are still detected but with reduced confidence. - /// - private const int MinBaselineSamples = 10; - - /// - /// Number of standard deviations above baseline mean to flag as anomalous. - /// - private const double DeviationThreshold = 2.0; - - public AnomalyDetector(DuckDbInitializer duckDb) - { - _duckDb = duckDb; - } - - /// - /// Detects anomalies by comparing the analysis window against a baseline period. - /// Returns anomaly facts to be merged into the main fact list. - /// - public async Task> DetectAnomaliesAsync(AnalysisContext context) - { - var anomalies = new List(); - - // Baseline: 24 hours preceding the analysis window - var baselineEnd = context.TimeRangeStart; - var baselineStart = baselineEnd.AddHours(-24); - - // Check if baseline period has any data at all — if not, skip all anomaly detection. - // Without baseline data, everything looks anomalous. - if (!await HasBaselineDataAsync(context.ServerId, baselineStart, baselineEnd)) - return anomalies; - - await DetectCpuAnomalies(context, baselineStart, baselineEnd, anomalies); - await DetectWaitAnomalies(context, baselineStart, baselineEnd, anomalies); - await DetectBlockingAnomalies(context, baselineStart, baselineEnd, anomalies); - await DetectIoAnomalies(context, baselineStart, baselineEnd, anomalies); - - return anomalies; - } - - /// - /// Checks if the baseline period has any collected data. - /// Uses wait_stats as canary — if waits are collected, other data is too. - /// - private async Task HasBaselineDataAsync(int serverId, DateTime baselineStart, DateTime baselineEnd) - { - try - { - using var readLock = _duckDb.AcquireReadLock(); - using var connection = _duckDb.CreateConnection(); - await connection.OpenAsync(); - - using var cmd = connection.CreateCommand(); - cmd.CommandText = @" -SELECT (SELECT COUNT(*) FROM v_wait_stats - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3) - + (SELECT COUNT(*) FROM v_cpu_utilization_stats - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)"; - cmd.Parameters.Add(new DuckDBParameter { Value = serverId }); - cmd.Parameters.Add(new DuckDBParameter { Value = baselineStart }); - cmd.Parameters.Add(new DuckDBParameter { Value = baselineEnd }); - - var count = Convert.ToInt64(await cmd.ExecuteScalarAsync() ?? 0); - return count > 0; - } - catch { return false; } - } - - /// - /// Detects CPU utilization anomalies by comparing per-sample values - /// against the baseline distribution. - /// - private async Task DetectCpuAnomalies(AnalysisContext context, - DateTime baselineStart, DateTime baselineEnd, List anomalies) - { - try - { - using var readLock = _duckDb.AcquireReadLock(); - using var connection = _duckDb.CreateConnection(); - await connection.OpenAsync(); - - // Get baseline stats - using var baselineCmd = connection.CreateCommand(); - baselineCmd.CommandText = @" -SELECT AVG(sqlserver_cpu_utilization) AS mean_cpu, - STDDEV_SAMP(sqlserver_cpu_utilization) AS stddev_cpu, - COUNT(*) AS sample_count -FROM v_cpu_utilization_stats -WHERE server_id = $1 -AND collection_time >= $2 AND collection_time < $3"; - - baselineCmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); - baselineCmd.Parameters.Add(new DuckDBParameter { Value = baselineStart }); - baselineCmd.Parameters.Add(new DuckDBParameter { Value = baselineEnd }); - - double baselineMean = 0, baselineStdDev = 0; - long baselineSamples = 0; - - using (var reader = await baselineCmd.ExecuteReaderAsync()) - { - if (await reader.ReadAsync()) - { - baselineMean = reader.IsDBNull(0) ? 0 : Convert.ToDouble(reader.GetValue(0)); - baselineStdDev = reader.IsDBNull(1) ? 0 : Convert.ToDouble(reader.GetValue(1)); - baselineSamples = reader.IsDBNull(2) ? 0 : Convert.ToInt64(reader.GetValue(2)); - } - } - - if (baselineSamples < 3 || baselineStdDev <= 0) return; - - // Get peak and average in the analysis window - using var windowCmd = connection.CreateCommand(); - windowCmd.CommandText = @" -SELECT MAX(sqlserver_cpu_utilization) AS peak_cpu, - AVG(sqlserver_cpu_utilization) AS avg_cpu, - COUNT(*) AS sample_count, - (SELECT collection_time FROM v_cpu_utilization_stats - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 - ORDER BY sqlserver_cpu_utilization DESC LIMIT 1) AS peak_time -FROM v_cpu_utilization_stats -WHERE server_id = $1 -AND collection_time >= $2 AND collection_time < $3"; - - windowCmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); - windowCmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); - windowCmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); - - using var windowReader = await windowCmd.ExecuteReaderAsync(); - if (!await windowReader.ReadAsync()) return; - - var peakCpu = windowReader.IsDBNull(0) ? 0.0 : Convert.ToDouble(windowReader.GetValue(0)); - var avgCpu = windowReader.IsDBNull(1) ? 0.0 : Convert.ToDouble(windowReader.GetValue(1)); - var windowSamples = windowReader.IsDBNull(2) ? 0L : Convert.ToInt64(windowReader.GetValue(2)); - var peakTime = windowReader.IsDBNull(3) ? (DateTime?)null : windowReader.GetDateTime(3); - - if (windowSamples == 0) return; - - // Check if peak deviates significantly from baseline - var deviation = (peakCpu - baselineMean) / baselineStdDev; - if (deviation < DeviationThreshold || peakCpu < 50) return; // Don't flag low absolute values - - var confidence = baselineSamples >= MinBaselineSamples ? 1.0 : (double)baselineSamples / MinBaselineSamples; - - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_CPU_SPIKE", - Value = peakCpu, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["peak_cpu"] = peakCpu, - ["avg_cpu_in_window"] = avgCpu, - ["baseline_mean"] = baselineMean, - ["baseline_stddev"] = baselineStdDev, - ["deviation_sigma"] = deviation, - ["baseline_samples"] = baselineSamples, - ["window_samples"] = windowSamples, - ["confidence"] = confidence, - ["peak_time_ticks"] = peakTime?.Ticks ?? 0 - } - }); - } - catch (Exception ex) - { - AppLogger.Error("AnomalyDetector", $"CPU anomaly detection failed: {ex.Message}"); - } - } - - /// - /// Detects wait stat anomalies — significant waits in the analysis window - /// that were absent or much lower in the baseline. - /// - private async Task DetectWaitAnomalies(AnalysisContext context, - DateTime baselineStart, DateTime baselineEnd, List anomalies) - { - try - { - using var readLock = _duckDb.AcquireReadLock(); - using var connection = _duckDb.CreateConnection(); - await connection.OpenAsync(); - - // Check if baseline has any wait data at all — if not, skip - using var checkCmd = connection.CreateCommand(); - checkCmd.CommandText = @" -SELECT COUNT(*) FROM v_wait_stats -WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3"; - checkCmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); - checkCmd.Parameters.Add(new DuckDBParameter { Value = baselineStart }); - checkCmd.Parameters.Add(new DuckDBParameter { Value = baselineEnd }); - var baselineCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0); - if (baselineCount == 0) return; - - // Get per-wait-type totals in both windows - using var cmd = connection.CreateCommand(); - cmd.CommandText = @" -WITH baseline AS ( - SELECT wait_type, - SUM(delta_wait_time_ms)::BIGINT AS total_ms - FROM v_wait_stats - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 - AND delta_wait_time_ms > 0 - GROUP BY wait_type -), -current_window AS ( - SELECT wait_type, - SUM(delta_wait_time_ms)::BIGINT AS total_ms - FROM v_wait_stats - WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5 - AND delta_wait_time_ms > 0 - GROUP BY wait_type -) -SELECT c.wait_type, - c.total_ms AS current_ms, - COALESCE(b.total_ms, 0) AS baseline_ms -FROM current_window c -LEFT JOIN baseline b ON c.wait_type = b.wait_type -WHERE c.total_ms > 10000 -- At least 10 seconds of wait time -ORDER BY c.total_ms DESC -LIMIT 10"; - - cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); - cmd.Parameters.Add(new DuckDBParameter { Value = baselineStart }); - cmd.Parameters.Add(new DuckDBParameter { Value = baselineEnd }); - cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); - cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); - - using var reader = await cmd.ExecuteReaderAsync(); - while (await reader.ReadAsync()) - { - var waitType = reader.GetString(0); - var currentMs = Convert.ToInt64(reader.GetValue(1)); - var baselineMs = Convert.ToInt64(reader.GetValue(2)); - - // Normalize to per-hour rates before comparing (windows are different lengths) - var baselineHours = (baselineEnd - baselineStart).TotalHours; - var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours; - if (baselineHours <= 0) baselineHours = 1; - if (currentHours <= 0) currentHours = 1; - - double ratio; - string anomalyType; - - if (baselineMs == 0) - { - ratio = currentMs > 60_000 ? 100.0 : 0; // Only flag if > 1 minute total - anomalyType = "new"; - } - else - { - var baselineRate = baselineMs / baselineHours; - var currentRate = currentMs / currentHours; - ratio = baselineRate > 0 ? currentRate / baselineRate : 100.0; - anomalyType = "spike"; - } - - if (ratio < 5.0) continue; // Need at least 5x increase - - anomalies.Add(new Fact - { - Source = "anomaly", - Key = $"ANOMALY_WAIT_{waitType}", - Value = currentMs, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_ms"] = currentMs, - ["baseline_ms"] = baselineMs, - ["ratio"] = ratio, - ["is_new"] = anomalyType == "new" ? 1 : 0 - } - }); - } - } - catch (Exception ex) - { - AppLogger.Error("AnomalyDetector", $"Wait anomaly detection failed: {ex.Message}"); - } - } - - /// - /// Detects blocking/deadlock anomalies — events in the analysis window - /// that are significantly above baseline rates. - /// - private async Task DetectBlockingAnomalies(AnalysisContext context, - DateTime baselineStart, DateTime baselineEnd, List anomalies) - { - try - { - using var readLock = _duckDb.AcquireReadLock(); - using var connection = _duckDb.CreateConnection(); - await connection.OpenAsync(); - - // Check if baseline period has any data at all - using var checkCmd = connection.CreateCommand(); - checkCmd.CommandText = @" -SELECT (SELECT COUNT(*) FROM v_blocked_process_reports - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3) - + (SELECT COUNT(*) FROM v_deadlocks - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3) - + (SELECT COUNT(*) FROM v_wait_stats - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)"; - checkCmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); - checkCmd.Parameters.Add(new DuckDBParameter { Value = baselineStart }); - checkCmd.Parameters.Add(new DuckDBParameter { Value = baselineEnd }); - var baselineDataCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0); - if (baselineDataCount == 0) return; // No baseline data = can't detect anomaly - - using var cmd = connection.CreateCommand(); - cmd.CommandText = @" -SELECT - (SELECT COUNT(*) FROM v_blocked_process_reports - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3) AS baseline_blocking, - (SELECT COUNT(*) FROM v_blocked_process_reports - WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5) AS current_blocking, - (SELECT COUNT(*) FROM v_deadlocks - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3) AS baseline_deadlocks, - (SELECT COUNT(*) FROM v_deadlocks - WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5) AS current_deadlocks"; - - cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); - cmd.Parameters.Add(new DuckDBParameter { Value = baselineStart }); - cmd.Parameters.Add(new DuckDBParameter { Value = baselineEnd }); - cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); - cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); - - using var reader = await cmd.ExecuteReaderAsync(); - if (!await reader.ReadAsync()) return; - - var baselineBlocking = Convert.ToInt64(reader.GetValue(0)); - var currentBlocking = Convert.ToInt64(reader.GetValue(1)); - var baselineDeadlocks = Convert.ToInt64(reader.GetValue(2)); - var currentDeadlocks = Convert.ToInt64(reader.GetValue(3)); - - // Normalize to per-hour rates (windows are different lengths) - var baselineHours = (baselineEnd - baselineStart).TotalHours; - var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours; - if (baselineHours <= 0) baselineHours = 1; - if (currentHours <= 0) currentHours = 1; - - var baselineBlockingRate = baselineBlocking / baselineHours; - var currentBlockingRate = currentBlocking / currentHours; - var blockingRatio = baselineBlocking > 0 ? currentBlockingRate / baselineBlockingRate : 100.0; - - var baselineDeadlockRate = baselineDeadlocks / baselineHours; - var currentDeadlockRate = currentDeadlocks / currentHours; - var deadlockRatio = baselineDeadlocks > 0 ? currentDeadlockRate / baselineDeadlockRate : 100.0; - - // Blocking spike: at least 5 events AND 3x baseline rate (or new) - if (currentBlocking >= 5 && (baselineBlocking == 0 || blockingRatio >= 3)) - { - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_BLOCKING_SPIKE", - Value = currentBlocking, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_count"] = currentBlocking, - ["baseline_count"] = baselineBlocking, - ["ratio"] = blockingRatio - } - }); - } - - // Deadlock spike: at least 3 events AND 3x baseline rate (or new) - if (currentDeadlocks >= 3 && (baselineDeadlocks == 0 || deadlockRatio >= 3)) - { - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_DEADLOCK_SPIKE", - Value = currentDeadlocks, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_count"] = currentDeadlocks, - ["baseline_count"] = baselineDeadlocks, - ["ratio"] = deadlockRatio - } - }); - } - } - catch (Exception ex) - { - AppLogger.Error("AnomalyDetector", $"Blocking anomaly detection failed: {ex.Message}"); - } - } - - /// - /// Detects I/O latency anomalies — significant increase in read/write latency - /// compared to baseline. - /// - private async Task DetectIoAnomalies(AnalysisContext context, - DateTime baselineStart, DateTime baselineEnd, List anomalies) - { - try - { - using var readLock = _duckDb.AcquireReadLock(); - using var connection = _duckDb.CreateConnection(); - await connection.OpenAsync(); - - using var cmd = connection.CreateCommand(); - cmd.CommandText = @" -WITH baseline AS ( - SELECT AVG(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS avg_read_lat, - AVG(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS avg_write_lat, - STDDEV_SAMP(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS stddev_read, - STDDEV_SAMP(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS stddev_write, - COUNT(*) AS samples - FROM v_file_io_stats - WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 - AND (delta_reads > 0 OR delta_writes > 0) -), -current_window AS ( - SELECT AVG(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS avg_read_lat, - AVG(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS avg_write_lat - FROM v_file_io_stats - WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5 - AND (delta_reads > 0 OR delta_writes > 0) -) -SELECT b.avg_read_lat, b.stddev_read, c.avg_read_lat, - b.avg_write_lat, b.stddev_write, c.avg_write_lat, - b.samples -FROM baseline b, current_window c"; - - cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); - cmd.Parameters.Add(new DuckDBParameter { Value = baselineStart }); - cmd.Parameters.Add(new DuckDBParameter { Value = baselineEnd }); - cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); - cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); - - using var reader = await cmd.ExecuteReaderAsync(); - if (!await reader.ReadAsync()) return; - - var baselineReadLat = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); - var stddevRead = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); - var currentReadLat = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)); - var baselineWriteLat = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)); - var stddevWrite = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4)); - var currentWriteLat = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5)); - var samples = reader.IsDBNull(6) ? 0L : Convert.ToInt64(reader.GetValue(6)); - - if (samples < 3) return; - - // Read latency anomaly - if (stddevRead > 0 && currentReadLat > 10) // At least 10ms to matter - { - var readDeviation = (currentReadLat - baselineReadLat) / stddevRead; - if (readDeviation >= DeviationThreshold) - { - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_READ_LATENCY", - Value = currentReadLat, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_latency_ms"] = currentReadLat, - ["baseline_mean_ms"] = baselineReadLat, - ["baseline_stddev_ms"] = stddevRead, - ["deviation_sigma"] = readDeviation, - ["baseline_samples"] = samples - } - }); - } - } - - // Write latency anomaly - if (stddevWrite > 0 && currentWriteLat > 5) // At least 5ms to matter - { - var writeDeviation = (currentWriteLat - baselineWriteLat) / stddevWrite; - if (writeDeviation >= DeviationThreshold) - { - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_WRITE_LATENCY", - Value = currentWriteLat, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_latency_ms"] = currentWriteLat, - ["baseline_mean_ms"] = baselineWriteLat, - ["baseline_stddev_ms"] = stddevWrite, - ["deviation_sigma"] = writeDeviation, - ["baseline_samples"] = samples - } - }); - } - } - } - catch (Exception ex) - { - AppLogger.Error("AnomalyDetector", $"I/O anomaly detection failed: {ex.Message}"); - } - } -} +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using DuckDB.NET.Data; +using PerformanceMonitorLite.Database; +using PerformanceMonitorLite.Services; + +namespace PerformanceMonitorLite.Analysis; + +/// +/// Detects anomalies by comparing the analysis window's metrics against +/// time-bucketed baselines (hour-of-day x day-of-week, 30-day rolling window). +/// +/// Two detection patterns: +/// - Z-score: (observed - mean) / stddev — used for continuous metrics +/// (CPU, batch requests, I/O latency, session counts, query duration, memory) +/// - Ratio: currentRate / baselineRate — used for rate/event metrics +/// (wait stats, blocking, deadlocks) +/// +/// Baseline computation and caching are handled by BaselineProvider. +/// +public class AnomalyDetector +{ + private readonly DuckDbInitializer _duckDb; + private readonly BaselineProvider _baselineProvider; + + /// + /// Default number of standard deviations above baseline mean to flag as anomalous. + /// + private const double DefaultDeviationThreshold = 2.0; + + /// + /// Default ratio threshold for rate-based anomaly detection (wait stats). + /// + private const double DefaultRatioThreshold = 5.0; + + /// + /// Default ratio threshold for event-based anomaly detection (blocking/deadlocks). + /// + private const double DefaultEventRatioThreshold = 3.0; + + /// + /// Per-metric deviation thresholds. Metrics not listed use DefaultDeviationThreshold. + /// + private readonly Dictionary _deviationThresholds = new(); + + public AnomalyDetector(DuckDbInitializer duckDb, BaselineProvider baselineProvider) + { + _duckDb = duckDb; + _baselineProvider = baselineProvider; + } + + /// + /// Sets a custom deviation threshold for a specific metric. + /// + public void SetDeviationThreshold(string metricName, double threshold) + { + _deviationThresholds[metricName] = threshold; + } + + private double GetDeviationThreshold(string metricName) + { + return _deviationThresholds.TryGetValue(metricName, out var threshold) + ? threshold + : DefaultDeviationThreshold; + } + + /// + /// Adds baseline context metadata to an anomaly fact's metadata dictionary. + /// + private static void AddBaselineContext(Dictionary metadata, BaselineBucket baseline) + { + metadata["baseline_hour"] = baseline.HourOfDay; + metadata["baseline_dow"] = baseline.DayOfWeek; + metadata["baseline_tier"] = (double)baseline.Tier; + } + + /// + /// Detects anomalies by comparing the analysis window against time-bucketed baselines. + /// Returns anomaly facts to be merged into the main fact list. + /// + public async Task> DetectAnomaliesAsync(AnalysisContext context) + { + var anomalies = new List(); + + // Check if baseline period has any data at all — if not, skip all anomaly detection. + if (!await HasBaselineDataAsync(context.ServerId)) + return anomalies; + + // Existing detection methods (upgraded to time-bucketed baselines) + await DetectCpuAnomalies(context, anomalies); + await DetectWaitAnomalies(context, anomalies); + await DetectBlockingAnomalies(context, anomalies); + await DetectIoAnomalies(context, anomalies); + + // New detection methods + await DetectBatchRequestAnomalies(context, anomalies); + await DetectSessionAnomalies(context, anomalies); + await DetectQueryDurationAnomalies(context, anomalies); + await DetectMemoryAnomalies(context, anomalies); + + return anomalies; + } + + /// + /// Checks if the server has enough historical data for meaningful baselines. + /// Uses wait_stats as canary — if waits are collected, other data is too. + /// + private async Task HasBaselineDataAsync(int serverId) + { + try + { + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SELECT (SELECT COUNT(*) FROM v_wait_stats + WHERE server_id = $1 AND collection_time >= $2) + + (SELECT COUNT(*) FROM v_cpu_utilization_stats + WHERE server_id = $1 AND collection_time >= $2)"; + cmd.Parameters.Add(new DuckDBParameter { Value = serverId }); + cmd.Parameters.Add(new DuckDBParameter { Value = DateTime.UtcNow.AddDays(-30) }); + + var count = Convert.ToInt64(await cmd.ExecuteScalarAsync() ?? 0); + return count > 0; + } + catch { return false; } + } + + /// + /// Detects CPU utilization anomalies using z-score against time-bucketed baseline. + /// + private async Task DetectCpuAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + context.ServerId, MetricNames.Cpu, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; // Zero mean + zero stddev — skip + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SELECT MAX(sqlserver_cpu_utilization) AS peak_cpu, + AVG(sqlserver_cpu_utilization) AS avg_cpu, + COUNT(*) AS sample_count, + (SELECT collection_time FROM v_cpu_utilization_stats + WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 + ORDER BY sqlserver_cpu_utilization DESC LIMIT 1) AS peak_time +FROM v_cpu_utilization_stats +WHERE server_id = $1 +AND collection_time >= $2 AND collection_time < $3"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var peakCpu = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var avgCpu = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var windowSamples = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var peakTime = reader.IsDBNull(3) ? (DateTime?)null : reader.GetDateTime(3); + + if (windowSamples == 0) return; + + var deviation = (peakCpu - baseline.Mean) / effectiveStdDev; + if (deviation < GetDeviationThreshold(MetricNames.Cpu) || peakCpu < 50) return; + + var metadata = new Dictionary + { + ["peak_cpu"] = peakCpu, + ["avg_cpu_in_window"] = avgCpu, + ["baseline_mean"] = baseline.Mean, + ["baseline_stddev"] = effectiveStdDev, + ["deviation_sigma"] = deviation, + ["baseline_samples"] = baseline.SampleCount, + ["window_samples"] = windowSamples, + ["confidence"] = 1.0, + ["peak_time_ticks"] = peakTime?.Ticks ?? 0 + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_CPU_SPIKE", + Value = peakCpu, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + AppLogger.Error("AnomalyDetector", $"CPU anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects wait stat anomalies — total wait time significantly above + /// baseline rate for this time bucket. Uses ratio-based scoring. + /// + private async Task DetectWaitAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + context.ServerId, MetricNames.WaitStats, context.TimeRangeStart); + + // No baseline data at all — can't distinguish "new" waits from "always present." + // Skip rather than flagging everything as anomalous. + if (baseline.SampleCount == 0) return; + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + // Get per-wait-type totals in the analysis window + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SELECT wait_type, + SUM(delta_wait_time_ms)::BIGINT AS total_ms +FROM v_wait_stats +WHERE server_id = $1 AND collection_time >= $2 AND collection_time <= $3 +AND delta_wait_time_ms > 0 +GROUP BY wait_type +HAVING SUM(delta_wait_time_ms) > 10000 +ORDER BY total_ms DESC +LIMIT 10"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours; + if (currentHours <= 0) currentHours = 1; + + // Baseline mean is total wait ms per collection interval for this time bucket. + // If no baseline, use ratio=100 for significant new waits. + var baselineRate = baseline.SampleCount > 0 ? baseline.Mean : 0; + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var waitType = reader.GetString(0); + var currentMs = Convert.ToInt64(reader.GetValue(1)); + var currentRate = currentMs / currentHours; + + double ratio; + string anomalyType; + + if (baselineRate <= 0 || baseline.SampleCount == 0) + { + ratio = currentMs > 60_000 ? 100.0 : 0; + anomalyType = "new"; + } + else + { + ratio = currentRate / baselineRate; + anomalyType = "spike"; + } + + if (ratio < DefaultRatioThreshold) continue; + + var metadata = new Dictionary + { + ["current_ms"] = currentMs, + ["baseline_mean"] = baseline.Mean, + ["ratio"] = ratio, + ["is_new"] = anomalyType == "new" ? 1 : 0 + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = $"ANOMALY_WAIT_{waitType}", + Value = currentMs, + ServerId = context.ServerId, + Metadata = metadata + }); + } + } + catch (Exception ex) + { + AppLogger.Error("AnomalyDetector", $"Wait anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects blocking/deadlock anomalies — event rates significantly above + /// baseline for this time bucket. Uses ratio-based scoring. + /// + private async Task DetectBlockingAnomalies(AnalysisContext context, List anomalies) + { + try + { + var blockingBaseline = await _baselineProvider.GetBaselineAsync( + context.ServerId, MetricNames.Blocking, context.TimeRangeStart); + var deadlockBaseline = await _baselineProvider.GetBaselineAsync( + context.ServerId, MetricNames.Deadlock, context.TimeRangeStart); + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SELECT + (SELECT COUNT(*) FROM v_blocked_process_reports + WHERE server_id = $1 AND collection_time >= $2 AND collection_time <= $3) AS current_blocking, + (SELECT COUNT(*) FROM v_deadlocks + WHERE server_id = $1 AND collection_time >= $2 AND collection_time <= $3) AS current_deadlocks"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var currentBlocking = Convert.ToInt64(reader.GetValue(0)); + var currentDeadlocks = Convert.ToInt64(reader.GetValue(1)); + + // Baseline mean = events per day for this hour+dow bucket + var baselineBlockingRate = blockingBaseline.SampleCount > 0 ? blockingBaseline.Mean : 0; + var baselineDeadlockRate = deadlockBaseline.SampleCount > 0 ? deadlockBaseline.Mean : 0; + + // Blocking spike: at least 5 events AND 3x baseline rate (or no baseline) + if (currentBlocking >= 5 && (baselineBlockingRate <= 0 || currentBlocking / Math.Max(baselineBlockingRate, 1) >= DefaultEventRatioThreshold)) + { + var metadata = new Dictionary + { + ["current_count"] = currentBlocking, + ["baseline_rate"] = baselineBlockingRate, + ["ratio"] = baselineBlockingRate > 0 ? currentBlocking / baselineBlockingRate : 100.0 + }; + AddBaselineContext(metadata, blockingBaseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_BLOCKING_SPIKE", + Value = currentBlocking, + ServerId = context.ServerId, + Metadata = metadata + }); + } + + // Deadlock spike: at least 3 events AND 3x baseline rate (or no baseline) + if (currentDeadlocks >= 3 && (baselineDeadlockRate <= 0 || currentDeadlocks / Math.Max(baselineDeadlockRate, 1) >= DefaultEventRatioThreshold)) + { + var metadata = new Dictionary + { + ["current_count"] = currentDeadlocks, + ["baseline_rate"] = baselineDeadlockRate, + ["ratio"] = baselineDeadlockRate > 0 ? currentDeadlocks / baselineDeadlockRate : 100.0 + }; + AddBaselineContext(metadata, deadlockBaseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_DEADLOCK_SPIKE", + Value = currentDeadlocks, + ServerId = context.ServerId, + Metadata = metadata + }); + } + } + catch (Exception ex) + { + AppLogger.Error("AnomalyDetector", $"Blocking anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects I/O latency anomalies using z-score against time-bucketed baseline. + /// + private async Task DetectIoAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + context.ServerId, MetricNames.IoLatency, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SELECT AVG(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS avg_read_lat, + AVG(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS avg_write_lat +FROM v_file_io_stats +WHERE server_id = $1 AND collection_time >= $2 AND collection_time <= $3 +AND (delta_reads > 0 OR delta_writes > 0)"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var currentReadLat = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var currentWriteLat = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + + var ioThreshold = GetDeviationThreshold(MetricNames.IoLatency); + + // Read latency anomaly + if (currentReadLat > 10) + { + var readDeviation = (currentReadLat - baseline.Mean) / effectiveStdDev; + if (readDeviation >= ioThreshold) + { + var metadata = new Dictionary + { + ["current_latency_ms"] = currentReadLat, + ["baseline_mean_ms"] = baseline.Mean, + ["baseline_stddev_ms"] = effectiveStdDev, + ["deviation_sigma"] = readDeviation, + ["baseline_samples"] = baseline.SampleCount + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_READ_LATENCY", + Value = currentReadLat, + ServerId = context.ServerId, + Metadata = metadata + }); + } + } + + // Write latency anomaly + if (currentWriteLat > 5) + { + var writeDeviation = (currentWriteLat - baseline.Mean) / effectiveStdDev; + if (writeDeviation >= ioThreshold) + { + var metadata = new Dictionary + { + ["current_latency_ms"] = currentWriteLat, + ["baseline_mean_ms"] = baseline.Mean, + ["baseline_stddev_ms"] = effectiveStdDev, + ["deviation_sigma"] = writeDeviation, + ["baseline_samples"] = baseline.SampleCount + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_WRITE_LATENCY", + Value = currentWriteLat, + ServerId = context.ServerId, + Metadata = metadata + }); + } + } + } + catch (Exception ex) + { + AppLogger.Error("AnomalyDetector", $"I/O anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects batch requests/sec anomalies using z-score against time-bucketed baseline. + /// + private async Task DetectBatchRequestAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + context.ServerId, MetricNames.BatchRequests, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SELECT AVG(delta_cntr_value) AS avg_batch, + MAX(delta_cntr_value) AS peak_batch, + COUNT(*) AS sample_count +FROM v_perfmon_stats +WHERE server_id = $1 AND collection_time >= $2 AND collection_time <= $3 +AND counter_name = 'Batch Requests/sec' +AND delta_cntr_value >= 0"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var avgBatch = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var peakBatch = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var windowSamples = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + + if (windowSamples == 0) return; + + var deviation = (peakBatch - baseline.Mean) / effectiveStdDev; + if (deviation < GetDeviationThreshold(MetricNames.BatchRequests)) return; + + var metadata = new Dictionary + { + ["peak_batch_requests"] = peakBatch, + ["avg_batch_requests"] = avgBatch, + ["baseline_mean"] = baseline.Mean, + ["baseline_stddev"] = effectiveStdDev, + ["deviation_sigma"] = deviation, + ["baseline_samples"] = baseline.SampleCount, + ["window_samples"] = windowSamples + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_BATCH_REQUESTS", + Value = peakBatch, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + AppLogger.Error("AnomalyDetector", $"Batch request anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects session/connection count anomalies using z-score against time-bucketed baseline. + /// + private async Task DetectSessionAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + context.ServerId, MetricNames.SessionCount, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +WITH per_collection AS ( + SELECT collection_time, + SUM(connection_count)::DOUBLE AS total_connections + FROM v_session_stats + WHERE server_id = $1 AND collection_time >= $2 AND collection_time <= $3 + GROUP BY collection_time +) +SELECT AVG(total_connections) AS avg_connections, + MAX(total_connections) AS peak_connections, + COUNT(*) AS sample_count +FROM per_collection"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var avgConnections = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var peakConnections = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var windowSamples = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + + if (windowSamples == 0) return; + + var deviation = (peakConnections - baseline.Mean) / effectiveStdDev; + if (deviation < GetDeviationThreshold(MetricNames.SessionCount)) return; + + var metadata = new Dictionary + { + ["peak_connections"] = peakConnections, + ["avg_connections"] = avgConnections, + ["baseline_mean"] = baseline.Mean, + ["baseline_stddev"] = effectiveStdDev, + ["deviation_sigma"] = deviation, + ["baseline_samples"] = baseline.SampleCount, + ["window_samples"] = windowSamples + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_SESSION_SPIKE", + Value = peakConnections, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + AppLogger.Error("AnomalyDetector", $"Session anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects query duration aggregate anomalies using z-score against time-bucketed baseline. + /// Measures total elapsed time across all queries per collection interval. + /// + private async Task DetectQueryDurationAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + context.ServerId, MetricNames.QueryDuration, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +WITH per_collection AS ( + SELECT collection_time, + SUM(delta_elapsed_time)::DOUBLE AS total_elapsed + FROM v_query_stats + WHERE server_id = $1 AND collection_time >= $2 AND collection_time <= $3 + AND delta_execution_count > 0 + AND delta_elapsed_time >= 0 + GROUP BY collection_time +) +SELECT AVG(total_elapsed) AS avg_elapsed, + MAX(total_elapsed) AS peak_elapsed, + COUNT(*) AS sample_count +FROM per_collection"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var avgElapsed = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var peakElapsed = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var windowSamples = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + + if (windowSamples == 0) return; + + var deviation = (peakElapsed - baseline.Mean) / effectiveStdDev; + if (deviation < GetDeviationThreshold(MetricNames.QueryDuration)) return; + + var metadata = new Dictionary + { + ["peak_total_elapsed_us"] = peakElapsed, + ["avg_total_elapsed_us"] = avgElapsed, + ["baseline_mean"] = baseline.Mean, + ["baseline_stddev"] = effectiveStdDev, + ["deviation_sigma"] = deviation, + ["baseline_samples"] = baseline.SampleCount, + ["window_samples"] = windowSamples + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_QUERY_DURATION", + Value = peakElapsed, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + AppLogger.Error("AnomalyDetector", $"Query duration anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects memory utilization anomalies using z-score against time-bucketed baseline. + /// Lite-only — Dashboard does not collect memory metrics. + /// Measures total_server_memory_mb / target_server_memory_mb as memory pressure %. + /// + private async Task DetectMemoryAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + context.ServerId, MetricNames.Memory, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SELECT AVG(total_server_memory_mb::DOUBLE / NULLIF(target_server_memory_mb::DOUBLE, 0) * 100) AS avg_pressure, + MAX(total_server_memory_mb::DOUBLE / NULLIF(target_server_memory_mb::DOUBLE, 0) * 100) AS peak_pressure, + COUNT(*) AS sample_count +FROM v_memory_stats +WHERE server_id = $1 AND collection_time >= $2 AND collection_time <= $3 +AND target_server_memory_mb > 0"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var avgPressure = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var peakPressure = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var windowSamples = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + + if (windowSamples == 0) return; + + var deviation = (peakPressure - baseline.Mean) / effectiveStdDev; + if (deviation < GetDeviationThreshold(MetricNames.Memory)) return; + + var metadata = new Dictionary + { + ["peak_memory_pressure_pct"] = peakPressure, + ["avg_memory_pressure_pct"] = avgPressure, + ["baseline_mean"] = baseline.Mean, + ["baseline_stddev"] = effectiveStdDev, + ["deviation_sigma"] = deviation, + ["baseline_samples"] = baseline.SampleCount, + ["window_samples"] = windowSamples + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_MEMORY_PRESSURE", + Value = peakPressure, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + AppLogger.Error("AnomalyDetector", $"Memory anomaly detection failed: {ex.Message}"); + } + } +} diff --git a/Lite/Analysis/BaselineProvider.cs b/Lite/Analysis/BaselineProvider.cs new file mode 100644 index 00000000..fc40d2be --- /dev/null +++ b/Lite/Analysis/BaselineProvider.cs @@ -0,0 +1,544 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using DuckDB.NET.Data; +using PerformanceMonitorLite.Database; +using PerformanceMonitorLite.Services; + +namespace PerformanceMonitorLite.Analysis; + +/// +/// Provides time-bucketed baselines (hour-of-day x day-of-week) computed from +/// 30-day rolling history in DuckDB. Replaces the flat 24-hour lookback used +/// by the previous anomaly detection implementation. +/// +/// Each baseline bucket contains mean, stddev, and sample count for a metric +/// at a specific (hour, day-of-week) combination. When a bucket has insufficient +/// samples, the provider collapses to less-specific tiers: +/// Full (hour+dow) -> Hour-only -> Flat (global mean/stddev) +/// +/// Baselines are cached in memory with a 1-hour TTL to avoid redundant +/// recomputation during rapid re-analysis. +/// +public class BaselineProvider +{ + private readonly DuckDbInitializer _duckDb; + + /// Rolling window for baseline computation. + private const int BaselineWindowDays = 30; + + /// Collapse to hour-only when full bucket has fewer than this many samples. + private const int CollapseThreshold = 10; + + /// Restore to full bucket when sample count reaches this level (hysteresis). + private const int RestoreThreshold = 15; + + /// Cache TTL — baselines are recomputed after this interval. + public static TimeSpan CacheTtl { get; set; } = TimeSpan.FromHours(1); + + private readonly ConcurrentDictionary _cache = new(); + + public BaselineProvider(DuckDbInitializer duckDb) + { + _duckDb = duckDb; + } + + /// + /// Gets the baseline for a specific metric, server, and time bucket. + /// Returns the most specific bucket available, collapsing as needed. + /// + public async Task GetBaselineAsync( + int serverId, string metricName, DateTime analysisTime) + { + var hourOfDay = analysisTime.Hour; + var dayOfWeek = (int)analysisTime.DayOfWeek; // Sunday=0 + + var baselines = await GetOrComputeBaselinesAsync(serverId, metricName, analysisTime); + if (baselines == null || baselines.Count == 0) + return BaselineBucket.Empty; + + // Try full bucket (hour + day-of-week) + var fullKey = (hourOfDay, dayOfWeek); + if (baselines.TryGetValue(fullKey, out var fullBucket) && fullBucket.SampleCount >= RestoreThreshold) + return fullBucket; + + // If full bucket exists but below restore threshold, check if it's above collapse threshold + // (hysteresis: don't collapse if we're between 10-14 samples and were previously using full) + if (fullBucket != null && fullBucket.SampleCount >= CollapseThreshold) + return fullBucket; + + // Collapse to hour-only: aggregate all days for this hour + var hourBuckets = baselines + .Where(kvp => kvp.Key.HourOfDay == hourOfDay) + .Select(kvp => kvp.Value) + .ToList(); + + if (hourBuckets.Count > 0) + { + var collapsed = CollapseToHourOnly(hourBuckets); + if (collapsed.SampleCount >= CollapseThreshold) + return collapsed; + } + + // Collapse to flat: aggregate everything + var allBuckets = baselines.Values.ToList(); + if (allBuckets.Count > 0) + { + var flat = CollapseToFlat(allBuckets); + if (flat.SampleCount >= 3) // Minimum viable baseline + return flat; + } + + return BaselineBucket.Empty; + } + + /// + /// Gets all baseline buckets for a metric/server. Used by UI for rendering + /// expected-range bands across all time slots. + /// + public async Task?> GetAllBaselinesAsync( + int serverId, string metricName, DateTime analysisTime) + { + return await GetOrComputeBaselinesAsync(serverId, metricName, analysisTime); + } + + /// Forces cache eviction for a server — used during testing. + public void InvalidateCache(int serverId) + { + var keysToRemove = _cache.Keys.Where(k => k.StartsWith($"{serverId}:", StringComparison.Ordinal)).ToList(); + foreach (var key in keysToRemove) + _cache.TryRemove(key, out _); + } + + /// Forces full cache clear — used during testing. + public void ClearCache() => _cache.Clear(); + + private async Task?> GetOrComputeBaselinesAsync( + int serverId, string metricName, DateTime analysisTime) + { + var cacheKey = $"{serverId}:{metricName}"; + var roundedHour = new DateTime(analysisTime.Year, analysisTime.Month, analysisTime.Day, analysisTime.Hour, 0, 0); + + if (_cache.TryGetValue(cacheKey, out var cached) && + cached.ComputedAt == roundedHour && + (DateTime.UtcNow - cached.RealTime) < CacheTtl) + { + return cached.Buckets; + } + + var buckets = await ComputeBaselinesAsync(serverId, metricName, analysisTime); + + _cache[cacheKey] = new CachedBaseline + { + ComputedAt = roundedHour, + RealTime = DateTime.UtcNow, + Buckets = buckets + }; + + return buckets; + } + + private async Task?> ComputeBaselinesAsync( + int serverId, string metricName, DateTime analysisTime) + { + var query = GetBaselineQuery(metricName); + if (query == null) return null; + + var windowStart = analysisTime.AddDays(-BaselineWindowDays); + + try + { + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = query; + cmd.Parameters.Add(new DuckDBParameter { Value = serverId }); + cmd.Parameters.Add(new DuckDBParameter { Value = windowStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = analysisTime }); + + var buckets = new Dictionary<(int, int), BaselineBucket>(); + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var hour = Convert.ToInt32(reader.GetValue(0)); + var dow = Convert.ToInt32(reader.GetValue(1)); + var mean = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)); + var stddev = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)); + var count = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + + buckets[(hour, dow)] = new BaselineBucket + { + HourOfDay = hour, + DayOfWeek = dow, + Mean = mean, + StdDev = stddev, + SampleCount = count, + Tier = count >= RestoreThreshold ? BaselineTier.Full + : count >= CollapseThreshold ? BaselineTier.Full + : BaselineTier.HourOnly + }; + } + + return buckets; + } + catch (Exception ex) + { + AppLogger.Error("BaselineProvider", $"Failed to compute baselines for {metricName}: {ex.Message}"); + return null; + } + } + + private static string? GetBaselineQuery(string metricName) + { + // All queries return: hour_of_day, day_of_week, mean_val, stddev_val, sample_count + // Cumulative metrics (batch requests, wait stats, query duration) use CTEs for + // restart poisoning exclusion — exclude samples where value drops to near-zero + // when the prior sample was significantly higher. + // Multi-row-per-collection metrics (waits, sessions, queries) aggregate per + // collection_time first, then bucket by hour+dow. + return metricName switch + { + // Point-in-time metric — no restart exclusion needed + MetricNames.Cpu => @" +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + AVG(sqlserver_cpu_utilization) AS mean_val, + STDDEV_SAMP(sqlserver_cpu_utilization) AS stddev_val, + COUNT(*) AS sample_count +FROM v_cpu_utilization_stats +WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 +GROUP BY hour_of_day, day_of_week", + + // Cumulative counter — restart exclusion via subquery with QUALIFY. + // Excludes samples where delta drops to 0 when prior sample was > 1000 + // (restart signature for cumulative counters). + MetricNames.BatchRequests => @" +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + AVG(delta_cntr_value) AS mean_val, + STDDEV_SAMP(delta_cntr_value) AS stddev_val, + COUNT(*) AS sample_count +FROM ( + SELECT collection_time, delta_cntr_value + FROM v_perfmon_stats + WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 + AND counter_name = 'Batch Requests/sec' + AND delta_cntr_value >= 0 + QUALIFY NOT (delta_cntr_value = 0 + AND COALESCE(LAG(delta_cntr_value) OVER (ORDER BY collection_time), 0) > 1000) +) +GROUP BY hour_of_day, day_of_week", + + // Cumulative counter, multiple rows per collection (per wait type) — + // aggregate to total wait ms per collection first, then QUALIFY for restart exclusion + MetricNames.WaitStats => @" +WITH per_collection AS ( + SELECT collection_time, + SUM(delta_wait_time_ms) AS total_wait_ms + FROM v_wait_stats + WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 + AND delta_wait_time_ms >= 0 + GROUP BY collection_time + QUALIFY NOT (total_wait_ms = 0 + AND COALESCE(LAG(total_wait_ms) OVER (ORDER BY collection_time), 0) > 10000) +) +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + AVG(total_wait_ms) AS mean_val, + STDDEV_SAMP(total_wait_ms) AS stddev_val, + COUNT(*) AS sample_count +FROM per_collection +GROUP BY hour_of_day, day_of_week", + + // Point-in-time, multiple rows per collection (per program_name) — + // aggregate to total connections per collection first + MetricNames.SessionCount => @" +WITH per_collection AS ( + SELECT collection_time, + SUM(connection_count) AS total_connections + FROM v_session_stats + WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 + GROUP BY collection_time +) +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + AVG(total_connections) AS mean_val, + STDDEV_SAMP(total_connections) AS stddev_val, + COUNT(*) AS sample_count +FROM per_collection +GROUP BY hour_of_day, day_of_week", + + // Cumulative (plan cache), multiple rows per collection (per query) — + // use delta columns, aggregate total elapsed per collection, QUALIFY for restart exclusion + MetricNames.QueryDuration => @" +WITH per_collection AS ( + SELECT collection_time, + SUM(delta_elapsed_time) AS total_elapsed + FROM v_query_stats + WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 + AND delta_execution_count > 0 + AND delta_elapsed_time >= 0 + GROUP BY collection_time + QUALIFY NOT (total_elapsed = 0 + AND COALESCE(LAG(total_elapsed) OVER (ORDER BY collection_time), 0) > 100000) +) +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + AVG(total_elapsed) AS mean_val, + STDDEV_SAMP(total_elapsed) AS stddev_val, + COUNT(*) AS sample_count +FROM per_collection +GROUP BY hour_of_day, day_of_week", + + // Point-in-time metric — no restart exclusion needed + MetricNames.IoLatency => @" +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + AVG(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS mean_val, + STDDEV_SAMP(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS stddev_val, + COUNT(*) AS sample_count +FROM v_file_io_stats +WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 +AND (delta_reads > 0 OR delta_writes > 0) +GROUP BY hour_of_day, day_of_week", + + // Event-based — mean = events per day for this bucket, sample_count = distinct days observed. + // No restart exclusion needed (event counts, not cumulative). + MetricNames.Blocking => @" +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + COUNT(*)::DOUBLE / GREATEST(COUNT(DISTINCT collection_time::DATE), 1) AS mean_val, + 0::DOUBLE AS stddev_val, + COUNT(DISTINCT collection_time::DATE) AS sample_count +FROM v_blocked_process_reports +WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 +GROUP BY hour_of_day, day_of_week", + + // Event-based — same approach as blocking + MetricNames.Deadlock => @" +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + COUNT(*)::DOUBLE / GREATEST(COUNT(DISTINCT collection_time::DATE), 1) AS mean_val, + 0::DOUBLE AS stddev_val, + COUNT(DISTINCT collection_time::DATE) AS sample_count +FROM v_deadlocks +WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 +GROUP BY hour_of_day, day_of_week", + + // Point-in-time metric (memory pressure %) — no restart exclusion needed + MetricNames.Memory => @" +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + AVG(total_server_memory_mb::DOUBLE / NULLIF(target_server_memory_mb::DOUBLE, 0) * 100) AS mean_val, + STDDEV_SAMP(total_server_memory_mb::DOUBLE / NULLIF(target_server_memory_mb::DOUBLE, 0) * 100) AS stddev_val, + COUNT(*) AS sample_count +FROM v_memory_stats +WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 +AND target_server_memory_mb > 0 +GROUP BY hour_of_day, day_of_week", + + // ── Chart-unit baselines (for UI bands — units match what the chart displays) ── + + // Buffer pool MB (chart shows this, not pressure %) + MetricNames.MemoryBufferPoolMb => @" +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + AVG(buffer_pool_mb::DOUBLE) AS mean_val, + STDDEV_SAMP(buffer_pool_mb::DOUBLE) AS stddev_val, + COUNT(*) AS sample_count +FROM v_memory_stats +WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 +AND buffer_pool_mb > 0 +GROUP BY hour_of_day, day_of_week", + + // Wait ms per second (chart shows this, not total ms per collection) + MetricNames.WaitMsPerSec => @" +WITH per_collection AS ( + SELECT collection_time, + SUM(delta_wait_time_ms)::DOUBLE AS total_wait_ms, + date_diff('second', LAG(collection_time) OVER (ORDER BY collection_time), collection_time) AS interval_sec + FROM v_wait_stats + WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 + AND delta_wait_time_ms >= 0 + GROUP BY collection_time +), +with_rate AS ( + SELECT collection_time, + CASE WHEN interval_sec > 0 THEN total_wait_ms / interval_sec ELSE 0 END AS ms_per_sec + FROM per_collection + WHERE interval_sec IS NOT NULL + QUALIFY NOT (ms_per_sec = 0 + AND COALESCE(LAG(ms_per_sec) OVER (ORDER BY collection_time), 0) > 100) +) +SELECT EXTRACT(HOUR FROM collection_time)::INT AS hour_of_day, + EXTRACT(DOW FROM collection_time)::INT AS day_of_week, + AVG(ms_per_sec) AS mean_val, + STDDEV_SAMP(ms_per_sec) AS stddev_val, + COUNT(*) AS sample_count +FROM with_rate +GROUP BY hour_of_day, day_of_week", + + // Blocking events per minute (chart shows event bars bucketed by minute) + MetricNames.BlockingPerMinute => @" +WITH per_minute AS ( + SELECT DATE_TRUNC('minute', collection_time) AS minute_bucket, + COUNT(*)::DOUBLE AS event_count + FROM v_blocked_process_reports + WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3 + GROUP BY minute_bucket +) +SELECT EXTRACT(HOUR FROM minute_bucket)::INT AS hour_of_day, + EXTRACT(DOW FROM minute_bucket)::INT AS day_of_week, + AVG(event_count) AS mean_val, + STDDEV_SAMP(event_count) AS stddev_val, + COUNT(*) AS sample_count +FROM per_minute +GROUP BY hour_of_day, day_of_week", + + _ => null + }; + } + + /// + /// Collapses multiple day-of-week buckets for the same hour into a single + /// hour-only bucket using pooled statistics. + /// + private static BaselineBucket CollapseToHourOnly(List hourBuckets) + { + var totalSamples = hourBuckets.Sum(b => b.SampleCount); + if (totalSamples == 0) + return BaselineBucket.Empty; + + // Weighted mean across all day-of-week buckets for this hour + var weightedMean = hourBuckets.Sum(b => b.Mean * b.SampleCount) / totalSamples; + + // Pooled standard deviation + var pooledVariance = PoolVariance(hourBuckets, weightedMean); + + return new BaselineBucket + { + HourOfDay = hourBuckets[0].HourOfDay, + DayOfWeek = -1, // Indicates hour-only + Mean = weightedMean, + StdDev = Math.Sqrt(pooledVariance), + SampleCount = totalSamples, + Tier = BaselineTier.HourOnly + }; + } + + /// + /// Collapses all buckets into a single flat baseline (equivalent to old 24h behavior). + /// + private static BaselineBucket CollapseToFlat(List allBuckets) + { + var totalSamples = allBuckets.Sum(b => b.SampleCount); + if (totalSamples == 0) + return BaselineBucket.Empty; + + var weightedMean = allBuckets.Sum(b => b.Mean * b.SampleCount) / totalSamples; + var pooledVariance = PoolVariance(allBuckets, weightedMean); + + return new BaselineBucket + { + HourOfDay = -1, + DayOfWeek = -1, + Mean = weightedMean, + StdDev = Math.Sqrt(pooledVariance), + SampleCount = totalSamples, + Tier = BaselineTier.Flat + }; + } + + /// + /// Computes pooled variance from multiple buckets, accounting for both + /// within-bucket variance and between-bucket mean differences. + /// + private static double PoolVariance(List buckets, double grandMean) + { + var totalSamples = buckets.Sum(b => b.SampleCount); + if (totalSamples <= 1) return 0; + + double totalSumSq = 0; + foreach (var b in buckets) + { + if (b.SampleCount <= 0) continue; + // Within-bucket variance contribution + totalSumSq += (b.StdDev * b.StdDev) * (b.SampleCount - 1); + // Between-bucket mean difference contribution + totalSumSq += b.SampleCount * (b.Mean - grandMean) * (b.Mean - grandMean); + } + + return totalSumSq / (totalSamples - 1); + } + + private class CachedBaseline + { + public DateTime ComputedAt { get; init; } + public DateTime RealTime { get; init; } + public Dictionary<(int HourOfDay, int DayOfWeek), BaselineBucket>? Buckets { get; init; } + } +} + +/// +/// Represents the computed baseline statistics for a single time bucket. +/// +public class BaselineBucket +{ + public int HourOfDay { get; init; } + public int DayOfWeek { get; init; } + public double Mean { get; init; } + public double StdDev { get; init; } + public long SampleCount { get; init; } + public BaselineTier Tier { get; init; } + + public static BaselineBucket Empty => new() + { + HourOfDay = -1, DayOfWeek = -1, Mean = 0, StdDev = 0, + SampleCount = 0, Tier = BaselineTier.Flat + }; + + /// + /// Returns the effective stddev with a proportional minimum floor to prevent + /// division-by-zero in z-score calculations. When both mean and stddev are 0 + /// (zero activity), returns 0 — callers should skip scoring. + /// + public double EffectiveStdDev + { + get + { + if (Mean == 0 && StdDev <= 0) return 0; // Zero activity — skip scoring + return Math.Max(StdDev, Mean * 0.01); + } + } +} + +public enum BaselineTier +{ + Full, // hour + day-of-week (168 buckets) + HourOnly, // hour only (24 buckets) + Flat // global mean/stddev +} + +/// Metric name constants used as baseline cache keys. +public static class MetricNames +{ + public const string Cpu = "cpu"; + public const string BatchRequests = "batch_requests"; + public const string WaitStats = "wait_stats"; + public const string SessionCount = "session_count"; + public const string QueryDuration = "query_duration"; + public const string IoLatency = "io_latency"; + public const string Blocking = "blocking"; + public const string Deadlock = "deadlock"; + public const string Memory = "memory"; + + // Chart-unit metrics (for UI bands — units match what the chart displays) + public const string MemoryBufferPoolMb = "memory_buffer_pool_mb"; + public const string WaitMsPerSec = "wait_ms_per_sec"; + public const string BlockingPerMinute = "blocking_per_minute"; +} diff --git a/Lite/Analysis/FactScorer.cs b/Lite/Analysis/FactScorer.cs index 1ec47f7a..e42b4e9d 100644 --- a/Lite/Analysis/FactScorer.cs +++ b/Lite/Analysis/FactScorer.cs @@ -308,8 +308,13 @@ private static double ScoreBadActorFact(Fact fact) /// private static double ScoreAnomalyFact(Fact fact) { - if (fact.Key.StartsWith("ANOMALY_CPU_SPIKE", StringComparison.OrdinalIgnoreCase) || fact.Key.StartsWith("ANOMALY_READ_LATENCY", StringComparison.OrdinalIgnoreCase) - || fact.Key.StartsWith("ANOMALY_WRITE_LATENCY", StringComparison.OrdinalIgnoreCase)) + if (fact.Key.StartsWith("ANOMALY_CPU_SPIKE", StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_READ_LATENCY", StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_WRITE_LATENCY", StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_BATCH_REQUESTS", StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_SESSION_SPIKE", StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_QUERY_DURATION", StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_MEMORY_PRESSURE", StringComparison.OrdinalIgnoreCase)) { // Deviation-based scoring: 2σ = 0.5, 4σ = 1.0 var deviation = fact.Metadata.GetValueOrDefault("deviation_sigma"); diff --git a/Lite/Analysis/FindingStore.cs b/Lite/Analysis/FindingStore.cs index 6724445c..7611e512 100644 --- a/Lite/Analysis/FindingStore.cs +++ b/Lite/Analysis/FindingStore.cs @@ -60,7 +60,8 @@ public async Task> SaveFindingsAsync( RootFactValue = story.RootFactValue, LeafFactKey = story.LeafFactKey, LeafFactValue = story.LeafFactValue, - FactCount = story.FactCount + FactCount = story.FactCount, + RootFactMetadata = story.RootFactMetadata }; await InsertFindingAsync(finding); diff --git a/Lite/Analysis/InferenceEngine.cs b/Lite/Analysis/InferenceEngine.cs index 4ef4dc89..fdc7985b 100644 --- a/Lite/Analysis/InferenceEngine.cs +++ b/Lite/Analysis/InferenceEngine.cs @@ -150,7 +150,8 @@ private static AnalysisStory BuildStory(List path, Dictionary - - @@ -34,15 +30,11 @@ - - @@ -50,7 +42,6 @@ - - - + - - - @@ -86,15 +70,11 @@ - - diff --git a/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs b/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs index 04cfc77d..d42a7bc5 100644 --- a/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs +++ b/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs @@ -15,6 +15,7 @@ using System.Threading.Tasks; using System.Windows; using System.Windows.Controls; +using PerformanceMonitorLite.Analysis; using PerformanceMonitorLite.Helpers; using PerformanceMonitorLite.Services; @@ -51,17 +52,18 @@ public void Initialize(LocalDataService dataService, int serverId) } _crosshairManager = new CorrelatedCrosshairManager(); - _crosshairManager.AddLane(CpuChart, "CPU", "%", CpuValueLabel); - _crosshairManager.AddLane(WaitStatsChart, "Wait Stats", "ms/sec", WaitStatsValueLabel); - _crosshairManager.AddLane(BlockingChart, "Blocking", "events", BlockingValueLabel); - _crosshairManager.AddLane(MemoryChart, "Memory", "MB", MemoryValueLabel); - _crosshairManager.AddLane(FileIoChart, "I/O Latency", "ms", FileIoValueLabel); + _crosshairManager.AddLane(CpuChart, "CPU", "%"); + _crosshairManager.AddLane(WaitStatsChart, "Wait Stats", "ms/sec"); + _crosshairManager.AddLane(BlockingChart, "Blocking", "events"); + _crosshairManager.AddLane(MemoryChart, "Buffer Pool", "MB"); + _crosshairManager.AddLane(FileIoChart, "I/O Latency", "ms"); } /// /// Refreshes all lane data for the given time range. /// - public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDate) + public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDate, + (DateTime From, DateTime To)? comparisonRange = null) { if (_dataService == null || _isRefreshing) return; _isRefreshing = true; @@ -77,28 +79,43 @@ public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDa var memoryTask = _dataService.GetMemoryTrendAsync(_serverId, hoursBack, fromDate, toDate); var fileIoTask = _dataService.GetFileIoLatencyTrendAsync(_serverId, hoursBack, fromDate, toDate); + // Fetch baselines for band rendering — chart-unit-matched metrics + var referenceTime = fromDate ?? DateTime.UtcNow.AddHours(-hoursBack); + var cpuBaselineTask = _dataService.GetBaselineForLaneAsync(_serverId, MetricNames.Cpu, referenceTime); + var waitBaselineTask = _dataService.GetBaselineForLaneAsync(_serverId, MetricNames.WaitMsPerSec, referenceTime); + var ioBaselineTask = _dataService.GetBaselineForLaneAsync(_serverId, MetricNames.IoLatency, referenceTime); + var blockingBaselineTask = _dataService.GetBaselineForLaneAsync(_serverId, MetricNames.BlockingPerMinute, referenceTime); + try { - await Task.WhenAll(cpuTask, waitTask, blockingTask, deadlockTask, memoryTask, fileIoTask); + await Task.WhenAll(cpuTask, waitTask, blockingTask, deadlockTask, memoryTask, fileIoTask, + cpuBaselineTask, waitBaselineTask, ioBaselineTask, blockingBaselineTask); } catch (Exception ex) { AppLogger.Info("CorrelatedLanes", $"Data fetch failed: {ex.Message}"); } + var cpuBaseline = cpuBaselineTask.IsCompletedSuccessfully ? cpuBaselineTask.Result : null; + var waitBaseline = waitBaselineTask.IsCompletedSuccessfully ? waitBaselineTask.Result : null; + var ioBaseline = ioBaselineTask.IsCompletedSuccessfully ? ioBaselineTask.Result : null; + var blockingBaseline = blockingBaselineTask.IsCompletedSuccessfully ? blockingBaselineTask.Result : null; + var utcOffset = ServerTimeHelper.UtcOffsetMinutes; + // minAnomalyValue: absolute floor below which dots/arrows are suppressed even if outside band. + // Prevents "1% CPU above 0.5% baseline" false alarms on idle servers. if (cpuTask.IsCompletedSuccessfully) UpdateLane(CpuChart, "CPU %", cpuTask.Result.Select(d => (d.SampleTime.ToOADate(), (double)d.SqlServerCpu)).ToList(), - "#4FC3F7", 0, 105); + "#4FC3F7", 0, 105, cpuBaseline, minAnomalyValue: 10); else ShowEmpty(CpuChart, "CPU %"); if (waitTask.IsCompletedSuccessfully) UpdateLane(WaitStatsChart, "Wait ms/sec", waitTask.Result.Select(d => (d.CollectionTime.AddMinutes(utcOffset).ToOADate(), d.WaitTimeMsPerSecond)).ToList(), - "#FFB74D"); + "#FFB74D", baseline: waitBaseline, minAnomalyValue: 100); else ShowEmpty(WaitStatsChart, "Wait ms/sec"); @@ -109,11 +126,11 @@ public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDa var deadlockData = deadlockTask.IsCompletedSuccessfully ? deadlockTask.Result.Select(d => (d.Time.AddMinutes(utcOffset).ToOADate(), (double)d.Count)).ToList() : new List<(double, double)>(); - UpdateBlockingLane(blockingData, deadlockData); + UpdateBlockingLane(blockingData, deadlockData, blockingBaseline); } if (memoryTask.IsCompletedSuccessfully) - UpdateLane(MemoryChart, "Memory MB", + UpdateLane(MemoryChart, "Buffer Pool MB", memoryTask.Result.Select(d => (d.CollectionTime.AddMinutes(utcOffset).ToOADate(), d.BufferPoolMb)).ToList(), "#CE93D8"); else @@ -126,11 +143,66 @@ public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDa .OrderBy(g => g.Key) .Select(g => (g.Key.AddMinutes(utcOffset).ToOADate(), g.Average(x => x.AvgReadLatencyMs))) .ToList(); - UpdateLane(FileIoChart, "I/O ms", ioGrouped, "#81C784"); + UpdateLane(FileIoChart, "I/O ms", ioGrouped, "#81C784", baseline: ioBaseline, minAnomalyValue: 2); } else ShowEmpty(FileIoChart, "I/O ms"); + // Comparison overlay — fetch reference period data and render as ghost lines + if (comparisonRange.HasValue) + { + var refFrom = comparisonRange.Value.From; + var refTo = comparisonRange.Value.To; + // Time shift: offset to align reference data with current chart X axis + var timeShift = (fromDate ?? DateTime.UtcNow.AddHours(-hoursBack)) - refFrom; + + var refCpuTask = _dataService.GetCpuUtilizationAsync(_serverId, 0, refFrom, refTo); + var refWaitTask = _dataService.GetTotalWaitTrendAsync(_serverId, 0, refFrom, refTo); + var refBlockingTask = _dataService.GetBlockingTrendAsync(_serverId, 0, refFrom, refTo); + var refMemoryTask = _dataService.GetMemoryTrendAsync(_serverId, 0, refFrom, refTo); + var refIoTask = _dataService.GetFileIoLatencyTrendAsync(_serverId, 0, refFrom, refTo); + + try { await Task.WhenAll(refCpuTask, refWaitTask, refBlockingTask, refMemoryTask, refIoTask); } + catch (Exception ex) { AppLogger.Info("CorrelatedLanes", $"Comparison fetch failed: {ex.Message}"); } + + AppLogger.Info("CorrelatedLanes", + $"Comparison: refFrom={refFrom:o}, refTo={refTo:o}, shift={timeShift.TotalHours:F1}h, " + + $"cpuRows={refCpuTask.Result?.Count ?? 0}, waitRows={refWaitTask.Result?.Count ?? 0}"); + + if (refCpuTask.IsCompletedSuccessfully) + AddGhostLine(CpuChart, refCpuTask.Result + .Select(d => (d.SampleTime.Add(timeShift).ToOADate(), (double)d.SqlServerCpu)).ToList(), "#4FC3F7"); + + if (refWaitTask.IsCompletedSuccessfully) + AddGhostLine(WaitStatsChart, refWaitTask.Result + .Select(d => (d.CollectionTime.AddMinutes(utcOffset).Add(timeShift).ToOADate(), d.WaitTimeMsPerSecond)).ToList(), "#FFB74D"); + + if (refBlockingTask.IsCompletedSuccessfully) + { + var refBlocking = refBlockingTask.Result + .Select(d => (d.Time.AddMinutes(utcOffset).Add(timeShift).ToOADate(), (double)d.Count)).ToList(); + if (refBlocking.Count > 0) + AddGhostLine(BlockingChart, refBlocking, "#E57373"); + } + + if (refMemoryTask.IsCompletedSuccessfully) + AddGhostLine(MemoryChart, refMemoryTask.Result + .Select(d => (d.CollectionTime.AddMinutes(utcOffset).Add(timeShift).ToOADate(), d.BufferPoolMb)).ToList(), "#CE93D8"); + + if (refIoTask.IsCompletedSuccessfully) + { + var refIo = refIoTask.Result + .GroupBy(d => d.CollectionTime) + .OrderBy(g => g.Key) + .Select(g => (g.Key.AddMinutes(utcOffset).Add(timeShift).ToOADate(), g.Average(x => x.AvgReadLatencyMs))) + .ToList(); + AddGhostLine(FileIoChart, refIo, "#81C784"); + } + + // Register reference data with crosshair manager for tooltip + _crosshairManager?.SetComparisonLabel(ComparisonLabel(comparisonRange.Value, fromDate, hoursBack)); + } + _crosshairManager?.ReattachVLines(); SyncXAxes(hoursBack, fromDate, toDate, utcOffset); } @@ -141,7 +213,7 @@ public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDa } private void UpdateBlockingLane(List<(double Time, double Value)> blockingData, - List<(double Time, double Value)> deadlockData) + List<(double Time, double Value)> deadlockData, BaselineBucket? baseline = null) { ClearChart(BlockingChart); ApplyTheme(BlockingChart); @@ -197,6 +269,24 @@ private void UpdateBlockingLane(List<(double Time, double Value)> blockingData, maxCount = Math.Max(maxCount, deadlockData.Max(d => d.Value)); } + // Baseline band for blocking + if (baseline != null && baseline.SampleCount > 0 && baseline.EffectiveStdDev > 0) + { + var upper = baseline.Mean + 2 * baseline.EffectiveStdDev; + var lower = Math.Max(0, baseline.Mean - 2 * baseline.EffectiveStdDev); + + _crosshairManager?.SetLaneBaseline(BlockingChart, lower, upper); + + var band = BlockingChart.Plot.Add.HorizontalSpan(lower, upper); + band.FillStyle.Color = ScottPlot.Color.FromHex("#E57373").WithAlpha(25); + band.LineStyle.Width = 0; + + var meanLine = BlockingChart.Plot.Add.HorizontalLine(baseline.Mean); + meanLine.Color = ScottPlot.Color.FromHex("#E57373").WithAlpha(60); + meanLine.LinePattern = ScottPlot.LinePattern.Dashed; + meanLine.LineWidth = 1; + } + BlockingChart.Plot.Axes.DateTimeTicksBottom(); BlockingChart.Plot.Axes.Bottom.TickLabelStyle.IsVisible = false; ReapplyAxisColors(BlockingChart); @@ -212,7 +302,8 @@ private void UpdateBlockingLane(List<(double Time, double Value)> blockingData, private void UpdateLane(ScottPlot.WPF.WpfPlot chart, string title, List<(double Time, double Value)> data, string colorHex, - double? yMin = null, double? yMax = null) + double? yMin = null, double? yMax = null, BaselineBucket? baseline = null, + double minAnomalyValue = 0) { ClearChart(chart); ApplyTheme(chart); @@ -226,6 +317,43 @@ private void UpdateLane(ScottPlot.WPF.WpfPlot chart, string title, var times = data.Select(d => d.Time).ToArray(); var values = data.Select(d => d.Value).ToArray(); + // Render baseline band FIRST (behind the data line) + if (baseline != null && baseline.SampleCount > 0 && baseline.EffectiveStdDev > 0) + { + var upper = baseline.Mean + 2 * baseline.EffectiveStdDev; + var lower = Math.Max(0, baseline.Mean - 2 * baseline.EffectiveStdDev); + + _crosshairManager?.SetLaneBaseline(chart, lower, upper, minAnomalyValue); + + var band = chart.Plot.Add.HorizontalSpan(lower, upper); + band.FillStyle.Color = ScottPlot.Color.FromHex(colorHex).WithAlpha(25); + band.LineStyle.Width = 0; + + var meanLine = chart.Plot.Add.HorizontalLine(baseline.Mean); + meanLine.Color = ScottPlot.Color.FromHex(colorHex).WithAlpha(60); + meanLine.LinePattern = ScottPlot.LinePattern.Dashed; + meanLine.LineWidth = 1; + + // Highlight anomalous points (outside ± 2σ band AND above absolute minimum) + var anomalyIndices = new List(); + for (int i = 0; i < values.Length; i++) + { + if ((values[i] > upper && values[i] >= minAnomalyValue) || values[i] < lower) + anomalyIndices.Add(i); + } + + if (anomalyIndices.Count > 0) + { + var anomalyTimes = anomalyIndices.Select(i => times[i]).ToArray(); + var anomalyValues = anomalyIndices.Select(i => values[i]).ToArray(); + var anomalyScatter = chart.Plot.Add.Scatter(anomalyTimes, anomalyValues); + anomalyScatter.Color = ScottPlot.Color.FromHex("#FF5252"); + anomalyScatter.MarkerSize = 6; + anomalyScatter.MarkerShape = ScottPlot.MarkerShape.FilledCircle; + anomalyScatter.LineWidth = 0; + } + } + var scatter = chart.Plot.Add.Scatter(times, values); scatter.Color = ScottPlot.Color.FromHex(colorHex); scatter.MarkerSize = 0; @@ -289,6 +417,38 @@ private void SyncXAxes(int hoursBack, DateTime? fromDate, DateTime? toDate, doub } } + /// + /// Renders a semi-transparent dashed ghost line for comparison overlay. + /// + private static void AddGhostLine(ScottPlot.WPF.WpfPlot chart, + List<(double Time, double Value)> data, string colorHex) + { + if (data.Count == 0) return; + + var times = data.Select(d => d.Time).ToArray(); + var values = data.Select(d => d.Value).ToArray(); + + var scatter = chart.Plot.Add.Scatter(times, values); + // White-ish ghost line — distinct from the primary colored line + scatter.Color = ScottPlot.Colors.White.WithAlpha(140); + scatter.MarkerSize = 0; + scatter.LineWidth = 1.5f; + scatter.LinePattern = ScottPlot.LinePattern.Dashed; + + chart.Refresh(); + } + + private static string ComparisonLabel((DateTime From, DateTime To) range, + DateTime? fromDate, int hoursBack) + { + var currentStart = fromDate ?? DateTime.UtcNow.AddHours(-hoursBack); + var daysBack = (currentStart - range.From).TotalDays; + + if (Math.Abs(daysBack - 1) < 0.5) return "yesterday"; + if (Math.Abs(daysBack - 7) < 0.5) return "last week"; + return $"{daysBack:N0}d ago"; + } + private static void ClearChart(ScottPlot.WPF.WpfPlot chart) { chart.Reset(); diff --git a/Lite/Controls/ServerTab.xaml b/Lite/Controls/ServerTab.xaml index a845687e..6202f649 100644 --- a/Lite/Controls/ServerTab.xaml +++ b/Lite/Controls/ServerTab.xaml @@ -103,6 +103,16 @@ SelectionChanged="CustomTimeCombo_Changed" ToolTip="Minute"/> + private (DateTime From, DateTime To)? GetComparisonRange() + { + if (CompareToCombo == null || CompareToCombo.SelectedIndex <= 0) return null; + + var hoursBack = GetHoursBack(); + DateTime? fromDate = null, toDate = null; + if (IsCustomRange) + { + var fromLocal = GetDateTimeFromPickers(FromDatePicker!, FromHourCombo, FromMinuteCombo); + var toLocal = GetDateTimeFromPickers(ToDatePicker!, ToHourCombo, ToMinuteCombo); + if (fromLocal.HasValue && toLocal.HasValue) + { + fromDate = ServerTimeHelper.DisplayTimeToServerTime(fromLocal.Value, ServerTimeHelper.CurrentDisplayMode); + toDate = ServerTimeHelper.DisplayTimeToServerTime(toLocal.Value, ServerTimeHelper.CurrentDisplayMode); + } + } + + var currentEnd = toDate ?? DateTime.UtcNow; + var currentStart = fromDate ?? currentEnd.AddHours(-hoursBack); + + return CompareToCombo.SelectedIndex switch + { + 1 => (currentStart.AddDays(-1), currentEnd.AddDays(-1)), // Yesterday + 2 => (currentStart.AddDays(-7), currentEnd.AddDays(-7)), // Last week + 3 => (currentStart.AddDays(-7), currentEnd.AddDays(-7)), // Same day last week + _ => null + }; + } + private async void CustomDateRange_Changed(object sender, SelectionChangedEventArgs e) { if (!IsLoaded || _isRefreshing) return; @@ -1098,7 +1152,8 @@ private async System.Threading.Tasks.Task RefreshOverviewAsync(int hoursBack, Da { try { - await CorrelatedLanes.RefreshAsync(hoursBack, fromDate, toDate); + var comparison = GetComparisonRange(); + await CorrelatedLanes.RefreshAsync(hoursBack, fromDate, toDate, comparison); } catch (Exception ex) { diff --git a/Lite/Helpers/CorrelatedCrosshairManager.cs b/Lite/Helpers/CorrelatedCrosshairManager.cs index da7267cb..1cb52a12 100644 --- a/Lite/Helpers/CorrelatedCrosshairManager.cs +++ b/Lite/Helpers/CorrelatedCrosshairManager.cs @@ -15,6 +15,7 @@ using System.Windows; using System.Windows.Controls; using System.Windows.Controls.Primitives; +using System.Windows.Documents; using System.Windows.Input; using System.Windows.Media; using PerformanceMonitorLite.Services; @@ -62,14 +63,13 @@ public CorrelatedCrosshairManager() /// /// Registers a chart lane for crosshair synchronization. /// - public void AddLane(ScottPlot.WPF.WpfPlot chart, string label, string unit, TextBlock valueLabel) + public void AddLane(ScottPlot.WPF.WpfPlot chart, string label, string unit) { var lane = new LaneInfo { Chart = chart, Label = label, - Unit = unit, - ValueLabel = valueLabel + Unit = unit }; chart.MouseMove += (s, e) => OnMouseMove(lane, e); @@ -78,6 +78,20 @@ public void AddLane(ScottPlot.WPF.WpfPlot chart, string label, string unit, Text _lanes.Add(lane); } + /// + /// Sets the expected baseline range for a lane (upper/lower bounds). + /// Values outside this range get ▲/▼ indicators in the tooltip. + /// + public void SetLaneBaseline(ScottPlot.WPF.WpfPlot chart, double lower, double upper, + double minAnomalyValue = 0) + { + var lane = _lanes.Find(l => l.Chart == chart); + if (lane == null) return; + lane.BaselineLower = lower; + lane.BaselineUpper = upper; + lane.MinAnomalyValue = minAnomalyValue; + } + /// /// Sets a single data series for a lane (most lanes have one series). /// @@ -117,6 +131,16 @@ public void AddLaneSeries(ScottPlot.WPF.WpfPlot chart, string name, string unit, }); } + /// + /// Sets the label shown in the tooltip for comparison data (e.g., "yesterday"). + /// + public void SetComparisonLabel(string label) + { + _comparisonLabel = label; + } + + private string? _comparisonLabel; + /// /// Clears data and VLines. Call before re-populating charts. /// @@ -124,10 +148,14 @@ public void PrepareForRefresh() { _isRefreshing = true; _tooltip.IsOpen = false; + _comparisonLabel = null; foreach (var lane in _lanes) { lane.Series.Clear(); lane.VLine = null; + lane.BaselineUpper = null; + lane.BaselineLower = null; + lane.MinAnomalyValue = 0; } } @@ -165,10 +193,14 @@ private void OnMouseMove(LaneInfo sourceLane, MouseEventArgs e) var mouseCoords = sourceLane.Chart.Plot.GetCoordinates(pixel); double xValue = mouseCoords.X; - var tooltipLines = new List(); + _tooltipText.Inlines.Clear(); var time = DateTime.FromOADate(xValue); var displayTime = ServerTimeHelper.ConvertForDisplay(time, ServerTimeHelper.CurrentDisplayMode); - tooltipLines.Add(displayTime.ToString("yyyy-MM-dd HH:mm:ss")); + _tooltipText.Inlines.Add(new Run(displayTime.ToString("yyyy-MM-dd HH:mm:ss"))); + if (_comparisonLabel != null) + _tooltipText.Inlines.Add(new Run($" (dashed = {_comparisonLabel})") { Foreground = DimBrush }); + + var defaultBrush = new SolidColorBrush(Color.FromRgb(0xE0, 0xE0, 0xE0)); foreach (var lane in _lanes) { @@ -179,51 +211,47 @@ private void OnMouseMove(LaneInfo sourceLane, MouseEventArgs e) if (lane.Series.Count == 1) { - // Single series — use lane label and unit var series = lane.Series[0]; double? value = FindNearestValue(series, xValue); if (value.HasValue) { - lane.ValueLabel.Text = $"{value.Value:N1} {lane.Unit}"; - tooltipLines.Add($"{lane.Label}: {value.Value:N1} {lane.Unit}"); + var indicator = GetBaselineIndicator(lane, value.Value); + + // Tooltip: value + arrow + "30d avg" context + _tooltipText.Inlines.Add(new Run($"\n{lane.Label}: {value.Value:N1} {lane.Unit}") { Foreground = defaultBrush }); + if (indicator != null) + { + _tooltipText.Inlines.Add(new Run($" {indicator.Value.Symbol}") { Foreground = indicator.Value.Brush }); + var ctx = FormatBaselineContext(lane); + if (ctx != null) + _tooltipText.Inlines.Add(new Run($" ({ctx})") { Foreground = DimBrush }); + } } else { - lane.ValueLabel.Text = ""; - tooltipLines.Add($"{lane.Label}: —"); + _tooltipText.Inlines.Add(new Run($"\n{lane.Label}: —") { Foreground = defaultBrush }); } } else if (lane.Series.Count > 1) { - // Multiple series — show each with its own name - var valueParts = new List(); foreach (var series in lane.Series) { double? value = FindNearestValue(series, xValue); string unit = series.Unit ?? lane.Unit; if (value.HasValue) - { - valueParts.Add($"{value.Value:N0}"); - tooltipLines.Add($"{series.Name}: {value.Value:N0} {unit}"); - } + _tooltipText.Inlines.Add(new Run($"\n{series.Name}: {value.Value:N0} {unit}") { Foreground = defaultBrush }); else - { - tooltipLines.Add($"{series.Name}: —"); - } + _tooltipText.Inlines.Add(new Run($"\n{series.Name}: —") { Foreground = defaultBrush }); } - lane.ValueLabel.Text = valueParts.Count > 0 ? string.Join("/", valueParts) : ""; } else { - lane.ValueLabel.Text = ""; - tooltipLines.Add($"{lane.Label}: —"); + _tooltipText.Inlines.Add(new Run($"\n{lane.Label}: —") { Foreground = defaultBrush }); } lane.Chart.Refresh(); } - - _tooltipText.Text = string.Join("\n", tooltipLines); _tooltip.PlacementTarget = sourceLane.Chart; _tooltip.HorizontalOffset = pos.X + 15; _tooltip.VerticalOffset = pos.Y + 15; @@ -265,6 +293,32 @@ private void OnMouseMove(LaneInfo sourceLane, MouseEventArgs e) return val; } + private static readonly SolidColorBrush RedBrush = new(Color.FromRgb(0xFF, 0x52, 0x52)); + private static readonly SolidColorBrush GreenBrush = new(Color.FromRgb(0x69, 0xF0, 0x69)); + private static readonly SolidColorBrush DimBrush = new(Color.FromRgb(0x90, 0x96, 0xA0)); + + private record struct BaselineIndicator(string Symbol, SolidColorBrush Brush); + + private static string? FormatBaselineContext(LaneInfo lane) + { + if (lane.BaselineUpper == null || lane.BaselineLower == null) return null; + var mean = (lane.BaselineUpper.Value + lane.BaselineLower.Value) / 2.0; + var formatted = mean >= 1000 ? $"{mean:N0}" : mean >= 10 ? $"{mean:N1}" : $"{mean:N2}"; + return $"30d avg: ~{formatted}"; + } + + private static BaselineIndicator? GetBaselineIndicator(LaneInfo lane, double value) + { + if (lane.BaselineUpper == null || lane.BaselineLower == null) return null; + // ▲ requires both: outside band AND above absolute minimum (prevents 1% CPU false alarms) + if (value > lane.BaselineUpper.Value && value >= lane.MinAnomalyValue) + return new BaselineIndicator("▲", RedBrush); + // ▼ always shown when below band (drops are always interesting — tuning feedback) + if (value < lane.BaselineLower.Value) + return new BaselineIndicator("▼", GreenBrush); + return null; + } + private void OnMouseLeave() { _tooltip.IsOpen = false; @@ -272,7 +326,6 @@ private void OnMouseLeave() { if (lane.VLine != null) lane.VLine.IsVisible = false; - lane.ValueLabel.Text = ""; lane.Chart.Refresh(); } } @@ -303,7 +356,9 @@ private class LaneInfo public string Label { get; set; } = ""; public string Unit { get; set; } = ""; public ScottPlot.Plottables.VerticalLine? VLine { get; set; } - public TextBlock ValueLabel { get; set; } = null!; public List Series { get; set; } = new(); + public double? BaselineUpper { get; set; } + public double? BaselineLower { get; set; } + public double MinAnomalyValue { get; set; } } } diff --git a/Lite/Mcp/McpAnalysisTools.cs b/Lite/Mcp/McpAnalysisTools.cs index 9544b46f..c29dca0e 100644 --- a/Lite/Mcp/McpAnalysisTools.cs +++ b/Lite/Mcp/McpAnalysisTools.cs @@ -9,7 +9,7 @@ namespace PerformanceMonitorLite.Mcp; [McpServerToolType] public sealed class McpAnalysisTools { - [McpServerTool(Name = "analyze_server"), Description("Runs the diagnostic inference engine against a server's collected data. Scores wait stats, blocking, memory, config, and other facts, then traverses a relationship graph to build evidence-backed stories about what's wrong and why. Returns structured findings with severity scores, evidence chains, and recommended next tools to call. The AI client should interpret the findings and provide recommendations — the engine provides the reasoning, not the prose.")] + [McpServerTool(Name = "analyze_server"), Description("Runs the diagnostic inference engine against a server's collected data. Scores wait stats, blocking, memory, config, and other facts, then traverses a relationship graph to build evidence-backed stories about what's wrong and why. Anomaly detection compares the analysis window against 30-day time-bucketed baselines (hour-of-day x day-of-week) to identify deviations that are unusual for this specific time slot, not just unusual overall. Returns structured findings with severity scores, evidence chains, baseline context for anomalies, and recommended next tools to call.")] public static async Task AnalyzeServer( AnalysisService analysisService, ServerManager serverManager, @@ -162,13 +162,13 @@ public static async Task GetAnalysisFacts( } } - [McpServerTool(Name = "compare_analysis"), Description("Compares two time periods by running the inference engine's fact collection and scoring on each, then showing what changed. Use this to compare peak vs off-peak, before vs after a change, or yesterday vs today. Returns facts from both periods side-by-side with severity deltas.")] + [McpServerTool(Name = "compare_analysis"), Description("Compares two time periods by running the inference engine's fact collection and scoring on each, then showing what changed. Use this to compare peak vs off-peak, before vs after a change, or yesterday vs today. Returns facts from both periods side-by-side with severity deltas. Note: for routine anomaly detection, use analyze_server instead — it automatically compares against 30-day time-bucketed baselines (hour-of-day x day-of-week). This tool is for explicit window-to-window comparisons.")] public static async Task CompareAnalysis( AnalysisService analysisService, ServerManager serverManager, [Description("Server name or display name.")] string? server_name = null, [Description("Hours back for the comparison (recent) period. Default 4.")] int hours_back = 4, - [Description("Hours back for the baseline period start, measured from now. Default 28 (yesterday same time, assuming 4-hour windows). The baseline period will be the same duration as the comparison period.")] int baseline_hours_back = 28) + [Description("Hours back for the baseline period start, measured from now. Default 28 (yesterday same time). The baseline period will be the same duration as the comparison period.")] int baseline_hours_back = 28) { var resolved = ServerResolver.Resolve(serverManager, server_name); if (resolved == null) @@ -833,6 +833,48 @@ public static List GetForStoryPath(string storyPath) return result; } + + private static readonly string[] DayNames = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]; + + /// + /// Formats baseline context from anomaly fact metadata into a human-readable object + /// for MCP output. Example: "4.1σ above baseline for Tue 14:00, mean 68.2" + /// + private static object? FormatBaselineContext(Dictionary metadata) + { + var result = new Dictionary(); + + if (metadata.TryGetValue("deviation_sigma", out var sigma)) + result["deviation"] = $"{sigma:F1}σ"; + + if (metadata.TryGetValue("ratio", out var ratio)) + result["ratio"] = $"{ratio:F1}x"; + + if (metadata.TryGetValue("baseline_mean", out var mean)) + result["baseline_mean"] = Math.Round(mean, 2); + + if (metadata.TryGetValue("baseline_mean_ms", out var meanMs)) + result["baseline_mean"] = Math.Round(meanMs, 2); + + if (metadata.TryGetValue("baseline_stddev", out var stddev)) + result["baseline_stddev"] = Math.Round(stddev, 2); + + if (metadata.TryGetValue("baseline_hour", out var hour) && + metadata.TryGetValue("baseline_dow", out var dow)) + { + var dowIdx = (int)dow; + var dayName = dowIdx >= 0 && dowIdx < DayNames.Length ? DayNames[dowIdx] : "?"; + result["bucket"] = hour >= 0 ? $"{dayName} {(int)hour:00}:00" : "flat"; + } + + if (metadata.TryGetValue("baseline_tier", out var tier)) + result["tier"] = tier switch { 0 => "full", 1 => "hour_only", _ => "flat" }; + + if (metadata.TryGetValue("baseline_samples", out var samples)) + result["baseline_samples"] = (int)samples; + + return result.Count > 0 ? result : null; + } } internal record ToolRecommendation( diff --git a/Lite/Services/LocalDataService.Baselines.cs b/Lite/Services/LocalDataService.Baselines.cs new file mode 100644 index 00000000..93fd344b --- /dev/null +++ b/Lite/Services/LocalDataService.Baselines.cs @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2026 Erik Darling, Darling Data LLC + * + * This file is part of the SQL Server Performance Monitor Lite. + * + * Licensed under the MIT License. See LICENSE file in the project root for full license information. + */ + +using PerformanceMonitorLite.Analysis; + +namespace PerformanceMonitorLite.Services; + +public partial class LocalDataService +{ + private BaselineProvider? _baselineProvider; + + private BaselineProvider GetBaselineProvider() + { + return _baselineProvider ??= new BaselineProvider(_duckDb); + } + + /// + /// Gets the baseline (mean ± stddev) for a metric at a specific time. + /// Returns null if no baseline data is available. + /// + public async Task GetBaselineForLaneAsync( + int serverId, string metricName, DateTime referenceTime) + { + var baseline = await GetBaselineProvider().GetBaselineAsync(serverId, metricName, referenceTime); + return baseline.SampleCount > 0 ? baseline : BaselineBucket.Empty; + } +} From e2dfcdb06c7a00b48e3f1cf53d94f51e52ca172a Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Mon, 6 Apr 2026 23:32:00 -0400 Subject: [PATCH 3/3] Port baseline engine and anomaly detection to Dashboard (Phases 5-8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add SqlServerBaselineProvider: time-bucketed baselines from SQL Server collect.* tables with @@DATEFIRST normalization, CTE restart exclusion, bucket collapse hierarchy (full → hour-only → flat), 1-hour TTL cache - Upgrade SqlServerAnomalyDetector: inject baseline provider, upgrade 4 existing detectors to time-bucketed baselines, add 3 new detectors (batch requests, sessions, query duration) - Update FactScorer: score ANOMALY_BATCH_REQUESTS, ANOMALY_SESSION_SPIKE, ANOMALY_QUERY_DURATION (deviation-based) - Port correlated timeline lanes UI: baseline bands, anomaly dots, Compare dropdown with ghost line overlay, remove right-side value labels, rename Memory MB → Buffer Pool MB, fix auto-refresh flicker - Port crosshair manager: ▲/▼ baseline indicators, comparison label, event-based spike detection for blocking/deadlocks - Fix pre-existing CPU tooltip bug: data returned DESC, binary search assumed ASC — added OrderBy - Fix event-based baseline: use wait_stats spine for zero-inclusive baselines, handle zero-mean EffectiveStdDev for event metrics - Cross-server validated on sql2016–sql2025, all queries < 250ms Co-Authored-By: Claude Opus 4.6 (1M context) --- Dashboard/Analysis/AnalysisService.cs | 4 +- Dashboard/Analysis/FactScorer.cs | 9 +- .../Analysis/SqlServerAnomalyDetector.cs | 1251 ++++++++++------- .../Analysis/SqlServerBaselineProvider.cs | 525 +++++++ .../CorrelatedTimelineLanesControl.xaml | 24 +- .../CorrelatedTimelineLanesControl.xaml.cs | 244 +++- .../Controls/ResourceMetricsContent.xaml | 16 +- .../Controls/ResourceMetricsContent.xaml.cs | 36 +- .../Helpers/CorrelatedCrosshairManager.cs | 115 +- Dashboard/ServerTab.xaml.cs | 3 +- .../CorrelatedTimelineLanesControl.xaml.cs | 2 +- Lite/Helpers/CorrelatedCrosshairManager.cs | 18 +- 12 files changed, 1609 insertions(+), 638 deletions(-) create mode 100644 Dashboard/Analysis/SqlServerBaselineProvider.cs diff --git a/Dashboard/Analysis/AnalysisService.cs b/Dashboard/Analysis/AnalysisService.cs index a0d7b2fc..e982c19f 100644 --- a/Dashboard/Analysis/AnalysisService.cs +++ b/Dashboard/Analysis/AnalysisService.cs @@ -23,6 +23,7 @@ public class AnalysisService private readonly InferenceEngine _engine; private readonly SqlServerDrillDownCollector _drillDown; private readonly SqlServerAnomalyDetector _anomalyDetector; + private readonly SqlServerBaselineProvider _baselineProvider; /// /// Minimum hours of collected data required before analysis will run. @@ -60,7 +61,8 @@ public AnalysisService(string connectionString, IPlanFetcher? planFetcher = null _graph = new RelationshipGraph(); _engine = new InferenceEngine(_graph); _drillDown = new SqlServerDrillDownCollector(connectionString, planFetcher); - _anomalyDetector = new SqlServerAnomalyDetector(connectionString); + _baselineProvider = new SqlServerBaselineProvider(connectionString); + _anomalyDetector = new SqlServerAnomalyDetector(connectionString, _baselineProvider); } /// diff --git a/Dashboard/Analysis/FactScorer.cs b/Dashboard/Analysis/FactScorer.cs index 78347953..5605a88b 100644 --- a/Dashboard/Analysis/FactScorer.cs +++ b/Dashboard/Analysis/FactScorer.cs @@ -308,9 +308,12 @@ private static double ScoreBadActorFact(Fact fact) /// private static double ScoreAnomalyFact(Fact fact) { - if ( fact.Key.StartsWith("ANOMALY_CPU_SPIKE" , StringComparison.OrdinalIgnoreCase) - || fact.Key.StartsWith("ANOMALY_READ_LATENCY" , StringComparison.OrdinalIgnoreCase) - || fact.Key.StartsWith("ANOMALY_WRITE_LATENCY", StringComparison.OrdinalIgnoreCase) + if ( fact.Key.StartsWith("ANOMALY_CPU_SPIKE" , StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_READ_LATENCY" , StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_WRITE_LATENCY" , StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_BATCH_REQUESTS", StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_SESSION_SPIKE" , StringComparison.OrdinalIgnoreCase) + || fact.Key.StartsWith("ANOMALY_QUERY_DURATION", StringComparison.OrdinalIgnoreCase) ) { // Deviation-based scoring: 2σ = 0.5, 4σ = 1.0 diff --git a/Dashboard/Analysis/SqlServerAnomalyDetector.cs b/Dashboard/Analysis/SqlServerAnomalyDetector.cs index bdf6664a..82d26294 100644 --- a/Dashboard/Analysis/SqlServerAnomalyDetector.cs +++ b/Dashboard/Analysis/SqlServerAnomalyDetector.cs @@ -1,543 +1,708 @@ -using System; -using System.Collections.Generic; -using System.Threading.Tasks; -using Microsoft.Data.SqlClient; -using PerformanceMonitorDashboard.Helpers; - -namespace PerformanceMonitorDashboard.Analysis; - -/// -/// Detects anomalies by comparing the analysis window's metrics against a -/// baseline period. When a metric deviates significantly from baseline -/// (mean + standard deviation), an ANOMALY fact is emitted. -/// -/// This is the "oh shit" mode -- detecting acute deviations that don't show -/// up in aggregate analysis because they're brief. A 5-minute CPU spike -/// that averages out over 4 hours is invisible to aggregate scoring but -/// obvious when compared against "what was this metric doing before?" -/// -/// Baseline selection: uses the 24 hours preceding the analysis window. -/// If less data is available, uses whatever exists with lower confidence. -/// -/// Port of Lite's AnomalyDetector -- uses SQL Server collect.* tables instead of DuckDB views. -/// No server_id filtering -- Dashboard monitors one server per database. -/// -public class SqlServerAnomalyDetector -{ - private readonly string _connectionString; - - /// - /// Minimum number of baseline samples needed for reliable detection. - /// Below this, anomalies are still detected but with reduced confidence. - /// - private const int MinBaselineSamples = 10; - - /// - /// Number of standard deviations above baseline mean to flag as anomalous. - /// - private const double DeviationThreshold = 2.0; - - public SqlServerAnomalyDetector(string connectionString) - { - _connectionString = connectionString; - } - - /// - /// Detects anomalies by comparing the analysis window against a baseline period. - /// Returns anomaly facts to be merged into the main fact list. - /// - public async Task> DetectAnomaliesAsync(AnalysisContext context) - { - var anomalies = new List(); - - // Baseline: 24 hours preceding the analysis window - var baselineEnd = context.TimeRangeStart; - var baselineStart = baselineEnd.AddHours(-24); - - // Check if baseline period has any data at all -- if not, skip all anomaly detection. - // Without baseline data, everything looks anomalous. - if (!await HasBaselineDataAsync(baselineStart, baselineEnd)) - return anomalies; - - await DetectCpuAnomalies(context, baselineStart, baselineEnd, anomalies); - await DetectWaitAnomalies(context, baselineStart, baselineEnd, anomalies); - await DetectBlockingAnomalies(context, baselineStart, baselineEnd, anomalies); - await DetectIoAnomalies(context, baselineStart, baselineEnd, anomalies); - - return anomalies; - } - - /// - /// Checks if the baseline period has any collected data. - /// Uses wait_stats as canary -- if waits are collected, other data is too. - /// - private async Task HasBaselineDataAsync(DateTime baselineStart, DateTime baselineEnd) - { - try - { - using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(); - - using var cmd = connection.CreateCommand(); - cmd.CommandText = @" -SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; - -SELECT - (SELECT COUNT(*) FROM collect.wait_stats - WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) - + (SELECT COUNT(*) FROM collect.cpu_utilization_stats - WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd);"; - - cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart)); - cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd)); - - var count = Convert.ToInt64(await cmd.ExecuteScalarAsync() ?? 0); - return count > 0; - } - catch { return false; } - } - - /// - /// Detects CPU utilization anomalies by comparing per-sample values - /// against the baseline distribution. - /// - private async Task DetectCpuAnomalies(AnalysisContext context, - DateTime baselineStart, DateTime baselineEnd, List anomalies) - { - try - { - using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(); - - // Get baseline stats - using var baselineCmd = connection.CreateCommand(); - baselineCmd.CommandText = @" -SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; - -SELECT - AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS mean_cpu, - STDEV(CAST(sqlserver_cpu_utilization AS FLOAT)) AS stddev_cpu, - COUNT(*) AS sample_count -FROM collect.cpu_utilization_stats -WHERE collection_time >= @baselineStart -AND collection_time < @baselineEnd;"; - - baselineCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart)); - baselineCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd)); - - double baselineMean = 0, baselineStdDev = 0; - long baselineSamples = 0; - - using (var reader = await baselineCmd.ExecuteReaderAsync()) - { - if (await reader.ReadAsync()) - { - baselineMean = reader.IsDBNull(0) ? 0 : Convert.ToDouble(reader.GetValue(0)); - baselineStdDev = reader.IsDBNull(1) ? 0 : Convert.ToDouble(reader.GetValue(1)); - baselineSamples = reader.IsDBNull(2) ? 0 : Convert.ToInt64(reader.GetValue(2)); - } - } - - if (baselineSamples < 3 || baselineStdDev <= 0) return; - - // Get peak and average in the analysis window - using var windowCmd = connection.CreateCommand(); - windowCmd.CommandText = @" -SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; - -SELECT - MAX(sqlserver_cpu_utilization) AS peak_cpu, - AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS avg_cpu, - COUNT(*) AS sample_count, - (SELECT TOP 1 collection_time FROM collect.cpu_utilization_stats - WHERE collection_time >= @windowStart AND collection_time < @windowEnd - ORDER BY sqlserver_cpu_utilization DESC) AS peak_time -FROM collect.cpu_utilization_stats -WHERE collection_time >= @windowStart -AND collection_time < @windowEnd;"; - - windowCmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); - windowCmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); - - using var windowReader = await windowCmd.ExecuteReaderAsync(); - if (!await windowReader.ReadAsync()) return; - - var peakCpu = windowReader.IsDBNull(0) ? 0.0 : Convert.ToDouble(windowReader.GetValue(0)); - var avgCpu = windowReader.IsDBNull(1) ? 0.0 : Convert.ToDouble(windowReader.GetValue(1)); - var windowSamples = windowReader.IsDBNull(2) ? 0L : Convert.ToInt64(windowReader.GetValue(2)); - var peakTime = windowReader.IsDBNull(3) ? (DateTime?)null : windowReader.GetDateTime(3); - - if (windowSamples == 0) return; - - // Check if peak deviates significantly from baseline - var deviation = (peakCpu - baselineMean) / baselineStdDev; - if (deviation < DeviationThreshold || peakCpu < 50) return; // Don't flag low absolute values - - var confidence = baselineSamples >= MinBaselineSamples ? 1.0 : (double)baselineSamples / MinBaselineSamples; - - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_CPU_SPIKE", - Value = peakCpu, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["peak_cpu"] = peakCpu, - ["avg_cpu_in_window"] = avgCpu, - ["baseline_mean"] = baselineMean, - ["baseline_stddev"] = baselineStdDev, - ["deviation_sigma"] = deviation, - ["baseline_samples"] = baselineSamples, - ["window_samples"] = windowSamples, - ["confidence"] = confidence, - ["peak_time_ticks"] = peakTime?.Ticks ?? 0 - } - }); - } - catch (Exception ex) - { - Logger.Error($"[SqlServerAnomalyDetector] CPU anomaly detection failed: {ex.Message}"); - } - } - - /// - /// Detects wait stat anomalies -- significant waits in the analysis window - /// that were absent or much lower in the baseline. - /// - private async Task DetectWaitAnomalies(AnalysisContext context, - DateTime baselineStart, DateTime baselineEnd, List anomalies) - { - try - { - using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(); - - // Check if baseline has any wait data at all -- if not, skip - using var checkCmd = connection.CreateCommand(); - checkCmd.CommandText = @" -SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; - -SELECT COUNT(*) FROM collect.wait_stats -WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd;"; - - checkCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart)); - checkCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd)); - - var baselineCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0); - if (baselineCount == 0) return; - - // Get per-wait-type totals in both windows - using var cmd = connection.CreateCommand(); - cmd.CommandText = @" -SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; - -;WITH baseline AS ( - SELECT wait_type, - CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_ms - FROM collect.wait_stats - WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd - AND wait_time_ms_delta > 0 - GROUP BY wait_type -), -current_window AS ( - SELECT wait_type, - CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_ms - FROM collect.wait_stats - WHERE collection_time >= @windowStart AND collection_time <= @windowEnd - AND wait_time_ms_delta > 0 - GROUP BY wait_type -) -SELECT TOP 10 - c.wait_type, - c.total_ms AS current_ms, - COALESCE(b.total_ms, 0) AS baseline_ms -FROM current_window c -LEFT JOIN baseline b ON c.wait_type = b.wait_type -WHERE c.total_ms > 10000 -- At least 10 seconds of wait time -ORDER BY c.total_ms DESC;"; - - cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart)); - cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd)); - cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); - cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); - - using var reader = await cmd.ExecuteReaderAsync(); - while (await reader.ReadAsync()) - { - var waitType = reader.GetString(0); - var currentMs = Convert.ToInt64(reader.GetValue(1)); - var baselineMs = Convert.ToInt64(reader.GetValue(2)); - - // Normalize to per-hour rates before comparing (windows are different lengths) - var baselineHours = (baselineEnd - baselineStart).TotalHours; - var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours; - if (baselineHours <= 0) baselineHours = 1; - if (currentHours <= 0) currentHours = 1; - - double ratio; - string anomalyType; - - if (baselineMs == 0) - { - ratio = currentMs > 60_000 ? 100.0 : 0; // Only flag if > 1 minute total - anomalyType = "new"; - } - else - { - var baselineRate = baselineMs / baselineHours; - var currentRate = currentMs / currentHours; - ratio = baselineRate > 0 ? currentRate / baselineRate : 100.0; - anomalyType = "spike"; - } - - if (ratio < 5.0) continue; // Need at least 5x increase - - anomalies.Add(new Fact - { - Source = "anomaly", - Key = $"ANOMALY_WAIT_{waitType}", - Value = currentMs, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_ms"] = currentMs, - ["baseline_ms"] = baselineMs, - ["ratio"] = ratio, - ["is_new"] = anomalyType == "new" ? 1 : 0 - } - }); - } - } - catch (Exception ex) - { - Logger.Error($"[SqlServerAnomalyDetector] Wait anomaly detection failed: {ex.Message}"); - } - } - - /// - /// Detects blocking/deadlock anomalies -- events in the analysis window - /// that are significantly above baseline rates. - /// - private async Task DetectBlockingAnomalies(AnalysisContext context, - DateTime baselineStart, DateTime baselineEnd, List anomalies) - { - try - { - using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(); - - // Check if baseline period has any data at all - using var checkCmd = connection.CreateCommand(); - checkCmd.CommandText = @" -SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; - -SELECT - (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport - WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) - + (SELECT COUNT(*) FROM collect.deadlocks - WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) - + (SELECT COUNT(*) FROM collect.wait_stats - WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd);"; - - checkCmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart)); - checkCmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd)); - - var baselineDataCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0); - if (baselineDataCount == 0) return; // No baseline data = can't detect anomaly - - using var cmd = connection.CreateCommand(); - cmd.CommandText = @" -SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; - -SELECT - (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport - WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) AS baseline_blocking, - (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport - WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_blocking, - (SELECT COUNT(*) FROM collect.deadlocks - WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd) AS baseline_deadlocks, - (SELECT COUNT(*) FROM collect.deadlocks - WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_deadlocks;"; - - cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart)); - cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd)); - cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); - cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); - - using var reader = await cmd.ExecuteReaderAsync(); - if (!await reader.ReadAsync()) return; - - var baselineBlocking = Convert.ToInt64(reader.GetValue(0)); - var currentBlocking = Convert.ToInt64(reader.GetValue(1)); - var baselineDeadlocks = Convert.ToInt64(reader.GetValue(2)); - var currentDeadlocks = Convert.ToInt64(reader.GetValue(3)); - - // Normalize to per-hour rates (windows are different lengths) - var baselineHours = (baselineEnd - baselineStart).TotalHours; - var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours; - if (baselineHours <= 0) baselineHours = 1; - if (currentHours <= 0) currentHours = 1; - - var baselineBlockingRate = baselineBlocking / baselineHours; - var currentBlockingRate = currentBlocking / currentHours; - var blockingRatio = baselineBlocking > 0 ? currentBlockingRate / baselineBlockingRate : 100.0; - - var baselineDeadlockRate = baselineDeadlocks / baselineHours; - var currentDeadlockRate = currentDeadlocks / currentHours; - var deadlockRatio = baselineDeadlocks > 0 ? currentDeadlockRate / baselineDeadlockRate : 100.0; - - // Blocking spike: at least 5 events AND 3x baseline rate (or new) - if (currentBlocking >= 5 && (baselineBlocking == 0 || blockingRatio >= 3)) - { - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_BLOCKING_SPIKE", - Value = currentBlocking, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_count"] = currentBlocking, - ["baseline_count"] = baselineBlocking, - ["ratio"] = blockingRatio - } - }); - } - - // Deadlock spike: at least 3 events AND 3x baseline rate (or new) - if (currentDeadlocks >= 3 && (baselineDeadlocks == 0 || deadlockRatio >= 3)) - { - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_DEADLOCK_SPIKE", - Value = currentDeadlocks, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_count"] = currentDeadlocks, - ["baseline_count"] = baselineDeadlocks, - ["ratio"] = deadlockRatio - } - }); - } - } - catch (Exception ex) - { - Logger.Error($"[SqlServerAnomalyDetector] Blocking anomaly detection failed: {ex.Message}"); - } - } - - /// - /// Detects I/O latency anomalies -- significant increase in read/write latency - /// compared to baseline. - /// - private async Task DetectIoAnomalies(AnalysisContext context, - DateTime baselineStart, DateTime baselineEnd, List anomalies) - { - try - { - using var connection = new SqlConnection(_connectionString); - await connection.OpenAsync(); - - using var cmd = connection.CreateCommand(); - cmd.CommandText = @" -SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; - -;WITH baseline AS ( - SELECT - AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_lat, - AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_lat, - STDEV(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS stddev_read, - STDEV(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS stddev_write, - COUNT(*) AS samples - FROM collect.file_io_stats - WHERE collection_time >= @baselineStart AND collection_time < @baselineEnd - AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0) -), -current_window AS ( - SELECT - AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_lat, - AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_lat - FROM collect.file_io_stats - WHERE collection_time >= @windowStart AND collection_time <= @windowEnd - AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0) -) -SELECT b.avg_read_lat, b.stddev_read, c.avg_read_lat, - b.avg_write_lat, b.stddev_write, c.avg_write_lat, - b.samples -FROM baseline b, current_window c;"; - - cmd.Parameters.Add(new SqlParameter("@baselineStart", baselineStart)); - cmd.Parameters.Add(new SqlParameter("@baselineEnd", baselineEnd)); - cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); - cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); - - using var reader = await cmd.ExecuteReaderAsync(); - if (!await reader.ReadAsync()) return; - - var baselineReadLat = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); - var stddevRead = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); - var currentReadLat = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)); - var baselineWriteLat = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)); - var stddevWrite = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4)); - var currentWriteLat = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5)); - var samples = reader.IsDBNull(6) ? 0L : Convert.ToInt64(reader.GetValue(6)); - - if (samples < 3) return; - - // Read latency anomaly - if (stddevRead > 0 && currentReadLat > 10) // At least 10ms to matter - { - var readDeviation = (currentReadLat - baselineReadLat) / stddevRead; - if (readDeviation >= DeviationThreshold) - { - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_READ_LATENCY", - Value = currentReadLat, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_latency_ms"] = currentReadLat, - ["baseline_mean_ms"] = baselineReadLat, - ["baseline_stddev_ms"] = stddevRead, - ["deviation_sigma"] = readDeviation, - ["baseline_samples"] = samples - } - }); - } - } - - // Write latency anomaly - if (stddevWrite > 0 && currentWriteLat > 5) // At least 5ms to matter - { - var writeDeviation = (currentWriteLat - baselineWriteLat) / stddevWrite; - if (writeDeviation >= DeviationThreshold) - { - anomalies.Add(new Fact - { - Source = "anomaly", - Key = "ANOMALY_WRITE_LATENCY", - Value = currentWriteLat, - ServerId = context.ServerId, - Metadata = new Dictionary - { - ["current_latency_ms"] = currentWriteLat, - ["baseline_mean_ms"] = baselineWriteLat, - ["baseline_stddev_ms"] = stddevWrite, - ["deviation_sigma"] = writeDeviation, - ["baseline_samples"] = samples - } - }); - } - } - } - catch (Exception ex) - { - Logger.Error($"[SqlServerAnomalyDetector] I/O anomaly detection failed: {ex.Message}"); - } - } -} +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Detects anomalies by comparing the analysis window's metrics against +/// time-bucketed baselines (hour-of-day x day-of-week, 30-day rolling window). +/// +/// Two detection patterns: +/// - Z-score: (observed - mean) / stddev — used for continuous metrics +/// (CPU, batch requests, I/O latency, session counts, query duration) +/// - Ratio: currentRate / baselineRate — used for rate/event metrics +/// (wait stats, blocking, deadlocks) +/// +/// Baseline computation and caching are handled by SqlServerBaselineProvider. +/// +/// Port of Lite's AnomalyDetector — uses SQL Server collect.* tables instead of DuckDB views. +/// No server_id filtering — Dashboard monitors one server per database. +/// No memory metric — Dashboard doesn't collect memory stats. +/// +public class SqlServerAnomalyDetector +{ + private readonly string _connectionString; + private readonly SqlServerBaselineProvider _baselineProvider; + + /// + /// Default number of standard deviations above baseline mean to flag as anomalous. + /// + private const double DefaultDeviationThreshold = 2.0; + + /// + /// Default ratio threshold for rate-based anomaly detection (wait stats). + /// + private const double DefaultRatioThreshold = 5.0; + + /// + /// Default ratio threshold for event-based anomaly detection (blocking/deadlocks). + /// + private const double DefaultEventRatioThreshold = 3.0; + + /// + /// Per-metric deviation thresholds. Metrics not listed use DefaultDeviationThreshold. + /// + private readonly Dictionary _deviationThresholds = new(); + + public SqlServerAnomalyDetector(string connectionString, SqlServerBaselineProvider baselineProvider) + { + _connectionString = connectionString; + _baselineProvider = baselineProvider; + } + + /// + /// Sets a custom deviation threshold for a specific metric. + /// + public void SetDeviationThreshold(string metricName, double threshold) + { + _deviationThresholds[metricName] = threshold; + } + + private double GetDeviationThreshold(string metricName) + { + return _deviationThresholds.TryGetValue(metricName, out var threshold) + ? threshold + : DefaultDeviationThreshold; + } + + /// + /// Adds baseline context metadata to an anomaly fact's metadata dictionary. + /// + private static void AddBaselineContext(Dictionary metadata, BaselineBucket baseline) + { + metadata["baseline_hour"] = baseline.HourOfDay; + metadata["baseline_dow"] = baseline.DayOfWeek; + metadata["baseline_tier"] = (double)baseline.Tier; + } + + /// + /// Detects anomalies by comparing the analysis window against time-bucketed baselines. + /// Returns anomaly facts to be merged into the main fact list. + /// + public async Task> DetectAnomaliesAsync(AnalysisContext context) + { + var anomalies = new List(); + + // Check if baseline period has any data at all — if not, skip all anomaly detection. + if (!await HasBaselineDataAsync()) + return anomalies; + + // Existing detection methods (upgraded to time-bucketed baselines) + await DetectCpuAnomalies(context, anomalies); + await DetectWaitAnomalies(context, anomalies); + await DetectBlockingAnomalies(context, anomalies); + await DetectIoAnomalies(context, anomalies); + + // New detection methods + await DetectBatchRequestAnomalies(context, anomalies); + await DetectSessionAnomalies(context, anomalies); + await DetectQueryDurationAnomalies(context, anomalies); + + return anomalies; + } + + /// + /// Checks if the server has enough historical data for meaningful baselines. + /// Uses wait_stats and cpu_utilization_stats as canary. + /// + private async Task HasBaselineDataAsync() + { + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + (SELECT COUNT(*) FROM collect.wait_stats + WHERE collection_time >= @cutoff) + + (SELECT COUNT(*) FROM collect.cpu_utilization_stats + WHERE collection_time >= @cutoff);"; + + cmd.Parameters.Add(new SqlParameter("@cutoff", DateTime.UtcNow.AddDays(-30))); + + var count = Convert.ToInt64(await cmd.ExecuteScalarAsync() ?? 0); + return count > 0; + } + catch { return false; } + } + + /// + /// Detects CPU utilization anomalies using z-score against time-bucketed baseline. + /// + private async Task DetectCpuAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + SqlServerMetricNames.Cpu, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + MAX(sqlserver_cpu_utilization) AS peak_cpu, + AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS avg_cpu, + COUNT(*) AS sample_count, + (SELECT TOP 1 collection_time FROM collect.cpu_utilization_stats + WHERE collection_time >= @windowStart AND collection_time < @windowEnd + ORDER BY sqlserver_cpu_utilization DESC) AS peak_time +FROM collect.cpu_utilization_stats +WHERE collection_time >= @windowStart +AND collection_time < @windowEnd;"; + + cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var peakCpu = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var avgCpu = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var windowSamples = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + var peakTime = reader.IsDBNull(3) ? (DateTime?)null : reader.GetDateTime(3); + + if (windowSamples == 0) return; + + var deviation = (peakCpu - baseline.Mean) / effectiveStdDev; + if (deviation < GetDeviationThreshold(SqlServerMetricNames.Cpu) || peakCpu < 50) return; + + var metadata = new Dictionary + { + ["peak_cpu"] = peakCpu, + ["avg_cpu_in_window"] = avgCpu, + ["baseline_mean"] = baseline.Mean, + ["baseline_stddev"] = effectiveStdDev, + ["deviation_sigma"] = deviation, + ["baseline_samples"] = baseline.SampleCount, + ["window_samples"] = windowSamples, + ["confidence"] = 1.0, + ["peak_time_ticks"] = peakTime?.Ticks ?? 0 + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_CPU_SPIKE", + Value = peakCpu, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + Logger.Error($"[SqlServerAnomalyDetector] CPU anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects wait stat anomalies — total wait time significantly above + /// baseline rate for this time bucket. Uses ratio-based scoring. + /// + private async Task DetectWaitAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + SqlServerMetricNames.WaitStats, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT TOP 10 + wait_type, + CAST(SUM(wait_time_ms_delta) AS BIGINT) AS total_ms +FROM collect.wait_stats +WHERE collection_time >= @windowStart AND collection_time <= @windowEnd +AND wait_time_ms_delta > 0 +GROUP BY wait_type +HAVING SUM(wait_time_ms_delta) > 10000 +ORDER BY total_ms DESC;"; + + cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); + + var currentHours = (context.TimeRangeEnd - context.TimeRangeStart).TotalHours; + if (currentHours <= 0) currentHours = 1; + + var baselineRate = baseline.SampleCount > 0 ? baseline.Mean : 0; + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var waitType = reader.GetString(0); + var currentMs = Convert.ToInt64(reader.GetValue(1)); + var currentRate = currentMs / currentHours; + + double ratio; + string anomalyType; + + if (baselineRate <= 0 || baseline.SampleCount == 0) + { + ratio = currentMs > 60_000 ? 100.0 : 0; + anomalyType = "new"; + } + else + { + ratio = currentRate / baselineRate; + anomalyType = "spike"; + } + + if (ratio < DefaultRatioThreshold) continue; + + var metadata = new Dictionary + { + ["current_ms"] = currentMs, + ["baseline_mean"] = baseline.Mean, + ["ratio"] = ratio, + ["is_new"] = anomalyType == "new" ? 1 : 0 + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = $"ANOMALY_WAIT_{waitType}", + Value = currentMs, + ServerId = context.ServerId, + Metadata = metadata + }); + } + } + catch (Exception ex) + { + Logger.Error($"[SqlServerAnomalyDetector] Wait anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects blocking/deadlock anomalies — event rates significantly above + /// baseline for this time bucket. Uses ratio-based scoring. + /// + private async Task DetectBlockingAnomalies(AnalysisContext context, List anomalies) + { + try + { + var blockingBaseline = await _baselineProvider.GetBaselineAsync( + SqlServerMetricNames.Blocking, context.TimeRangeStart); + var deadlockBaseline = await _baselineProvider.GetBaselineAsync( + SqlServerMetricNames.Deadlock, context.TimeRangeStart); + + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + (SELECT COUNT(*) FROM collect.blocking_BlockedProcessReport + WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_blocking, + (SELECT COUNT(*) FROM collect.deadlocks + WHERE collection_time >= @windowStart AND collection_time <= @windowEnd) AS current_deadlocks;"; + + cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var currentBlocking = Convert.ToInt64(reader.GetValue(0)); + var currentDeadlocks = Convert.ToInt64(reader.GetValue(1)); + + var baselineBlockingRate = blockingBaseline.SampleCount > 0 ? blockingBaseline.Mean : 0; + var baselineDeadlockRate = deadlockBaseline.SampleCount > 0 ? deadlockBaseline.Mean : 0; + + // Blocking spike: at least 5 events AND 3x baseline rate (or no baseline) + if (currentBlocking >= 5 && (baselineBlockingRate <= 0 || currentBlocking / Math.Max(baselineBlockingRate, 1) >= DefaultEventRatioThreshold)) + { + var metadata = new Dictionary + { + ["current_count"] = currentBlocking, + ["baseline_rate"] = baselineBlockingRate, + ["ratio"] = baselineBlockingRate > 0 ? currentBlocking / baselineBlockingRate : 100.0 + }; + AddBaselineContext(metadata, blockingBaseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_BLOCKING_SPIKE", + Value = currentBlocking, + ServerId = context.ServerId, + Metadata = metadata + }); + } + + // Deadlock spike: at least 3 events AND 3x baseline rate (or no baseline) + if (currentDeadlocks >= 3 && (baselineDeadlockRate <= 0 || currentDeadlocks / Math.Max(baselineDeadlockRate, 1) >= DefaultEventRatioThreshold)) + { + var metadata = new Dictionary + { + ["current_count"] = currentDeadlocks, + ["baseline_rate"] = baselineDeadlockRate, + ["ratio"] = baselineDeadlockRate > 0 ? currentDeadlocks / baselineDeadlockRate : 100.0 + }; + AddBaselineContext(metadata, deadlockBaseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_DEADLOCK_SPIKE", + Value = currentDeadlocks, + ServerId = context.ServerId, + Metadata = metadata + }); + } + } + catch (Exception ex) + { + Logger.Error($"[SqlServerAnomalyDetector] Blocking anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects I/O latency anomalies using z-score against time-bucketed baseline. + /// + private async Task DetectIoAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + SqlServerMetricNames.IoLatency, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS avg_read_lat, + AVG(io_stall_write_ms_delta * 1.0 / NULLIF(num_of_writes_delta, 0)) AS avg_write_lat +FROM collect.file_io_stats +WHERE collection_time >= @windowStart AND collection_time <= @windowEnd +AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0);"; + + cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var currentReadLat = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var currentWriteLat = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + + var ioThreshold = GetDeviationThreshold(SqlServerMetricNames.IoLatency); + + // Read latency anomaly + if (currentReadLat > 10) + { + var readDeviation = (currentReadLat - baseline.Mean) / effectiveStdDev; + if (readDeviation >= ioThreshold) + { + var metadata = new Dictionary + { + ["current_latency_ms"] = currentReadLat, + ["baseline_mean_ms"] = baseline.Mean, + ["baseline_stddev_ms"] = effectiveStdDev, + ["deviation_sigma"] = readDeviation, + ["baseline_samples"] = baseline.SampleCount + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_READ_LATENCY", + Value = currentReadLat, + ServerId = context.ServerId, + Metadata = metadata + }); + } + } + + // Write latency anomaly + if (currentWriteLat > 5) + { + var writeDeviation = (currentWriteLat - baseline.Mean) / effectiveStdDev; + if (writeDeviation >= ioThreshold) + { + var metadata = new Dictionary + { + ["current_latency_ms"] = currentWriteLat, + ["baseline_mean_ms"] = baseline.Mean, + ["baseline_stddev_ms"] = effectiveStdDev, + ["deviation_sigma"] = writeDeviation, + ["baseline_samples"] = baseline.SampleCount + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_WRITE_LATENCY", + Value = currentWriteLat, + ServerId = context.ServerId, + Metadata = metadata + }); + } + } + } + catch (Exception ex) + { + Logger.Error($"[SqlServerAnomalyDetector] I/O anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects batch requests/sec anomalies using z-score against time-bucketed baseline. + /// + private async Task DetectBatchRequestAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + SqlServerMetricNames.BatchRequests, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + AVG(cntr_value_delta) AS avg_batch, + MAX(cntr_value_delta) AS peak_batch, + COUNT(*) AS sample_count +FROM collect.perfmon_stats +WHERE collection_time >= @windowStart AND collection_time <= @windowEnd +AND counter_name = 'Batch Requests/sec' +AND cntr_value_delta >= 0;"; + + cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var avgBatch = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var peakBatch = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var windowSamples = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + + if (windowSamples == 0) return; + + var deviation = (peakBatch - baseline.Mean) / effectiveStdDev; + if (deviation < GetDeviationThreshold(SqlServerMetricNames.BatchRequests)) return; + + var metadata = new Dictionary + { + ["peak_batch_requests"] = peakBatch, + ["avg_batch_requests"] = avgBatch, + ["baseline_mean"] = baseline.Mean, + ["baseline_stddev"] = effectiveStdDev, + ["deviation_sigma"] = deviation, + ["baseline_samples"] = baseline.SampleCount, + ["window_samples"] = windowSamples + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_BATCH_REQUESTS", + Value = peakBatch, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + Logger.Error($"[SqlServerAnomalyDetector] Batch request anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects session/connection count anomalies using z-score against time-bucketed baseline. + /// + private async Task DetectSessionAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + SqlServerMetricNames.SessionCount, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH per_collection AS ( + SELECT collection_time, + SUM(total_sessions) AS total_connections + FROM collect.session_stats + WHERE collection_time >= @windowStart AND collection_time <= @windowEnd + GROUP BY collection_time +) +SELECT AVG(CAST(total_connections AS FLOAT)) AS avg_connections, + MAX(total_connections) AS peak_connections, + COUNT(*) AS sample_count +FROM per_collection;"; + + cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var avgConnections = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var peakConnections = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var windowSamples = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + + if (windowSamples == 0) return; + + var deviation = (peakConnections - baseline.Mean) / effectiveStdDev; + if (deviation < GetDeviationThreshold(SqlServerMetricNames.SessionCount)) return; + + var metadata = new Dictionary + { + ["peak_connections"] = peakConnections, + ["avg_connections"] = avgConnections, + ["baseline_mean"] = baseline.Mean, + ["baseline_stddev"] = effectiveStdDev, + ["deviation_sigma"] = deviation, + ["baseline_samples"] = baseline.SampleCount, + ["window_samples"] = windowSamples + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_SESSION_SPIKE", + Value = peakConnections, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + Logger.Error($"[SqlServerAnomalyDetector] Session anomaly detection failed: {ex.Message}"); + } + } + + /// + /// Detects query duration aggregate anomalies using z-score against time-bucketed baseline. + /// Measures total elapsed time across all queries per collection interval. + /// + private async Task DetectQueryDurationAnomalies(AnalysisContext context, List anomalies) + { + try + { + var baseline = await _baselineProvider.GetBaselineAsync( + SqlServerMetricNames.QueryDuration, context.TimeRangeStart); + + if (baseline.SampleCount == 0) return; + var effectiveStdDev = baseline.EffectiveStdDev; + if (effectiveStdDev <= 0) return; + + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH per_collection AS ( + SELECT collection_time, + SUM(total_elapsed_time_delta) AS total_elapsed + FROM collect.query_stats + WHERE collection_time >= @windowStart AND collection_time <= @windowEnd + AND execution_count_delta > 0 + AND total_elapsed_time_delta >= 0 + GROUP BY collection_time +) +SELECT AVG(CAST(total_elapsed AS FLOAT)) AS avg_elapsed, + MAX(total_elapsed) AS peak_elapsed, + COUNT(*) AS sample_count +FROM per_collection;"; + + cmd.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart)); + cmd.Parameters.Add(new SqlParameter("@windowEnd", context.TimeRangeEnd)); + + using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return; + + var avgElapsed = reader.IsDBNull(0) ? 0.0 : Convert.ToDouble(reader.GetValue(0)); + var peakElapsed = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var windowSamples = reader.IsDBNull(2) ? 0L : Convert.ToInt64(reader.GetValue(2)); + + if (windowSamples == 0) return; + + var deviation = (peakElapsed - baseline.Mean) / effectiveStdDev; + if (deviation < GetDeviationThreshold(SqlServerMetricNames.QueryDuration)) return; + + var metadata = new Dictionary + { + ["peak_total_elapsed_us"] = peakElapsed, + ["avg_total_elapsed_us"] = avgElapsed, + ["baseline_mean"] = baseline.Mean, + ["baseline_stddev"] = effectiveStdDev, + ["deviation_sigma"] = deviation, + ["baseline_samples"] = baseline.SampleCount, + ["window_samples"] = windowSamples + }; + AddBaselineContext(metadata, baseline); + + anomalies.Add(new Fact + { + Source = "anomaly", + Key = "ANOMALY_QUERY_DURATION", + Value = peakElapsed, + ServerId = context.ServerId, + Metadata = metadata + }); + } + catch (Exception ex) + { + Logger.Error($"[SqlServerAnomalyDetector] Query duration anomaly detection failed: {ex.Message}"); + } + } +} diff --git a/Dashboard/Analysis/SqlServerBaselineProvider.cs b/Dashboard/Analysis/SqlServerBaselineProvider.cs new file mode 100644 index 00000000..1746028c --- /dev/null +++ b/Dashboard/Analysis/SqlServerBaselineProvider.cs @@ -0,0 +1,525 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.Data.SqlClient; +using PerformanceMonitorDashboard.Helpers; + +namespace PerformanceMonitorDashboard.Analysis; + +/// +/// Provides time-bucketed baselines (hour-of-day x day-of-week) computed from +/// 30-day rolling history in SQL Server collect.* tables. +/// +/// Port of Lite's BaselineProvider — uses SQL Server instead of DuckDB. +/// No server_id filtering — Dashboard monitors one server per database. +/// +/// Each baseline bucket contains mean, stddev, and sample count for a metric +/// at a specific (hour, day-of-week) combination. When a bucket has insufficient +/// samples, the provider collapses to less-specific tiers: +/// Full (hour+dow) -> Hour-only -> Flat (global mean/stddev) +/// +/// Baselines are cached in memory with a 1-hour TTL to avoid redundant +/// recomputation during rapid re-analysis. +/// +public class SqlServerBaselineProvider +{ + private readonly string _connectionString; + + /// Rolling window for baseline computation. + private const int BaselineWindowDays = 30; + + /// Collapse to hour-only when full bucket has fewer than this many samples. + private const int CollapseThreshold = 10; + + /// Restore to full bucket when sample count reaches this level (hysteresis). + private const int RestoreThreshold = 15; + + /// Cache TTL — baselines are recomputed after this interval. + public static TimeSpan CacheTtl { get; set; } = TimeSpan.FromHours(1); + + private readonly ConcurrentDictionary _cache = new(); + + public SqlServerBaselineProvider(string connectionString) + { + _connectionString = connectionString; + } + + /// + /// Gets the baseline for a specific metric and time bucket. + /// Returns the most specific bucket available, collapsing as needed. + /// + public async Task GetBaselineAsync(string metricName, DateTime analysisTime) + { + var hourOfDay = analysisTime.Hour; + var dayOfWeek = (int)analysisTime.DayOfWeek; // Sunday=0 + + var baselines = await GetOrComputeBaselinesAsync(metricName, analysisTime); + if (baselines == null || baselines.Count == 0) + return BaselineBucket.Empty; + + // Try full bucket (hour + day-of-week) + var fullKey = (hourOfDay, dayOfWeek); + if (baselines.TryGetValue(fullKey, out var fullBucket) && fullBucket.SampleCount >= RestoreThreshold) + return fullBucket; + + // If full bucket exists but below restore threshold, check if it's above collapse threshold + // (hysteresis: don't collapse if we're between 10-14 samples and were previously using full) + if (fullBucket != null && fullBucket.SampleCount >= CollapseThreshold) + return fullBucket; + + // Collapse to hour-only: aggregate all days for this hour + var hourBuckets = baselines + .Where(kvp => kvp.Key.HourOfDay == hourOfDay) + .Select(kvp => kvp.Value) + .ToList(); + + if (hourBuckets.Count > 0) + { + var collapsed = CollapseToHourOnly(hourBuckets); + if (collapsed.SampleCount >= CollapseThreshold) + return collapsed; + } + + // Collapse to flat: aggregate everything + var allBuckets = baselines.Values.ToList(); + if (allBuckets.Count > 0) + { + var flat = CollapseToFlat(allBuckets); + if (flat.SampleCount >= 3) // Minimum viable baseline + return flat; + } + + return BaselineBucket.Empty; + } + + /// + /// Gets all baseline buckets for a metric. Used by UI for rendering + /// expected-range bands across all time slots. + /// + public async Task?> GetAllBaselinesAsync( + string metricName, DateTime analysisTime) + { + return await GetOrComputeBaselinesAsync(metricName, analysisTime); + } + + /// Forces full cache clear — used during testing. + public void ClearCache() => _cache.Clear(); + + private async Task?> GetOrComputeBaselinesAsync( + string metricName, DateTime analysisTime) + { + var cacheKey = metricName; + var roundedHour = new DateTime(analysisTime.Year, analysisTime.Month, analysisTime.Day, analysisTime.Hour, 0, 0); + + if (_cache.TryGetValue(cacheKey, out var cached) && + cached.ComputedAt == roundedHour && + (DateTime.UtcNow - cached.RealTime) < CacheTtl) + { + return cached.Buckets; + } + + var buckets = await ComputeBaselinesAsync(metricName, analysisTime); + + _cache[cacheKey] = new CachedBaseline + { + ComputedAt = roundedHour, + RealTime = DateTime.UtcNow, + Buckets = buckets + }; + + return buckets; + } + + private async Task?> ComputeBaselinesAsync( + string metricName, DateTime analysisTime) + { + var query = GetBaselineQuery(metricName); + if (query == null) return null; + + var windowStart = analysisTime.AddDays(-BaselineWindowDays); + + try + { + using var connection = new SqlConnection(_connectionString); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = query; + cmd.Parameters.Add(new SqlParameter("@windowStart", windowStart)); + cmd.Parameters.Add(new SqlParameter("@windowEnd", analysisTime)); + + var buckets = new Dictionary<(int, int), BaselineBucket>(); + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var hour = Convert.ToInt32(reader.GetValue(0)); + var dow = Convert.ToInt32(reader.GetValue(1)); + var mean = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)); + var stddev = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)); + var count = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)); + + buckets[(hour, dow)] = new BaselineBucket + { + HourOfDay = hour, + DayOfWeek = dow, + Mean = mean, + StdDev = stddev, + SampleCount = count, + Tier = count >= RestoreThreshold ? BaselineTier.Full + : count >= CollapseThreshold ? BaselineTier.Full + : BaselineTier.HourOnly + }; + } + + return buckets; + } + catch (Exception ex) + { + Logger.Error($"[SqlServerBaselineProvider] Failed to compute baselines for {metricName}: {ex.Message}"); + return null; + } + } + + private static string? GetBaselineQuery(string metricName) + { + // All queries return: hour_of_day, day_of_week, mean_val, stddev_val, sample_count + // Day-of-week normalization: (DATEPART(weekday, x) + @@DATEFIRST - 1) % 7 gives Sunday=0 + // Cumulative metrics use CTEs for restart poisoning exclusion — exclude samples where + // value drops near-zero when the prior sample was significantly higher. + // SQL Server has no QUALIFY — use ROW_NUMBER() in CTEs instead. + return metricName switch + { + // Point-in-time metric — no restart exclusion needed + SqlServerMetricNames.Cpu => @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT DATEPART(HOUR, collection_time) AS hour_of_day, + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7 AS day_of_week, + AVG(CAST(sqlserver_cpu_utilization AS FLOAT)) AS mean_val, + STDEV(CAST(sqlserver_cpu_utilization AS FLOAT)) AS stddev_val, + COUNT(*) AS sample_count +FROM collect.cpu_utilization_stats +WHERE collection_time >= @windowStart AND collection_time < @windowEnd +GROUP BY DATEPART(HOUR, collection_time), + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7;", + + // Cumulative counter — restart exclusion via CTE with LAG. + // server_start_time is inline in collect.perfmon_stats. + // Exclude samples within 5 min of a detected restart. + SqlServerMetricNames.BatchRequests => @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH filtered AS ( + SELECT collection_time, cntr_value_delta, + LAG(cntr_value_delta) OVER (ORDER BY collection_time) AS prev_value + FROM collect.perfmon_stats + WHERE collection_time >= @windowStart AND collection_time < @windowEnd + AND counter_name = 'Batch Requests/sec' + AND cntr_value_delta >= 0 +) +SELECT DATEPART(HOUR, collection_time) AS hour_of_day, + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7 AS day_of_week, + AVG(cntr_value_delta) AS mean_val, + STDEV(cntr_value_delta) AS stddev_val, + COUNT(*) AS sample_count +FROM filtered +WHERE NOT (cntr_value_delta = 0 AND ISNULL(prev_value, 0) > 1000) +GROUP BY DATEPART(HOUR, collection_time), + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7;", + + // Cumulative counter, multiple rows per collection (per wait type) — + // aggregate to total wait ms per collection first, then filter restart poisoning + SqlServerMetricNames.WaitStats => @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH per_collection AS ( + SELECT collection_time, + SUM(wait_time_ms_delta) AS total_wait_ms + FROM collect.wait_stats + WHERE collection_time >= @windowStart AND collection_time < @windowEnd + AND wait_time_ms_delta >= 0 + GROUP BY collection_time +), +with_lag AS ( + SELECT collection_time, total_wait_ms, + LAG(total_wait_ms) OVER (ORDER BY collection_time) AS prev_value + FROM per_collection +) +SELECT DATEPART(HOUR, collection_time) AS hour_of_day, + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7 AS day_of_week, + AVG(CAST(total_wait_ms AS FLOAT)) AS mean_val, + STDEV(CAST(total_wait_ms AS FLOAT)) AS stddev_val, + COUNT(*) AS sample_count +FROM with_lag +WHERE NOT (total_wait_ms = 0 AND ISNULL(prev_value, 0) > 10000) +GROUP BY DATEPART(HOUR, collection_time), + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7;", + + // Point-in-time, multiple rows per collection (per program_name) — + // aggregate to total connections per collection first. + // collect.session_stats does NOT have server_start_time — not needed. + SqlServerMetricNames.SessionCount => @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH per_collection AS ( + SELECT collection_time, + SUM(total_sessions) AS total_connections + FROM collect.session_stats + WHERE collection_time >= @windowStart AND collection_time < @windowEnd + GROUP BY collection_time +) +SELECT DATEPART(HOUR, collection_time) AS hour_of_day, + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7 AS day_of_week, + AVG(CAST(total_connections AS FLOAT)) AS mean_val, + STDEV(CAST(total_connections AS FLOAT)) AS stddev_val, + COUNT(*) AS sample_count +FROM per_collection +GROUP BY DATEPART(HOUR, collection_time), + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7;", + + // Cumulative (plan cache), multiple rows per collection (per query) — + // use delta columns, aggregate total elapsed per collection, filter restart poisoning. + // server_start_time is inline in collect.query_stats. + SqlServerMetricNames.QueryDuration => @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH per_collection AS ( + SELECT collection_time, + SUM(total_elapsed_time_delta) AS total_elapsed + FROM collect.query_stats + WHERE collection_time >= @windowStart AND collection_time < @windowEnd + AND execution_count_delta > 0 + AND total_elapsed_time_delta >= 0 + GROUP BY collection_time +), +with_lag AS ( + SELECT collection_time, total_elapsed, + LAG(total_elapsed) OVER (ORDER BY collection_time) AS prev_value + FROM per_collection +) +SELECT DATEPART(HOUR, collection_time) AS hour_of_day, + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7 AS day_of_week, + AVG(CAST(total_elapsed AS FLOAT)) AS mean_val, + STDEV(CAST(total_elapsed AS FLOAT)) AS stddev_val, + COUNT(*) AS sample_count +FROM with_lag +WHERE NOT (total_elapsed = 0 AND ISNULL(prev_value, 0) > 100000) +GROUP BY DATEPART(HOUR, collection_time), + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7;", + + // Point-in-time metric — no restart exclusion needed + SqlServerMetricNames.IoLatency => @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT DATEPART(HOUR, collection_time) AS hour_of_day, + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7 AS day_of_week, + AVG(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS mean_val, + STDEV(io_stall_read_ms_delta * 1.0 / NULLIF(num_of_reads_delta, 0)) AS stddev_val, + COUNT(*) AS sample_count +FROM collect.file_io_stats +WHERE collection_time >= @windowStart AND collection_time < @windowEnd +AND (num_of_reads_delta > 0 OR num_of_writes_delta > 0) +GROUP BY DATEPART(HOUR, collection_time), + (DATEPART(WEEKDAY, collection_time) + @@DATEFIRST - 1) % 7;", + + // Event-based — use wait_stats collection intervals as time spine (bucketed to minute), + // LEFT JOIN event counts so intervals with zero events are included in the baseline. + // Without this, the baseline only reflects storm periods (when events exist). + SqlServerMetricNames.Blocking => @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH spine AS ( + SELECT DISTINCT DATEADD(MINUTE, DATEDIFF(MINUTE, 0, collection_time), 0) AS minute_bucket + FROM collect.wait_stats + WHERE collection_time >= @windowStart AND collection_time < @windowEnd +), +event_counts AS ( + SELECT DATEADD(MINUTE, DATEDIFF(MINUTE, 0, collection_time), 0) AS minute_bucket, + CAST(COUNT(*) AS FLOAT) AS cnt + FROM collect.blocking_BlockedProcessReport + WHERE collection_time >= @windowStart AND collection_time < @windowEnd + GROUP BY DATEADD(MINUTE, DATEDIFF(MINUTE, 0, collection_time), 0) +), +per_interval AS ( + SELECT s.minute_bucket, ISNULL(e.cnt, 0) AS event_count + FROM spine s + LEFT JOIN event_counts e ON s.minute_bucket = e.minute_bucket +) +SELECT DATEPART(HOUR, minute_bucket) AS hour_of_day, + (DATEPART(WEEKDAY, minute_bucket) + @@DATEFIRST - 1) % 7 AS day_of_week, + AVG(event_count) AS mean_val, + STDEV(event_count) AS stddev_val, + COUNT(*) AS sample_count +FROM per_interval +GROUP BY DATEPART(HOUR, minute_bucket), + (DATEPART(WEEKDAY, minute_bucket) + @@DATEFIRST - 1) % 7;", + + // Event-based — same spine approach as blocking + SqlServerMetricNames.Deadlock => @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +;WITH spine AS ( + SELECT DISTINCT DATEADD(MINUTE, DATEDIFF(MINUTE, 0, collection_time), 0) AS minute_bucket + FROM collect.wait_stats + WHERE collection_time >= @windowStart AND collection_time < @windowEnd +), +event_counts AS ( + SELECT DATEADD(MINUTE, DATEDIFF(MINUTE, 0, collection_time), 0) AS minute_bucket, + CAST(COUNT(*) AS FLOAT) AS cnt + FROM collect.deadlocks + WHERE collection_time >= @windowStart AND collection_time < @windowEnd + GROUP BY DATEADD(MINUTE, DATEDIFF(MINUTE, 0, collection_time), 0) +), +per_interval AS ( + SELECT s.minute_bucket, ISNULL(e.cnt, 0) AS event_count + FROM spine s + LEFT JOIN event_counts e ON s.minute_bucket = e.minute_bucket +) +SELECT DATEPART(HOUR, minute_bucket) AS hour_of_day, + (DATEPART(WEEKDAY, minute_bucket) + @@DATEFIRST - 1) % 7 AS day_of_week, + AVG(event_count) AS mean_val, + STDEV(event_count) AS stddev_val, + COUNT(*) AS sample_count +FROM per_interval +GROUP BY DATEPART(HOUR, minute_bucket), + (DATEPART(WEEKDAY, minute_bucket) + @@DATEFIRST - 1) % 7;", + + _ => null + }; + } + + /// + /// Collapses multiple day-of-week buckets for the same hour into a single + /// hour-only bucket using pooled statistics. + /// + private static BaselineBucket CollapseToHourOnly(List hourBuckets) + { + var totalSamples = hourBuckets.Sum(b => b.SampleCount); + if (totalSamples == 0) + return BaselineBucket.Empty; + + // Weighted mean across all day-of-week buckets for this hour + var weightedMean = hourBuckets.Sum(b => b.Mean * b.SampleCount) / totalSamples; + + // Pooled standard deviation + var pooledVariance = PoolVariance(hourBuckets, weightedMean); + + return new BaselineBucket + { + HourOfDay = hourBuckets[0].HourOfDay, + DayOfWeek = -1, // Indicates hour-only + Mean = weightedMean, + StdDev = Math.Sqrt(pooledVariance), + SampleCount = totalSamples, + Tier = BaselineTier.HourOnly + }; + } + + /// + /// Collapses all buckets into a single flat baseline (equivalent to old 24h behavior). + /// + private static BaselineBucket CollapseToFlat(List allBuckets) + { + var totalSamples = allBuckets.Sum(b => b.SampleCount); + if (totalSamples == 0) + return BaselineBucket.Empty; + + var weightedMean = allBuckets.Sum(b => b.Mean * b.SampleCount) / totalSamples; + var pooledVariance = PoolVariance(allBuckets, weightedMean); + + return new BaselineBucket + { + HourOfDay = -1, + DayOfWeek = -1, + Mean = weightedMean, + StdDev = Math.Sqrt(pooledVariance), + SampleCount = totalSamples, + Tier = BaselineTier.Flat + }; + } + + /// + /// Computes pooled variance from multiple buckets, accounting for both + /// within-bucket variance and between-bucket mean differences. + /// + private static double PoolVariance(List buckets, double grandMean) + { + var totalSamples = buckets.Sum(b => b.SampleCount); + if (totalSamples <= 1) return 0; + + double totalSumSq = 0; + foreach (var b in buckets) + { + if (b.SampleCount <= 0) continue; + // Within-bucket variance contribution + totalSumSq += (b.StdDev * b.StdDev) * (b.SampleCount - 1); + // Between-bucket mean difference contribution + totalSumSq += b.SampleCount * (b.Mean - grandMean) * (b.Mean - grandMean); + } + + return totalSumSq / (totalSamples - 1); + } + + private class CachedBaseline + { + public DateTime ComputedAt { get; init; } + public DateTime RealTime { get; init; } + public Dictionary<(int HourOfDay, int DayOfWeek), BaselineBucket>? Buckets { get; init; } + } +} + +/// +/// Represents the computed baseline statistics for a single time bucket. +/// +public class BaselineBucket +{ + public int HourOfDay { get; init; } + public int DayOfWeek { get; init; } + public double Mean { get; init; } + public double StdDev { get; init; } + public long SampleCount { get; init; } + public BaselineTier Tier { get; init; } + + public static BaselineBucket Empty => new() + { + HourOfDay = -1, DayOfWeek = -1, Mean = 0, StdDev = 0, + SampleCount = 0, Tier = BaselineTier.Flat + }; + + /// + /// Returns the effective stddev with a proportional minimum floor to prevent + /// division-by-zero in z-score calculations. When both mean and stddev are 0 + /// (zero activity), returns 0 — callers should skip scoring. + /// + public double EffectiveStdDev + { + get + { + if (Mean == 0 && StdDev <= 0) return 0; // Zero activity — skip scoring + return Math.Max(StdDev, Mean * 0.01); + } + } +} + +public enum BaselineTier +{ + Full, // hour + day-of-week (168 buckets) + HourOnly, // hour only (24 buckets) + Flat // global mean/stddev +} + +/// Metric name constants used as baseline cache keys. +public static class SqlServerMetricNames +{ + public const string Cpu = "cpu"; + public const string BatchRequests = "batch_requests"; + public const string WaitStats = "wait_stats"; + public const string SessionCount = "session_count"; + public const string QueryDuration = "query_duration"; + public const string IoLatency = "io_latency"; + public const string Blocking = "blocking"; + public const string Deadlock = "deadlock"; +} diff --git a/Dashboard/Controls/CorrelatedTimelineLanesControl.xaml b/Dashboard/Controls/CorrelatedTimelineLanesControl.xaml index 46b68159..0d58d368 100644 --- a/Dashboard/Controls/CorrelatedTimelineLanesControl.xaml +++ b/Dashboard/Controls/CorrelatedTimelineLanesControl.xaml @@ -18,15 +18,11 @@ - - @@ -34,15 +30,11 @@ - - @@ -50,7 +42,6 @@ - - - + - - - @@ -86,15 +70,11 @@ - - diff --git a/Dashboard/Controls/CorrelatedTimelineLanesControl.xaml.cs b/Dashboard/Controls/CorrelatedTimelineLanesControl.xaml.cs index 94aa4291..9af24f3f 100644 --- a/Dashboard/Controls/CorrelatedTimelineLanesControl.xaml.cs +++ b/Dashboard/Controls/CorrelatedTimelineLanesControl.xaml.cs @@ -16,6 +16,7 @@ using System.Threading.Tasks; using System.Windows; using System.Windows.Controls; +using PerformanceMonitorDashboard.Analysis; using PerformanceMonitorDashboard.Helpers; using PerformanceMonitorDashboard.Services; @@ -24,6 +25,7 @@ namespace PerformanceMonitorDashboard.Controls; public partial class CorrelatedTimelineLanesControl : UserControl { private DatabaseService? _dataService; + private SqlServerBaselineProvider? _baselineProvider; private CorrelatedCrosshairManager? _crosshairManager; private bool _isRefreshing; @@ -34,12 +36,13 @@ public CorrelatedTimelineLanesControl() } /// - /// Initializes the control with the data service. + /// Initializes the control with the data service and optional baseline provider. /// Must be called before RefreshAsync. /// - public void Initialize(DatabaseService dataService) + public void Initialize(DatabaseService dataService, SqlServerBaselineProvider? baselineProvider = null) { _dataService = dataService; + _baselineProvider = baselineProvider; var charts = new[] { CpuChart, WaitStatsChart, BlockingChart, MemoryChart, FileIoChart }; foreach (var chart in charts) @@ -50,17 +53,18 @@ public void Initialize(DatabaseService dataService) } _crosshairManager = new CorrelatedCrosshairManager(); - _crosshairManager.AddLane(CpuChart, "CPU", "%", CpuValueLabel); - _crosshairManager.AddLane(WaitStatsChart, "Wait Stats", "ms/sec", WaitStatsValueLabel); - _crosshairManager.AddLane(BlockingChart, "Blocking", "events", BlockingValueLabel); - _crosshairManager.AddLane(MemoryChart, "Memory", "MB", MemoryValueLabel); - _crosshairManager.AddLane(FileIoChart, "I/O Latency", "ms", FileIoValueLabel); + _crosshairManager.AddLane(CpuChart, "CPU", "%"); + _crosshairManager.AddLane(WaitStatsChart, "Wait Stats", "ms/sec"); + _crosshairManager.AddLane(BlockingChart, "Blocking", "events"); + _crosshairManager.AddLane(MemoryChart, "Buffer Pool", "MB"); + _crosshairManager.AddLane(FileIoChart, "I/O Latency", "ms"); } /// /// Refreshes all lane data for the given time range. /// - public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDate) + public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDate, + (DateTime From, DateTime To)? comparisonRange = null) { if (_dataService == null || _isRefreshing) return; _isRefreshing = true; @@ -76,26 +80,59 @@ public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDa var memoryTask = _dataService.GetMemoryStatsAsync(hoursBack, fromDate, toDate); var fileIoTask = _dataService.GetFileIoLatencyTimeSeriesAsync(false, hoursBack, fromDate, toDate); + // Fetch baselines for band rendering if provider is available + var referenceTime = fromDate ?? DateTime.UtcNow.AddHours(-hoursBack); + Task? cpuBaselineTask = null; + Task? waitBaselineTask = null; + Task? ioBaselineTask = null; + Task? blockingBaselineTask = null; + Task? deadlockBaselineTask = null; + + if (_baselineProvider != null) + { + cpuBaselineTask = GetBaselineAsync(SqlServerMetricNames.Cpu, referenceTime); + waitBaselineTask = GetBaselineAsync(SqlServerMetricNames.WaitStats, referenceTime); + ioBaselineTask = GetBaselineAsync(SqlServerMetricNames.IoLatency, referenceTime); + blockingBaselineTask = GetBaselineAsync(SqlServerMetricNames.Blocking, referenceTime); + deadlockBaselineTask = GetBaselineAsync(SqlServerMetricNames.Deadlock, referenceTime); + } + try { - await Task.WhenAll(cpuTask, waitTask, blockingTask, deadlockTask, memoryTask, fileIoTask); + var tasks = new List { cpuTask, waitTask, blockingTask, deadlockTask, memoryTask, fileIoTask }; + if (cpuBaselineTask != null) tasks.Add(cpuBaselineTask); + if (waitBaselineTask != null) tasks.Add(waitBaselineTask); + if (ioBaselineTask != null) tasks.Add(ioBaselineTask); + if (blockingBaselineTask != null) tasks.Add(blockingBaselineTask); + if (deadlockBaselineTask != null) tasks.Add(deadlockBaselineTask); + await Task.WhenAll(tasks); } catch (Exception ex) { Debug.WriteLine($"CorrelatedLanes: Data fetch failed: {ex.Message}"); } + var cpuBaseline = cpuBaselineTask is { IsCompletedSuccessfully: true } ? cpuBaselineTask.Result : null; + var waitBaseline = waitBaselineTask is { IsCompletedSuccessfully: true } ? waitBaselineTask.Result : null; + var ioBaseline = ioBaselineTask is { IsCompletedSuccessfully: true } ? ioBaselineTask.Result : null; + var blockingBaseline = blockingBaselineTask is { IsCompletedSuccessfully: true } ? blockingBaselineTask.Result : null; + var deadlockBaseline = deadlockBaselineTask is { IsCompletedSuccessfully: true } ? deadlockBaselineTask.Result : null; + var blockingLaneBaseline = blockingBaseline ?? deadlockBaseline; + + // minAnomalyValue: absolute floor below which dots/arrows are suppressed even if outside band. + // Prevents "1% CPU above 0.5% baseline" false alarms on idle servers. if (cpuTask.IsCompletedSuccessfully) UpdateLane(CpuChart, "CPU %", - cpuTask.Result.Select(d => (d.SampleTime.ToOADate(), (double)d.SqlServerCpuUtilization)).ToList(), - "#4FC3F7", 0, 105); + cpuTask.Result.OrderBy(d => d.SampleTime) + .Select(d => (d.SampleTime.ToOADate(), (double)d.SqlServerCpuUtilization)).ToList(), + "#4FC3F7", 0, 105, cpuBaseline, minAnomalyValue: 10); else ShowEmpty(CpuChart, "CPU %"); if (waitTask.IsCompletedSuccessfully) UpdateLane(WaitStatsChart, "Wait ms/sec", waitTask.Result.Select(d => (d.CollectionTime.ToOADate(), (double)d.WaitTimeMsPerSecond)).ToList(), - "#FFB74D"); + "#FFB74D", baseline: waitBaseline, minAnomalyValue: 100); else ShowEmpty(WaitStatsChart, "Wait ms/sec"); @@ -113,7 +150,7 @@ public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDa .Select(d => (d.CollectionTime.ToOADate(), (double)d.BlockedCount)) .ToList() : new List<(double, double)>(); - UpdateBlockingLane(blockingData, deadlockData); + UpdateBlockingLane(blockingData, deadlockData, blockingLaneBaseline); } catch (Exception ex) { @@ -122,11 +159,11 @@ public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDa } if (memoryTask.IsCompletedSuccessfully) - UpdateLane(MemoryChart, "Memory MB", + UpdateLane(MemoryChart, "Buffer Pool MB", memoryTask.Result.Select(d => (d.CollectionTime.ToOADate(), (double)d.TotalMemoryMb)).ToList(), "#CE93D8"); else - ShowEmpty(MemoryChart, "Memory MB"); + ShowEmpty(MemoryChart, "Buffer Pool MB"); if (fileIoTask.IsCompletedSuccessfully) { @@ -135,11 +172,63 @@ public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDa .OrderBy(g => g.Key) .Select(g => (g.Key.ToOADate(), (double)g.Average(x => x.ReadLatencyMs))) .ToList(); - UpdateLane(FileIoChart, "I/O ms", ioGrouped, "#81C784"); + UpdateLane(FileIoChart, "I/O ms", ioGrouped, "#81C784", baseline: ioBaseline, minAnomalyValue: 2); } else ShowEmpty(FileIoChart, "I/O ms"); + // Comparison overlay — fetch reference period data and render as ghost lines + if (comparisonRange.HasValue) + { + var refFrom = comparisonRange.Value.From; + var refTo = comparisonRange.Value.To; + var timeShift = (fromDate ?? DateTime.UtcNow.AddHours(-hoursBack)) - refFrom; + + var refCpuTask = _dataService.GetCpuUtilizationAsync(0, refFrom, refTo); + var refWaitTask = _dataService.GetTotalWaitStatsTrendAsync(0, refFrom, refTo); + var refBlockingTask = _dataService.GetBlockedSessionTrendAsync(0, refFrom, refTo); + var refMemoryTask = _dataService.GetMemoryStatsAsync(0, refFrom, refTo); + var refIoTask = _dataService.GetFileIoLatencyTimeSeriesAsync(false, 0, refFrom, refTo); + + try { await Task.WhenAll(refCpuTask, refWaitTask, refBlockingTask, refMemoryTask, refIoTask); } + catch (Exception ex) { Debug.WriteLine($"CorrelatedLanes: Comparison fetch failed: {ex.Message}"); } + + if (refCpuTask.IsCompletedSuccessfully) + AddGhostLine(CpuChart, refCpuTask.Result + .Select(d => (d.SampleTime.Add(timeShift).ToOADate(), (double)d.SqlServerCpuUtilization)).ToList(), "#4FC3F7"); + + if (refWaitTask.IsCompletedSuccessfully) + AddGhostLine(WaitStatsChart, refWaitTask.Result + .Select(d => (d.CollectionTime.Add(timeShift).ToOADate(), (double)d.WaitTimeMsPerSecond)).ToList(), "#FFB74D"); + + if (refBlockingTask.IsCompletedSuccessfully) + { + var refBlocking = refBlockingTask.Result + .GroupBy(d => d.CollectionTime) + .OrderBy(g => g.Key) + .Select(g => (g.Key.Add(timeShift).ToOADate(), (double)g.Sum(x => x.BlockedCount))) + .ToList(); + if (refBlocking.Count > 0) + AddGhostLine(BlockingChart, refBlocking, "#E57373"); + } + + if (refMemoryTask.IsCompletedSuccessfully) + AddGhostLine(MemoryChart, refMemoryTask.Result + .Select(d => (d.CollectionTime.Add(timeShift).ToOADate(), (double)d.TotalMemoryMb)).ToList(), "#CE93D8"); + + if (refIoTask.IsCompletedSuccessfully) + { + var refIo = refIoTask.Result + .GroupBy(d => d.CollectionTime) + .OrderBy(g => g.Key) + .Select(g => (g.Key.Add(timeShift).ToOADate(), (double)g.Average(x => x.ReadLatencyMs))) + .ToList(); + AddGhostLine(FileIoChart, refIo, "#81C784"); + } + + _crosshairManager?.SetComparisonLabel(ComparisonLabel(comparisonRange.Value, fromDate, hoursBack)); + } + _crosshairManager?.ReattachVLines(); SyncXAxes(hoursBack, fromDate, toDate); } @@ -149,21 +238,32 @@ public async Task RefreshAsync(int hoursBack, DateTime? fromDate, DateTime? toDa } } + /// + /// Fetches a baseline bucket from the provider, wrapping in a nullable task. + /// + private async Task GetBaselineAsync(string metricName, DateTime referenceTime) + { + if (_baselineProvider == null) return null; + try + { + var bucket = await _baselineProvider.GetBaselineAsync(metricName, referenceTime); + return bucket.SampleCount > 0 ? bucket : null; + } + catch { return null; } + } + private void UpdateBlockingLane(List<(double Time, double Value)> blockingData, - List<(double Time, double Value)> deadlockData) + List<(double Time, double Value)> deadlockData, BaselineBucket? baseline = null) { ClearChart(BlockingChart); TabHelpers.ApplyThemeToChart(BlockingChart); - // Register blocking and deadlock as separate named series for the tooltip var blockTimes = blockingData.Select(d => d.Time).ToArray(); var blockValues = blockingData.Select(d => d.Value).ToArray(); var deadTimes = deadlockData.Select(d => d.Time).ToArray(); var deadValues = deadlockData.Select(d => d.Value).ToArray(); - // First series clears any previous data _crosshairManager?.SetLaneData(BlockingChart, blockTimes, blockValues, isEventBased: true); - // Rename the auto-created series and add the second _crosshairManager?.AddLaneSeries(BlockingChart, "Deadlocks", "events", deadTimes, deadValues, isEventBased: true); @@ -176,7 +276,6 @@ private void UpdateBlockingLane(List<(double Time, double Value)> blockingData, double barWidth = 30.0 / 86400.0; double maxCount = 0; - // Blocking bars — red if (blockingData.Count > 0) { var bars = blockingData.Select(d => new ScottPlot.Bar @@ -191,7 +290,6 @@ private void UpdateBlockingLane(List<(double Time, double Value)> blockingData, maxCount = Math.Max(maxCount, blockingData.Max(d => d.Value)); } - // Deadlock bars — yellow/amber, slightly narrower so both are visible if (deadlockData.Count > 0) { var bars = deadlockData.Select(d => new ScottPlot.Bar @@ -206,6 +304,31 @@ private void UpdateBlockingLane(List<(double Time, double Value)> blockingData, maxCount = Math.Max(maxCount, deadlockData.Max(d => d.Value)); } + // Baseline for blocking — event-based metrics where zero is normal. + // Even if EffectiveStdDev is 0 (all-zero baseline), still register the baseline + // so the event-based indicator check (mean < 1 → any event is ▲) works. + if (baseline != null && baseline.SampleCount > 0) + { + var effectiveStdDev = Math.Max(baseline.EffectiveStdDev, 0.01); + var upper = baseline.Mean + 2 * effectiveStdDev; + var lower = Math.Max(0, baseline.Mean - 2 * effectiveStdDev); + + _crosshairManager?.SetLaneBaseline(BlockingChart, lower, upper, isEventBased: true); + + // Only render the visual band if there's meaningful variance + if (baseline.EffectiveStdDev > 0) + { + var band = BlockingChart.Plot.Add.HorizontalSpan(lower, upper); + band.FillStyle.Color = ScottPlot.Color.FromHex("#E57373").WithAlpha(25); + band.LineStyle.Width = 0; + + var meanLine = BlockingChart.Plot.Add.HorizontalLine(baseline.Mean); + meanLine.Color = ScottPlot.Color.FromHex("#E57373").WithAlpha(60); + meanLine.LinePattern = ScottPlot.LinePattern.Dashed; + meanLine.LineWidth = 1; + } + } + BlockingChart.Plot.Axes.DateTimeTicksBottom(); BlockingChart.Plot.Axes.Bottom.TickLabelStyle.IsVisible = false; TabHelpers.ReapplyAxisColors(BlockingChart); @@ -215,13 +338,12 @@ private void UpdateBlockingLane(List<(double Time, double Value)> blockingData, BlockingChart.Plot.Legend.IsVisible = false; BlockingChart.Plot.Axes.Margins(bottom: 0); BlockingChart.Plot.Axes.SetLimitsY(0, Math.Max(maxCount * 1.3, 2)); - - BlockingChart.Refresh(); } private void UpdateLane(ScottPlot.WPF.WpfPlot chart, string title, List<(double Time, double Value)> data, string colorHex, - double? yMin = null, double? yMax = null) + double? yMin = null, double? yMax = null, BaselineBucket? baseline = null, + double minAnomalyValue = 0) { ClearChart(chart); TabHelpers.ApplyThemeToChart(chart); @@ -235,6 +357,43 @@ private void UpdateLane(ScottPlot.WPF.WpfPlot chart, string title, var times = data.Select(d => d.Time).ToArray(); var values = data.Select(d => d.Value).ToArray(); + // Render baseline band FIRST (behind the data line) + if (baseline != null && baseline.SampleCount > 0 && baseline.EffectiveStdDev > 0) + { + var upper = baseline.Mean + 2 * baseline.EffectiveStdDev; + var lower = Math.Max(0, baseline.Mean - 2 * baseline.EffectiveStdDev); + + _crosshairManager?.SetLaneBaseline(chart, lower, upper, minAnomalyValue); + + var band = chart.Plot.Add.HorizontalSpan(lower, upper); + band.FillStyle.Color = ScottPlot.Color.FromHex(colorHex).WithAlpha(25); + band.LineStyle.Width = 0; + + var meanLine = chart.Plot.Add.HorizontalLine(baseline.Mean); + meanLine.Color = ScottPlot.Color.FromHex(colorHex).WithAlpha(60); + meanLine.LinePattern = ScottPlot.LinePattern.Dashed; + meanLine.LineWidth = 1; + + // Highlight anomalous points (outside ± 2σ band AND above absolute minimum) + var anomalyIndices = new List(); + for (int i = 0; i < values.Length; i++) + { + if ((values[i] > upper && values[i] >= minAnomalyValue) || values[i] < lower) + anomalyIndices.Add(i); + } + + if (anomalyIndices.Count > 0) + { + var anomalyTimes = anomalyIndices.Select(i => times[i]).ToArray(); + var anomalyValues = anomalyIndices.Select(i => values[i]).ToArray(); + var anomalyScatter = chart.Plot.Add.Scatter(anomalyTimes, anomalyValues); + anomalyScatter.Color = ScottPlot.Color.FromHex("#FF5252"); + anomalyScatter.MarkerSize = 6; + anomalyScatter.MarkerShape = ScottPlot.MarkerShape.FilledCircle; + anomalyScatter.LineWidth = 0; + } + } + var scatter = chart.Plot.Add.Scatter(times, values); scatter.Color = ScottPlot.Color.FromHex(colorHex); scatter.MarkerSize = 0; @@ -245,13 +404,11 @@ private void UpdateLane(ScottPlot.WPF.WpfPlot chart, string title, _crosshairManager?.SetLaneData(chart, times, values); chart.Plot.Axes.DateTimeTicksBottom(); - // Hide bottom tick labels on all lanes except the last (File I/O) if (chart != FileIoChart) chart.Plot.Axes.Bottom.TickLabelStyle.IsVisible = false; TabHelpers.ReapplyAxisColors(chart); - // Compact layout: hide Y label, minimize title, no legend chart.Plot.Title(""); chart.Plot.YLabel(""); chart.Plot.Legend.IsVisible = false; @@ -266,13 +423,8 @@ private void UpdateLane(ScottPlot.WPF.WpfPlot chart, string title, var padding = Math.Max((maxVal - minVal) * 0.1, 1); chart.Plot.Axes.SetLimitsY(Math.Max(0, minVal - padding), maxVal + padding); } - - chart.Refresh(); } - /// - /// Sets identical X-axis limits across all lanes. - /// private void SyncXAxes(int hoursBack, DateTime? fromDate, DateTime? toDate) { DateTime xStart, xEnd; @@ -298,9 +450,34 @@ private void SyncXAxes(int hoursBack, DateTime? fromDate, DateTime? toDate) } } + private static void AddGhostLine(ScottPlot.WPF.WpfPlot chart, + List<(double Time, double Value)> data, string colorHex) + { + if (data.Count == 0) return; + + var times = data.Select(d => d.Time).ToArray(); + var values = data.Select(d => d.Value).ToArray(); + + var scatter = chart.Plot.Add.Scatter(times, values); + scatter.Color = ScottPlot.Colors.White.WithAlpha(140); + scatter.MarkerSize = 0; + scatter.LineWidth = 1.5f; + scatter.LinePattern = ScottPlot.LinePattern.Dashed; + } + + private static string ComparisonLabel((DateTime From, DateTime To) range, + DateTime? fromDate, int hoursBack) + { + var currentStart = fromDate ?? DateTime.UtcNow.AddHours(-hoursBack); + var daysBack = (currentStart - range.From).TotalDays; + + if (Math.Abs(daysBack - 1) < 0.5) return "yesterday"; + if (Math.Abs(daysBack - 7) < 0.5) return "last week"; + return $"{daysBack:N0}d ago"; + } + private static void ClearChart(ScottPlot.WPF.WpfPlot chart) { - chart.Reset(); chart.Plot.Clear(); } @@ -317,7 +494,6 @@ private static void ShowEmpty(ScottPlot.WPF.WpfPlot chart, string title) chart.Plot.Axes.Bottom.TickGenerator = new ScottPlot.TickGenerators.EmptyTickGenerator(); chart.Plot.Axes.Left.TickGenerator = new ScottPlot.TickGenerators.EmptyTickGenerator(); chart.Plot.Legend.IsVisible = false; - chart.Refresh(); } /// diff --git a/Dashboard/Controls/ResourceMetricsContent.xaml b/Dashboard/Controls/ResourceMetricsContent.xaml index 4067d08f..36850412 100644 --- a/Dashboard/Controls/ResourceMetricsContent.xaml +++ b/Dashboard/Controls/ResourceMetricsContent.xaml @@ -28,7 +28,21 @@ - + + + + + + + + + + + + diff --git a/Dashboard/Controls/ResourceMetricsContent.xaml.cs b/Dashboard/Controls/ResourceMetricsContent.xaml.cs index 4884c6cd..7faa537f 100644 --- a/Dashboard/Controls/ResourceMetricsContent.xaml.cs +++ b/Dashboard/Controls/ResourceMetricsContent.xaml.cs @@ -214,10 +214,11 @@ private void SetupChartContextMenus() /// /// Initializes the control with required dependencies. /// - public void Initialize(DatabaseService databaseService) + public void Initialize(DatabaseService databaseService, + Analysis.SqlServerBaselineProvider? baselineProvider = null) { _databaseService = databaseService ?? throw new ArgumentNullException(nameof(databaseService)); - CorrelatedLanes.Initialize(databaseService); + CorrelatedLanes.Initialize(databaseService, baselineProvider); } /// @@ -1024,12 +1025,41 @@ private async Task LoadFileIoThroughputChartsAsync() #region Server Trends Tab + private async void CompareToCombo_SelectionChanged(object sender, SelectionChangedEventArgs e) + { + if (!IsLoaded) return; + ComparisonRange = GetComparisonRange(); + await RefreshServerTrendsAsync(); + } + + private (DateTime From, DateTime To)? ComparisonRange { get; set; } + + /// + /// Computes the reference time range for the comparison overlay. + /// Returns null if "None" is selected. + /// + private (DateTime From, DateTime To)? GetComparisonRange() + { + if (CompareToCombo == null || CompareToCombo.SelectedIndex <= 0) return null; + + var currentEnd = _serverTrendsToDate ?? DateTime.UtcNow; + var currentStart = _serverTrendsFromDate ?? currentEnd.AddHours(-_serverTrendsHoursBack); + + return CompareToCombo.SelectedIndex switch + { + 1 => (currentStart.AddDays(-1), currentEnd.AddDays(-1)), // Yesterday + 2 => (currentStart.AddDays(-7), currentEnd.AddDays(-7)), // Last week + 3 => (currentStart.AddDays(-7), currentEnd.AddDays(-7)), // Same day last week + _ => null + }; + } + private async Task RefreshServerTrendsAsync() { if (_databaseService == null) return; try { - await CorrelatedLanes.RefreshAsync(_serverTrendsHoursBack, _serverTrendsFromDate, _serverTrendsToDate); + await CorrelatedLanes.RefreshAsync(_serverTrendsHoursBack, _serverTrendsFromDate, _serverTrendsToDate, ComparisonRange); } catch (Exception ex) { diff --git a/Dashboard/Helpers/CorrelatedCrosshairManager.cs b/Dashboard/Helpers/CorrelatedCrosshairManager.cs index 7da54004..c49b0a7b 100644 --- a/Dashboard/Helpers/CorrelatedCrosshairManager.cs +++ b/Dashboard/Helpers/CorrelatedCrosshairManager.cs @@ -1,7 +1,7 @@ /* * Copyright (c) 2026 Erik Darling, Darling Data LLC * - * This file is part of the SQL Server Performance Monitor Lite. + * This file is part of the SQL Server Performance Monitor Dashboard. * * Licensed under the MIT License. See LICENSE file in the project root for full license information. * @@ -15,6 +15,7 @@ using System.Windows; using System.Windows.Controls; using System.Windows.Controls.Primitives; +using System.Windows.Documents; using System.Windows.Input; using System.Windows.Media; using PerformanceMonitorDashboard.Services; @@ -62,14 +63,13 @@ public CorrelatedCrosshairManager() /// /// Registers a chart lane for crosshair synchronization. /// - public void AddLane(ScottPlot.WPF.WpfPlot chart, string label, string unit, TextBlock valueLabel) + public void AddLane(ScottPlot.WPF.WpfPlot chart, string label, string unit) { var lane = new LaneInfo { Chart = chart, Label = label, - Unit = unit, - ValueLabel = valueLabel + Unit = unit }; chart.MouseMove += (s, e) => OnMouseMove(lane, e); @@ -78,6 +78,21 @@ public void AddLane(ScottPlot.WPF.WpfPlot chart, string label, string unit, Text _lanes.Add(lane); } + /// + /// Sets the expected baseline range for a lane (upper/lower bounds). + /// Values outside this range get ▲/▼ indicators in the tooltip. + /// + public void SetLaneBaseline(ScottPlot.WPF.WpfPlot chart, double lower, double upper, + double minAnomalyValue = 0, bool isEventBased = false) + { + var lane = _lanes.Find(l => l.Chart == chart); + if (lane == null) return; + lane.BaselineLower = lower; + lane.BaselineUpper = upper; + lane.MinAnomalyValue = minAnomalyValue; + lane.IsEventBased = isEventBased; + } + /// /// Sets a single data series for a lane (most lanes have one series). /// @@ -117,6 +132,16 @@ public void AddLaneSeries(ScottPlot.WPF.WpfPlot chart, string name, string unit, }); } + /// + /// Sets the label shown in the tooltip for comparison data (e.g., "yesterday"). + /// + public void SetComparisonLabel(string label) + { + _comparisonLabel = label; + } + + private string? _comparisonLabel; + /// /// Clears data and VLines. Call before re-populating charts. /// @@ -124,10 +149,14 @@ public void PrepareForRefresh() { _isRefreshing = true; _tooltip.IsOpen = false; + _comparisonLabel = null; foreach (var lane in _lanes) { lane.Series.Clear(); lane.VLine = null; + lane.BaselineUpper = null; + lane.BaselineLower = null; + lane.MinAnomalyValue = 0; } } @@ -165,10 +194,14 @@ private void OnMouseMove(LaneInfo sourceLane, MouseEventArgs e) var mouseCoords = sourceLane.Chart.Plot.GetCoordinates(pixel); double xValue = mouseCoords.X; - var tooltipLines = new List(); + _tooltipText.Inlines.Clear(); var time = DateTime.FromOADate(xValue); var displayTime = ServerTimeHelper.ConvertForDisplay(time, ServerTimeHelper.CurrentDisplayMode); - tooltipLines.Add(displayTime.ToString("yyyy-MM-dd HH:mm:ss")); + _tooltipText.Inlines.Add(new Run(displayTime.ToString("yyyy-MM-dd HH:mm:ss"))); + if (_comparisonLabel != null) + _tooltipText.Inlines.Add(new Run($" (dashed = {_comparisonLabel})") { Foreground = DimBrush }); + + var defaultBrush = new SolidColorBrush(Color.FromRgb(0xE0, 0xE0, 0xE0)); foreach (var lane in _lanes) { @@ -179,51 +212,49 @@ private void OnMouseMove(LaneInfo sourceLane, MouseEventArgs e) if (lane.Series.Count == 1) { - // Single series — use lane label and unit var series = lane.Series[0]; double? value = FindNearestValue(series, xValue); if (value.HasValue) { - lane.ValueLabel.Text = $"{value.Value:N1} {lane.Unit}"; - tooltipLines.Add($"{lane.Label}: {value.Value:N1} {lane.Unit}"); + var indicator = GetBaselineIndicator(lane, value.Value); + + // Tooltip: value + arrow + "30d avg" context + _tooltipText.Inlines.Add(new Run($"\n{lane.Label}: {value.Value:N1} {lane.Unit}") { Foreground = defaultBrush }); + if (indicator != null) + { + _tooltipText.Inlines.Add(new Run($" {indicator.Value.Symbol}") { Foreground = indicator.Value.Brush }); + } } else { - lane.ValueLabel.Text = ""; - tooltipLines.Add($"{lane.Label}: —"); + _tooltipText.Inlines.Add(new Run($"\n{lane.Label}: —") { Foreground = defaultBrush }); } } else if (lane.Series.Count > 1) { - // Multiple series — show each with its own name - var valueParts = new List(); foreach (var series in lane.Series) { double? value = FindNearestValue(series, xValue); string unit = series.Unit ?? lane.Unit; if (value.HasValue) { - valueParts.Add($"{value.Value:N0}"); - tooltipLines.Add($"{series.Name}: {value.Value:N0} {unit}"); + _tooltipText.Inlines.Add(new Run($"\n{series.Name}: {value.Value:N0} {unit}") { Foreground = defaultBrush }); + var indicator = GetBaselineIndicator(lane, value.Value); + if (indicator != null) + _tooltipText.Inlines.Add(new Run($" {indicator.Value.Symbol}") { Foreground = indicator.Value.Brush }); } else - { - tooltipLines.Add($"{series.Name}: —"); - } + _tooltipText.Inlines.Add(new Run($"\n{series.Name}: —") { Foreground = defaultBrush }); } - lane.ValueLabel.Text = valueParts.Count > 0 ? string.Join("/", valueParts) : ""; } else { - lane.ValueLabel.Text = ""; - tooltipLines.Add($"{lane.Label}: —"); + _tooltipText.Inlines.Add(new Run($"\n{lane.Label}: —") { Foreground = defaultBrush }); } lane.Chart.Refresh(); } - - _tooltipText.Text = string.Join("\n", tooltipLines); _tooltip.PlacementTarget = sourceLane.Chart; _tooltip.HorizontalOffset = pos.X + 15; _tooltip.VerticalOffset = pos.Y + 15; @@ -265,6 +296,38 @@ private void OnMouseMove(LaneInfo sourceLane, MouseEventArgs e) return val; } + private static readonly SolidColorBrush RedBrush = new(Color.FromRgb(0xFF, 0x52, 0x52)); + private static readonly SolidColorBrush GreenBrush = new(Color.FromRgb(0x69, 0xF0, 0x69)); + private static readonly SolidColorBrush DimBrush = new(Color.FromRgb(0x90, 0x96, 0xA0)); + + private record struct BaselineIndicator(string Symbol, SolidColorBrush Brush); + + private static string? FormatBaselineContext(LaneInfo lane) + { + if (lane.BaselineUpper == null || lane.BaselineLower == null) return null; + var mean = (lane.BaselineUpper.Value + lane.BaselineLower.Value) / 2.0; + var formatted = mean >= 1000 ? $"{mean:N0}" : mean >= 10 ? $"{mean:N1}" : $"{mean:N2}"; + return $"30d avg: ~{formatted}"; + } + + private static BaselineIndicator? GetBaselineIndicator(LaneInfo lane, double value) + { + if (lane.BaselineUpper == null || lane.BaselineLower == null) return null; + // For event-based metrics (blocking/deadlocks): value significantly above + // the baseline mean is a spike, even if within the wide ± 2σ band. + // Uses 3x mean as threshold — if you normally see ~5 events and now see 20, that's a spike. + var mean = (lane.BaselineUpper.Value + lane.BaselineLower.Value) / 2.0; + if (lane.IsEventBased && value >= 1.0 && (mean < 1.0 || value > mean * 3)) + return new BaselineIndicator("▲", RedBrush); + // ▲ requires both: outside band AND above absolute minimum (prevents 1% CPU false alarms) + if (value > lane.BaselineUpper.Value && value >= lane.MinAnomalyValue) + return new BaselineIndicator("▲", RedBrush); + // ▼ always shown when below band (drops are always interesting — tuning feedback) + if (value < lane.BaselineLower.Value) + return new BaselineIndicator("▼", GreenBrush); + return null; + } + private void OnMouseLeave() { _tooltip.IsOpen = false; @@ -272,7 +335,6 @@ private void OnMouseLeave() { if (lane.VLine != null) lane.VLine.IsVisible = false; - lane.ValueLabel.Text = ""; lane.Chart.Refresh(); } } @@ -303,7 +365,10 @@ private class LaneInfo public string Label { get; set; } = ""; public string Unit { get; set; } = ""; public ScottPlot.Plottables.VerticalLine? VLine { get; set; } - public TextBlock ValueLabel { get; set; } = null!; public List Series { get; set; } = new(); + public double? BaselineUpper { get; set; } + public double? BaselineLower { get; set; } + public double MinAnomalyValue { get; set; } + public bool IsEventBased { get; set; } } } diff --git a/Dashboard/ServerTab.xaml.cs b/Dashboard/ServerTab.xaml.cs index b3e2872b..c207bad9 100644 --- a/Dashboard/ServerTab.xaml.cs +++ b/Dashboard/ServerTab.xaml.cs @@ -158,7 +158,8 @@ public ServerTab(ServerConnection serverConnection, int utcOffsetMinutes = 0) SetDrillDownGlobalRange(from, to); }; SystemEventsContent.Initialize(_databaseService); - ResourceMetricsContent.Initialize(_databaseService); + var baselineProvider = new Analysis.SqlServerBaselineProvider(_databaseService.ConnectionString); + ResourceMetricsContent.Initialize(_databaseService, baselineProvider); ResourceMetricsContent.ChartDrillDownRequested += OnChildChartDrillDown; // Set default time range on UserControls based on user preferences diff --git a/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs b/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs index d42a7bc5..d39775af 100644 --- a/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs +++ b/Lite/Controls/CorrelatedTimelineLanesControl.xaml.cs @@ -275,7 +275,7 @@ private void UpdateBlockingLane(List<(double Time, double Value)> blockingData, var upper = baseline.Mean + 2 * baseline.EffectiveStdDev; var lower = Math.Max(0, baseline.Mean - 2 * baseline.EffectiveStdDev); - _crosshairManager?.SetLaneBaseline(BlockingChart, lower, upper); + _crosshairManager?.SetLaneBaseline(BlockingChart, lower, upper, isEventBased: true); var band = BlockingChart.Plot.Add.HorizontalSpan(lower, upper); band.FillStyle.Color = ScottPlot.Color.FromHex("#E57373").WithAlpha(25); diff --git a/Lite/Helpers/CorrelatedCrosshairManager.cs b/Lite/Helpers/CorrelatedCrosshairManager.cs index 1cb52a12..75bce9bb 100644 --- a/Lite/Helpers/CorrelatedCrosshairManager.cs +++ b/Lite/Helpers/CorrelatedCrosshairManager.cs @@ -83,13 +83,14 @@ public void AddLane(ScottPlot.WPF.WpfPlot chart, string label, string unit) /// Values outside this range get ▲/▼ indicators in the tooltip. /// public void SetLaneBaseline(ScottPlot.WPF.WpfPlot chart, double lower, double upper, - double minAnomalyValue = 0) + double minAnomalyValue = 0, bool isEventBased = false) { var lane = _lanes.Find(l => l.Chart == chart); if (lane == null) return; lane.BaselineLower = lower; lane.BaselineUpper = upper; lane.MinAnomalyValue = minAnomalyValue; + lane.IsEventBased = isEventBased; } /// @@ -223,9 +224,6 @@ private void OnMouseMove(LaneInfo sourceLane, MouseEventArgs e) if (indicator != null) { _tooltipText.Inlines.Add(new Run($" {indicator.Value.Symbol}") { Foreground = indicator.Value.Brush }); - var ctx = FormatBaselineContext(lane); - if (ctx != null) - _tooltipText.Inlines.Add(new Run($" ({ctx})") { Foreground = DimBrush }); } } else @@ -240,7 +238,12 @@ private void OnMouseMove(LaneInfo sourceLane, MouseEventArgs e) double? value = FindNearestValue(series, xValue); string unit = series.Unit ?? lane.Unit; if (value.HasValue) + { _tooltipText.Inlines.Add(new Run($"\n{series.Name}: {value.Value:N0} {unit}") { Foreground = defaultBrush }); + var indicator = GetBaselineIndicator(lane, value.Value); + if (indicator != null) + _tooltipText.Inlines.Add(new Run($" {indicator.Value.Symbol}") { Foreground = indicator.Value.Brush }); + } else _tooltipText.Inlines.Add(new Run($"\n{series.Name}: —") { Foreground = defaultBrush }); } @@ -310,6 +313,12 @@ private record struct BaselineIndicator(string Symbol, SolidColorBrush Brush); private static BaselineIndicator? GetBaselineIndicator(LaneInfo lane, double value) { if (lane.BaselineUpper == null || lane.BaselineLower == null) return null; + // For event-based metrics (blocking/deadlocks): value significantly above + // the baseline mean is a spike, even if within the wide ± 2σ band. + // Uses 3x mean as threshold — if you normally see ~5 events and now see 20, that's a spike. + var mean = (lane.BaselineUpper.Value + lane.BaselineLower.Value) / 2.0; + if (lane.IsEventBased && value >= 1.0 && (mean < 1.0 || value > mean * 3)) + return new BaselineIndicator("▲", RedBrush); // ▲ requires both: outside band AND above absolute minimum (prevents 1% CPU false alarms) if (value > lane.BaselineUpper.Value && value >= lane.MinAnomalyValue) return new BaselineIndicator("▲", RedBrush); @@ -360,5 +369,6 @@ private class LaneInfo public double? BaselineUpper { get; set; } public double? BaselineLower { get; set; } public double MinAnomalyValue { get; set; } + public bool IsEventBased { get; set; } } }