From 3b43644beda1da5733b9c3ecb7f76c4a4cbc0bc2 Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Tue, 17 Mar 2026 11:34:01 -0400 Subject: [PATCH] =?UTF-8?q?Add=20anomaly=20detection=20=E2=80=94=20baselin?= =?UTF-8?q?e=20comparison=20for=20acute=20deviations=20(#589)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New AnomalyDetector compares the analysis window against a 24-hour baseline period to detect acute deviations. Detects: - CPU spikes: peak CPU deviation from baseline mean (σ-based scoring) - Wait spikes: wait types with 5x+ increase or new in analysis window - Blocking spikes: blocking/deadlock counts 3x+ above baseline - I/O latency anomalies: read/write latency deviation from baseline Scoring: CPU/IO use standard deviation (2σ=0.5, 4σ=1.0). Waits use ratio (5x=0.5, 20x=1.0). Blocking uses ratio (3x=0.5, 10x=1.0). Global safety: skips all anomaly detection when no baseline data exists (prevents everything looking anomalous on new servers). Uses strict boundary exclusion to prevent analysis window data leaking into baseline. Wired into AnalysisService pipeline between fact collection and scoring. Tool recommendations added for all anomaly fact types. Test scenarios: CPU spike anomaly (10% baseline → 95% spike), blocking spike anomaly (0 baseline → 50 events), wait spike anomaly (minimal PAGEIOLATCH baseline → 8M ms flood). 7 new tests, 138 total passing. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- Lite.Tests/ScenarioTests.cs | 104 +++++++ Lite/Analysis/AnalysisService.cs | 6 + Lite/Analysis/AnomalyDetector.cs | 507 +++++++++++++++++++++++++++++++ Lite/Analysis/FactScorer.cs | 40 ++- Lite/Analysis/TestDataSeeder.cs | 194 ++++++++++++ Lite/Mcp/McpAnalysisTools.cs | 34 ++- 6 files changed, 883 insertions(+), 2 deletions(-) create mode 100644 Lite/Analysis/AnomalyDetector.cs diff --git a/Lite.Tests/ScenarioTests.cs b/Lite.Tests/ScenarioTests.cs index faf2ae51..dcadf1af 100644 --- a/Lite.Tests/ScenarioTests.cs +++ b/Lite.Tests/ScenarioTests.cs @@ -359,6 +359,110 @@ public async Task EverythingOnFire_BlockingAndDeadlocksPresent() return (stories, factsByKey); } + /* ── Anomaly Detection: CPU Spike ── */ + + [Fact] + public async Task CpuSpikeAnomaly_DetectsCpuDeviation() + { + var (stories, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedCpuSpikeAnomalyAsync()); + PrintStories("CPU SPIKE ANOMALY", stories); + + Assert.True(facts.ContainsKey("ANOMALY_CPU_SPIKE"), "Should detect CPU anomaly"); + Assert.True(facts["ANOMALY_CPU_SPIKE"].Severity >= 0.5, "CPU anomaly severity should be significant"); + } + + [Fact] + public async Task CpuSpikeAnomaly_HighDeviation() + { + var (_, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedCpuSpikeAnomalyAsync()); + + var deviation = facts["ANOMALY_CPU_SPIKE"].Metadata["deviation_sigma"]; + Assert.True(deviation > 5.0, $"Expected large deviation (>5σ), got {deviation:F1}σ"); + } + + [Fact] + public async Task CpuSpikeAnomaly_AppearsAsStory() + { + var (stories, _) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedCpuSpikeAnomalyAsync()); + + Assert.Contains(stories, s => s.RootFactKey == "ANOMALY_CPU_SPIKE"); + } + + /* ── Anomaly Detection: Blocking Spike ── */ + + [Fact] + public async Task BlockingSpikeAnomaly_DetectsBlockingBurst() + { + var (stories, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedBlockingSpikeAnomalyAsync()); + 
PrintStories("BLOCKING SPIKE ANOMALY", stories); + + Assert.True(facts.ContainsKey("ANOMALY_BLOCKING_SPIKE"), "Should detect blocking spike"); + Assert.True(facts["ANOMALY_BLOCKING_SPIKE"].Severity >= 0.5, "Blocking spike should be significant"); + } + + [Fact] + public async Task BlockingSpikeAnomaly_DetectsDeadlockSpike() + { + var (_, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedBlockingSpikeAnomalyAsync()); + + Assert.True(facts.ContainsKey("ANOMALY_DEADLOCK_SPIKE"), "Should detect deadlock spike"); + } + + /* ── Anomaly Detection: Wait Spike ── */ + + [Fact] + public async Task WaitSpikeAnomaly_DetectsPageiolatchFlood() + { + var (stories, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedWaitSpikeAnomalyAsync()); + PrintStories("WAIT SPIKE ANOMALY", stories); + + Assert.True(facts.ContainsKey("ANOMALY_WAIT_PAGEIOLATCH_SH"), "Should detect PAGEIOLATCH spike"); + Assert.True(facts["ANOMALY_WAIT_PAGEIOLATCH_SH"].Severity >= 0.5, "PAGEIOLATCH anomaly should be significant"); + } + + [Fact] + public async Task WaitSpikeAnomaly_HighRatio() + { + var (_, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedWaitSpikeAnomalyAsync()); + + var ratio = facts["ANOMALY_WAIT_PAGEIOLATCH_SH"].Metadata["ratio"]; + Assert.True(ratio >= 5.0, $"Expected >= 5x increase, got {ratio:F1}x"); + } + + /* ── Helpers ── */ + + private async Task<(List Stories, Dictionary Facts)> RunFullPipelineWithAnomaliesAsync( + Func seedAction) + { + await _duckDb.InitializeAsync(); + await _duckDb.InitializeAnalysisSchemaAsync(); + + var seeder = new TestDataSeeder(_duckDb); + await seedAction(seeder); + + var collector = new DuckDbFactCollector(_duckDb); + var context = TestDataSeeder.CreateTestContext(); + var facts = await collector.CollectFactsAsync(context); + + // Run anomaly detection (compares analysis window against baseline) + var anomalyDetector = new AnomalyDetector(_duckDb); + var anomalies = await anomalyDetector.DetectAnomaliesAsync(context); + 
facts.AddRange(anomalies); + + var scorer = new FactScorer(); + scorer.ScoreAll(facts); + + var graph = new RelationshipGraph(); + var engine = new InferenceEngine(graph); + var stories = engine.BuildStories(facts); + + var factsByKey = facts + .Where(f => f.Severity > 0) + .ToDictionary(f => f.Key, f => f); + + return (stories, factsByKey); + } + private static void PrintStories(string scenario, List stories) { var output = TestContext.Current.TestOutputHelper!; diff --git a/Lite/Analysis/AnalysisService.cs b/Lite/Analysis/AnalysisService.cs index b09b04be..ad79a4e5 100644 --- a/Lite/Analysis/AnalysisService.cs +++ b/Lite/Analysis/AnalysisService.cs @@ -22,6 +22,7 @@ public class AnalysisService private readonly RelationshipGraph _graph; private readonly InferenceEngine _engine; private readonly DrillDownCollector _drillDown; + private readonly AnomalyDetector _anomalyDetector; /// /// Minimum hours of collected data required before analysis will run. /// Short collection windows distort fraction-of-period calculations — @@ -59,6 +60,7 @@ public AnalysisService(DuckDbInitializer duckDb) _graph = new RelationshipGraph(); _engine = new InferenceEngine(_graph); _drillDown = new DrillDownCollector(duckDb); + _anomalyDetector = new AnomalyDetector(duckDb); } /// @@ -126,6 +128,10 @@ public async Task> AnalyzeAsync(AnalysisContext context) return []; } + // 1.5. Detect anomalies (compare analysis window against baseline) + var anomalies = await _anomalyDetector.DetectAnomaliesAsync(context); + facts.AddRange(anomalies); + // 2. 
using System;
using System.Collections.Generic;
using System.Data.Common;
using System.Linq;
using System.Threading.Tasks;
using DuckDB.NET.Data;
using PerformanceMonitorLite.Database;
using PerformanceMonitorLite.Services;

namespace PerformanceMonitorLite.Analysis;

/// <summary>
/// Detects anomalies by comparing the analysis window's metrics against a
/// baseline period. When a metric deviates significantly from baseline
/// (mean + standard deviation, or a count/total ratio), an ANOMALY fact is emitted.
///
/// This is the "something just broke" mode: it detects acute deviations that do
/// not show up in aggregate analysis because they are brief. A 5-minute CPU spike
/// that averages out over 4 hours is invisible to aggregate scoring but obvious
/// when compared against "what was this metric doing before?"
///
/// Baseline selection: the 24 hours preceding the analysis window. If less data
/// is available, whatever exists is used and a reduced <c>confidence</c> value is
/// attached to the σ-based facts (CPU and I/O latency).
///
/// NOTE(review): wait and blocking ratios compare raw totals from the 24-hour
/// baseline against raw totals from the (usually shorter) analysis window, so a
/// steady workload produces a ratio below 1. Consider normalising both sides to a
/// per-hour rate; thresholds here and in FactScorer would need retuning together.
///
/// NOTE(review): the CPU window query treats TimeRangeEnd as exclusive while the
/// wait, blocking and I/O window queries treat it as inclusive. Confirm which
/// convention the fact collector uses and align them.
/// </summary>
public class AnomalyDetector
{
    private readonly DuckDbInitializer _duckDb;

    /// <summary>
    /// Minimum number of baseline samples needed for reliable detection.
    /// Below this, anomalies are still detected but with reduced confidence.
    /// </summary>
    private const int MinBaselineSamples = 10;

    /// <summary>
    /// Fewest baseline samples for which σ-based detection is attempted at all.
    /// </summary>
    private const int MinStatisticalSamples = 3;

    /// <summary>
    /// Number of standard deviations above baseline mean to flag as anomalous.
    /// </summary>
    private const double DeviationThreshold = 2.0;

    /// <summary>Peak CPU below this percentage is never flagged, however unusual.</summary>
    private const double MinPeakCpuPercent = 50;

    /// <summary>Current/baseline wait-time ratio required to flag a wait type.</summary>
    private const double MinWaitSpikeRatio = 5.0;

    /// <summary>A wait type absent from the baseline must exceed this total (ms) to be flagged.</summary>
    private const long NewWaitMinimumMs = 60_000;

    /// <summary>Ratio reported when the baseline value is zero (activity is entirely new).</summary>
    private const double NewActivityRatio = 100.0;

    /// <summary>Minimum blocking events in the window before a blocking spike is considered.</summary>
    private const long MinBlockingEvents = 5;

    /// <summary>Minimum deadlocks in the window before a deadlock spike is considered.</summary>
    private const long MinDeadlockEvents = 3;

    /// <summary>Current/baseline count ratio required to flag blocking or deadlocks.</summary>
    private const double MinCountSpikeRatio = 3.0;

    /// <summary>Average read latency (ms) must exceed this to be flagged.</summary>
    private const double MinReadLatencyMs = 10;

    /// <summary>Average write latency (ms) must exceed this to be flagged.</summary>
    private const double MinWriteLatencyMs = 5;

    public AnomalyDetector(DuckDbInitializer duckDb)
    {
        _duckDb = duckDb;
    }

    /// <summary>
    /// Detects anomalies by comparing the analysis window against a baseline period.
    /// Returns anomaly facts to be merged into the main fact list.
    /// </summary>
    /// <param name="context">Supplies the server id and the analysis window bounds.</param>
    /// <returns>
    /// The detected anomaly facts; empty when the baseline period holds no data.
    /// </returns>
    public async Task<List<Fact>> DetectAnomaliesAsync(AnalysisContext context)
    {
        var anomalies = new List<Fact>();

        // Baseline: 24 hours preceding the analysis window. The baseline end is
        // exclusive in every query so window data never leaks into the baseline.
        var baselineEnd = context.TimeRangeStart;
        var baselineStart = baselineEnd.AddHours(-24);

        // Without baseline data everything looks anomalous, so skip detection entirely.
        if (!await HasBaselineDataAsync(context.ServerId, baselineStart, baselineEnd))
        {
            return anomalies;
        }

        await DetectCpuAnomalies(context, baselineStart, baselineEnd, anomalies);
        await DetectWaitAnomalies(context, baselineStart, baselineEnd, anomalies);
        await DetectBlockingAnomalies(context, baselineStart, baselineEnd, anomalies);
        await DetectIoAnomalies(context, baselineStart, baselineEnd, anomalies);

        return anomalies;
    }

    /// <summary>
    /// Checks if the baseline period has any collected data, using wait stats and
    /// CPU utilization rows as the canary.
    /// </summary>
    /// <returns>
    /// <c>true</c> when at least one row exists; <c>false</c> when none exist or
    /// the check itself fails (the failure is logged).
    /// </returns>
    private async Task<bool> HasBaselineDataAsync(int serverId, DateTime baselineStart, DateTime baselineEnd)
    {
        try
        {
            using var readLock = _duckDb.AcquireReadLock();
            using var connection = _duckDb.CreateConnection();
            await connection.OpenAsync();

            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
SELECT (SELECT COUNT(*) FROM v_wait_stats
        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)
     + (SELECT COUNT(*) FROM v_cpu_utilization_stats
        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)";
            AddParameters(cmd, serverId, baselineStart, baselineEnd);

            var count = Convert.ToInt64(await cmd.ExecuteScalarAsync() ?? 0);
            return count > 0;
        }
        catch (Exception ex)
        {
            // Still best-effort (detection is skipped), but no longer silent.
            AppLogger.Error("AnomalyDetector", $"Baseline data check failed: {ex.Message}");
            return false;
        }
    }

    /// <summary>
    /// Detects CPU utilization anomalies by comparing the window's peak sample
    /// against the baseline mean and standard deviation.
    /// </summary>
    private async Task DetectCpuAnomalies(AnalysisContext context,
        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
    {
        try
        {
            using var readLock = _duckDb.AcquireReadLock();
            using var connection = _duckDb.CreateConnection();
            await connection.OpenAsync();

            double baselineMean;
            double baselineStdDev;
            long baselineSamples;

            // Baseline distribution. The reader is closed before the next command runs.
            using (var baselineCmd = connection.CreateCommand())
            {
                baselineCmd.CommandText = @"
SELECT AVG(sqlserver_cpu_utilization) AS mean_cpu,
       STDDEV_SAMP(sqlserver_cpu_utilization) AS stddev_cpu,
       COUNT(*) AS sample_count
FROM v_cpu_utilization_stats
WHERE server_id = $1
AND   collection_time >= $2 AND collection_time < $3";
                AddParameters(baselineCmd, context.ServerId, baselineStart, baselineEnd);

                using var baselineReader = await baselineCmd.ExecuteReaderAsync();
                if (!await baselineReader.ReadAsync())
                {
                    return;
                }

                baselineMean = ReadDouble(baselineReader, 0);
                baselineStdDev = ReadDouble(baselineReader, 1);
                baselineSamples = ReadInt64(baselineReader, 2);
            }

            // A zero standard deviation would make the σ computation meaningless.
            if (baselineSamples < MinStatisticalSamples || baselineStdDev <= 0)
            {
                return;
            }

            // Peak and average in the analysis window, plus when the peak occurred.
            using var windowCmd = connection.CreateCommand();
            windowCmd.CommandText = @"
SELECT MAX(sqlserver_cpu_utilization) AS peak_cpu,
       AVG(sqlserver_cpu_utilization) AS avg_cpu,
       COUNT(*) AS sample_count,
       (SELECT collection_time FROM v_cpu_utilization_stats
        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3
        ORDER BY sqlserver_cpu_utilization DESC LIMIT 1) AS peak_time
FROM v_cpu_utilization_stats
WHERE server_id = $1
AND   collection_time >= $2 AND collection_time < $3";
            AddParameters(windowCmd, context.ServerId, context.TimeRangeStart, context.TimeRangeEnd);

            using var windowReader = await windowCmd.ExecuteReaderAsync();
            if (!await windowReader.ReadAsync())
            {
                return;
            }

            var peakCpu = ReadDouble(windowReader, 0);
            var avgCpu = ReadDouble(windowReader, 1);
            var windowSamples = ReadInt64(windowReader, 2);
            var peakTime = windowReader.IsDBNull(3) ? (DateTime?)null : windowReader.GetDateTime(3);

            if (windowSamples == 0)
            {
                return;
            }

            // Flag only peaks that are both statistically unusual and high in absolute terms.
            var deviation = (peakCpu - baselineMean) / baselineStdDev;
            if (deviation < DeviationThreshold || peakCpu < MinPeakCpuPercent)
            {
                return;
            }

            anomalies.Add(new Fact
            {
                Source = "anomaly",
                Key = "ANOMALY_CPU_SPIKE",
                Value = peakCpu,
                ServerId = context.ServerId,
                Metadata = new Dictionary<string, double>
                {
                    ["peak_cpu"] = peakCpu,
                    ["avg_cpu_in_window"] = avgCpu,
                    ["baseline_mean"] = baselineMean,
                    ["baseline_stddev"] = baselineStdDev,
                    ["deviation_sigma"] = deviation,
                    ["baseline_samples"] = baselineSamples,
                    ["window_samples"] = windowSamples,
                    ["confidence"] = BaselineConfidence(baselineSamples),
                    ["peak_time_ticks"] = peakTime?.Ticks ?? 0
                }
            });
        }
        catch (Exception ex)
        {
            AppLogger.Error("AnomalyDetector", $"CPU anomaly detection failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Detects wait stat anomalies: wait types whose total in the analysis window
    /// is at least 5x the baseline total, or that are absent from the baseline
    /// and exceed one minute in the window. Only the ten largest wait types with
    /// more than 10 seconds of wait time in the window are examined.
    /// </summary>
    private async Task DetectWaitAnomalies(AnalysisContext context,
        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
    {
        try
        {
            using var readLock = _duckDb.AcquireReadLock();
            using var connection = _duckDb.CreateConnection();
            await connection.OpenAsync();

            // No baseline wait rows at all means every wait would look "new"; skip.
            using (var checkCmd = connection.CreateCommand())
            {
                checkCmd.CommandText = @"
SELECT COUNT(*) FROM v_wait_stats
WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3";
                AddParameters(checkCmd, context.ServerId, baselineStart, baselineEnd);

                var baselineCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
                if (baselineCount == 0)
                {
                    return;
                }
            }

            // Per-wait-type totals in both windows.
            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
WITH baseline AS (
    SELECT wait_type,
           SUM(delta_wait_time_ms)::BIGINT AS total_ms
    FROM v_wait_stats
    WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3
    AND   delta_wait_time_ms > 0
    GROUP BY wait_type
),
current_window AS (
    SELECT wait_type,
           SUM(delta_wait_time_ms)::BIGINT AS total_ms
    FROM v_wait_stats
    WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5
    AND   delta_wait_time_ms > 0
    GROUP BY wait_type
)
SELECT c.wait_type,
       c.total_ms AS current_ms,
       COALESCE(b.total_ms, 0) AS baseline_ms
FROM current_window c
LEFT JOIN baseline b ON c.wait_type = b.wait_type
WHERE c.total_ms > 10000 -- At least 10 seconds of wait time
ORDER BY c.total_ms DESC
LIMIT 10";
            AddParameters(cmd, context.ServerId, baselineStart, baselineEnd,
                context.TimeRangeStart, context.TimeRangeEnd);

            using var reader = await cmd.ExecuteReaderAsync();
            while (await reader.ReadAsync())
            {
                var waitType = reader.GetString(0);
                var currentMs = ReadInt64(reader, 1);
                var baselineMs = ReadInt64(reader, 2);

                var isNew = baselineMs == 0;
                double ratio;
                if (isNew)
                {
                    // New wait type: only flag once it exceeds a minute in total.
                    ratio = currentMs > NewWaitMinimumMs ? NewActivityRatio : 0;
                }
                else
                {
                    ratio = (double)currentMs / baselineMs;
                }

                if (ratio < MinWaitSpikeRatio)
                {
                    continue;
                }

                anomalies.Add(new Fact
                {
                    Source = "anomaly",
                    Key = $"ANOMALY_WAIT_{waitType}",
                    Value = currentMs,
                    ServerId = context.ServerId,
                    Metadata = new Dictionary<string, double>
                    {
                        ["current_ms"] = currentMs,
                        ["baseline_ms"] = baselineMs,
                        ["ratio"] = ratio,
                        ["is_new"] = isNew ? 1 : 0
                    }
                });
            }
        }
        catch (Exception ex)
        {
            AppLogger.Error("AnomalyDetector", $"Wait anomaly detection failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Detects blocking/deadlock anomalies: event counts in the analysis window
    /// that are at least 3x the baseline count, or non-trivial where the baseline
    /// had none.
    /// </summary>
    private async Task DetectBlockingAnomalies(AnalysisContext context,
        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
    {
        try
        {
            using var readLock = _duckDb.AcquireReadLock();
            using var connection = _duckDb.CreateConnection();
            await connection.OpenAsync();

            // A quiet baseline legitimately has zero blocking rows, so wait stats
            // are included to tell "no blocking" apart from "nothing was collected".
            using (var checkCmd = connection.CreateCommand())
            {
                checkCmd.CommandText = @"
SELECT (SELECT COUNT(*) FROM v_blocked_process_reports
        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)
     + (SELECT COUNT(*) FROM v_deadlocks
        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)
     + (SELECT COUNT(*) FROM v_wait_stats
        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)";
                AddParameters(checkCmd, context.ServerId, baselineStart, baselineEnd);

                var baselineDataCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
                if (baselineDataCount == 0)
                {
                    return; // No baseline data = can't detect anomaly
                }
            }

            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
SELECT
    (SELECT COUNT(*) FROM v_blocked_process_reports
     WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3) AS baseline_blocking,
    (SELECT COUNT(*) FROM v_blocked_process_reports
     WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5) AS current_blocking,
    (SELECT COUNT(*) FROM v_deadlocks
     WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3) AS baseline_deadlocks,
    (SELECT COUNT(*) FROM v_deadlocks
     WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5) AS current_deadlocks";
            AddParameters(cmd, context.ServerId, baselineStart, baselineEnd,
                context.TimeRangeStart, context.TimeRangeEnd);

            using var reader = await cmd.ExecuteReaderAsync();
            if (!await reader.ReadAsync())
            {
                return;
            }

            var baselineBlocking = ReadInt64(reader, 0);
            var currentBlocking = ReadInt64(reader, 1);
            var baselineDeadlocks = ReadInt64(reader, 2);
            var currentDeadlocks = ReadInt64(reader, 3);

            AddCountSpike(anomalies, context, "ANOMALY_BLOCKING_SPIKE",
                currentBlocking, baselineBlocking, MinBlockingEvents);
            AddCountSpike(anomalies, context, "ANOMALY_DEADLOCK_SPIKE",
                currentDeadlocks, baselineDeadlocks, MinDeadlockEvents);
        }
        catch (Exception ex)
        {
            AppLogger.Error("AnomalyDetector", $"Blocking anomaly detection failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Detects I/O latency anomalies: average read/write latency in the analysis
    /// window that sits at least <see cref="DeviationThreshold"/> baseline standard
    /// deviations above the baseline mean.
    /// </summary>
    private async Task DetectIoAnomalies(AnalysisContext context,
        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
    {
        try
        {
            using var readLock = _duckDb.AcquireReadLock();
            using var connection = _duckDb.CreateConnection();
            await connection.OpenAsync();

            using var cmd = connection.CreateCommand();
            cmd.CommandText = @"
WITH baseline AS (
    SELECT AVG(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS avg_read_lat,
           AVG(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS avg_write_lat,
           STDDEV_SAMP(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS stddev_read,
           STDDEV_SAMP(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS stddev_write,
           COUNT(*) AS samples
    FROM v_file_io_stats
    WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3
    AND   (delta_reads > 0 OR delta_writes > 0)
),
current_window AS (
    SELECT AVG(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS avg_read_lat,
           AVG(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS avg_write_lat
    FROM v_file_io_stats
    WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5
    AND   (delta_reads > 0 OR delta_writes > 0)
)
SELECT b.avg_read_lat, b.stddev_read, c.avg_read_lat,
       b.avg_write_lat, b.stddev_write, c.avg_write_lat,
       b.samples
FROM baseline b, current_window c";
            AddParameters(cmd, context.ServerId, baselineStart, baselineEnd,
                context.TimeRangeStart, context.TimeRangeEnd);

            using var reader = await cmd.ExecuteReaderAsync();
            if (!await reader.ReadAsync())
            {
                return;
            }

            var baselineReadLat = ReadDouble(reader, 0);
            var stddevRead = ReadDouble(reader, 1);
            var currentReadLat = ReadDouble(reader, 2);
            var baselineWriteLat = ReadDouble(reader, 3);
            var stddevWrite = ReadDouble(reader, 4);
            var currentWriteLat = ReadDouble(reader, 5);
            var samples = ReadInt64(reader, 6);

            if (samples < MinStatisticalSamples)
            {
                return;
            }

            AddLatencyAnomaly(anomalies, context, "ANOMALY_READ_LATENCY",
                currentReadLat, baselineReadLat, stddevRead, samples, MinReadLatencyMs);
            AddLatencyAnomaly(anomalies, context, "ANOMALY_WRITE_LATENCY",
                currentWriteLat, baselineWriteLat, stddevWrite, samples, MinWriteLatencyMs);
        }
        catch (Exception ex)
        {
            AppLogger.Error("AnomalyDetector", $"I/O anomaly detection failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Emits a count-spike fact when the window has at least
    /// <paramref name="minimumEvents"/> events and either the baseline had none or
    /// the count is at least <see cref="MinCountSpikeRatio"/> times the baseline.
    /// </summary>
    private static void AddCountSpike(List<Fact> anomalies, AnalysisContext context, string key,
        long currentCount, long baselineCount, long minimumEvents)
    {
        if (currentCount < minimumEvents)
        {
            return;
        }

        var ratio = baselineCount > 0 ? (double)currentCount / baselineCount : NewActivityRatio;
        if (ratio < MinCountSpikeRatio)
        {
            return;
        }

        anomalies.Add(new Fact
        {
            Source = "anomaly",
            Key = key,
            Value = currentCount,
            ServerId = context.ServerId,
            Metadata = new Dictionary<string, double>
            {
                ["current_count"] = currentCount,
                ["baseline_count"] = baselineCount,
                ["ratio"] = ratio
            }
        });
    }

    /// <summary>
    /// Emits a latency fact when the current average exceeds
    /// <paramref name="minimumLatencyMs"/> and deviates from the baseline mean by at
    /// least <see cref="DeviationThreshold"/> standard deviations. The fact carries
    /// a <c>confidence</c> value derived from the baseline sample count.
    /// </summary>
    private static void AddLatencyAnomaly(List<Fact> anomalies, AnalysisContext context, string key,
        double currentLatencyMs, double baselineMeanMs, double baselineStdDevMs,
        long baselineSamples, double minimumLatencyMs)
    {
        // Negated comparisons so that a NaN input skips the fact instead of emitting it.
        if (!(baselineStdDevMs > 0) || !(currentLatencyMs > minimumLatencyMs))
        {
            return;
        }

        var deviation = (currentLatencyMs - baselineMeanMs) / baselineStdDevMs;
        if (!(deviation >= DeviationThreshold))
        {
            return;
        }

        anomalies.Add(new Fact
        {
            Source = "anomaly",
            Key = key,
            Value = currentLatencyMs,
            ServerId = context.ServerId,
            Metadata = new Dictionary<string, double>
            {
                ["current_latency_ms"] = currentLatencyMs,
                ["baseline_mean_ms"] = baselineMeanMs,
                ["baseline_stddev_ms"] = baselineStdDevMs,
                ["deviation_sigma"] = deviation,
                ["baseline_samples"] = baselineSamples,
                ["confidence"] = BaselineConfidence(baselineSamples)
            }
        });
    }

    /// <summary>
    /// Returns 1.0 once the baseline has <see cref="MinBaselineSamples"/> samples,
    /// otherwise the fraction of that minimum that is available.
    /// </summary>
    private static double BaselineConfidence(long baselineSamples) =>
        baselineSamples >= MinBaselineSamples
            ? 1.0
            : (double)baselineSamples / MinBaselineSamples;

    /// <summary>
    /// Appends one positional parameter per value, in order ($1, $2, ...).
    /// </summary>
    private static void AddParameters(DbCommand command, params object[] values)
    {
        foreach (var value in values)
        {
            command.Parameters.Add(new DuckDBParameter { Value = value });
        }
    }

    /// <summary>Reads a numeric column as double, treating NULL as 0.</summary>
    private static double ReadDouble(DbDataReader reader, int ordinal) =>
        reader.IsDBNull(ordinal) ? 0.0 : Convert.ToDouble(reader.GetValue(ordinal));

    /// <summary>Reads a numeric column as long, treating NULL as 0.</summary>
    private static long ReadInt64(DbDataReader reader, int ordinal) =>
        reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));
}
"io", "tempdb", "memory", "queries", "perfmon", - "database_config", "jobs", "sessions", "disk", "bad_actor" }; + "database_config", "jobs", "sessions", "disk", "bad_actor", "anomaly" }; var factsByKey = facts .Where(f => f.BaseSeverity > 0 || contextSources.Contains(f.Source)) .ToDictionary(f => f.Key, f => f); @@ -300,6 +301,43 @@ private static double ScoreBadActorFact(Fact fact) return tierBase * impact; } + /// + /// Scores anomaly facts based on deviation from baseline. + /// At 2σ → 0.5, at 4σ → 1.0. Higher deviations are more severe. + /// For count-based anomalies (blocking/deadlock spikes), uses ratio instead. + /// + private static double ScoreAnomalyFact(Fact fact) + { + if (fact.Key.StartsWith("ANOMALY_CPU_SPIKE") || fact.Key.StartsWith("ANOMALY_READ_LATENCY") + || fact.Key.StartsWith("ANOMALY_WRITE_LATENCY")) + { + // Deviation-based scoring: 2σ = 0.5, 4σ = 1.0 + var deviation = fact.Metadata.GetValueOrDefault("deviation_sigma"); + var confidence = fact.Metadata.GetValueOrDefault("confidence", 1.0); + if (deviation < 2.0) return 0.0; + var base_score = 0.5 + 0.5 * Math.Min((deviation - 2.0) / 2.0, 1.0); + return base_score * confidence; + } + + if (fact.Key.StartsWith("ANOMALY_WAIT_")) + { + // Ratio-based scoring: 5x = 0.5, 20x = 1.0 + var ratio = fact.Metadata.GetValueOrDefault("ratio"); + if (ratio < 5) return 0.0; + return 0.5 + 0.5 * Math.Min((ratio - 5.0) / 15.0, 1.0); + } + + if (fact.Key.StartsWith("ANOMALY_BLOCKING_SPIKE") || fact.Key.StartsWith("ANOMALY_DEADLOCK_SPIKE")) + { + // Ratio-based: 3x = 0.5, 10x = 1.0 + var ratio = fact.Metadata.GetValueOrDefault("ratio"); + if (ratio < 3) return 0.0; + return 0.5 + 0.5 * Math.Min((ratio - 3.0) / 7.0, 1.0); + } + + return 0.0; + } + /// /// Generic threshold formula used by waits, latency, and count-based metrics. /// Critical == null means "concerning only" — hitting concerning = 1.0. 
diff --git a/Lite/Analysis/TestDataSeeder.cs b/Lite/Analysis/TestDataSeeder.cs index f47b82f7..1b29fe91 100644 --- a/Lite/Analysis/TestDataSeeder.cs +++ b/Lite/Analysis/TestDataSeeder.cs @@ -32,6 +32,12 @@ public class TestDataSeeder public static DateTime TestPeriodStart => _periodEnd.AddHours(-4); public static double TestPeriodDurationMs => (TestPeriodEnd - TestPeriodStart).TotalMilliseconds; + /// + /// Baseline period for anomaly detection: 24 hours before the analysis window. + /// + public static DateTime BaselineStart => TestPeriodStart.AddHours(-24); + public static DateTime BaselineEnd => TestPeriodStart; + private long _nextId = -1_000_000; public TestDataSeeder(DuckDbInitializer duckDb) @@ -503,6 +509,194 @@ await SeedDiskSpaceAsync( ("D:\\", 2_000_000, 140_000)); // 7% free — critical } + /// + /// CPU spike anomaly: server normally runs at 10% CPU, then spikes to 95%. + /// Baseline: 24h of steady ~10% CPU. + /// Analysis window: 4h with 95% peak CPU. + /// + /// Expected: ANOMALY_CPU_SPIKE with high deviation (~10σ+). + /// + public async Task SeedCpuSpikeAnomalyAsync() + { + await ClearTestDataAsync(); + await SeedTestServerAsync(); + + // Baseline: 24h of steady low CPU (10% avg, small variance) + await SeedCpuUtilizationInRangeAsync(BaselineStart, BaselineEnd, avgCpu: 10, variance: 3, samples: 96); + + // Analysis window: spike to 95% + await SeedCpuUtilizationInRangeAsync(TestPeriodStart, TestPeriodEnd, avgCpu: 95, variance: 5, samples: 16); + + // Need basic config for the analysis to run + await SeedServerConfigAsync(ctfp: 50, maxdop: 8, maxMemoryMb: 122_880); + await SeedMemoryStatsAsync(totalPhysicalMb: 131_072, bufferPoolMb: 100_000, targetMb: 122_880); + await SeedFileSizeAsync(totalDataSizeMb: 102_400); + await SeedServerEditionAsync(edition: 3, majorVersion: 16); + await SeedServerPropertiesAsync(cpuCount: 8, htRatio: 1, physicalMemMb: 131_072); + } + + /// + /// Blocking spike anomaly: normally no blocking, then sudden burst. 
+ /// Baseline: 24h with 0 blocking events. + /// Analysis window: 4h with 50 blocking events and 10 deadlocks. + /// + /// Expected: ANOMALY_BLOCKING_SPIKE, ANOMALY_DEADLOCK_SPIKE. + /// + public async Task SeedBlockingSpikeAnomalyAsync() + { + await ClearTestDataAsync(); + await SeedTestServerAsync(); + + // Baseline: some normal wait activity (no blocking/deadlocks) + await SeedWaitStatsInRangeAsync(BaselineStart, BaselineEnd, + new Dictionary + { + ["SOS_SCHEDULER_YIELD"] = (100_000, 500_000, 0), + }, samples: 24); + + // Analysis window: sudden blocking burst + await SeedBlockingEventsAsync(50, avgWaitTimeMs: 15_000, sleepingBlockerCount: 5); + await SeedDeadlocksAsync(10); + + // Some lock waits to corroborate + var waits = new Dictionary + { + ["LCK_M_X"] = (5_000_000, 200_000, 50_000), + ["LCK_M_S"] = (1_000_000, 100_000, 10_000), + }; + await SeedWaitStatsAsync(waits); + + await SeedServerConfigAsync(ctfp: 50, maxdop: 8, maxMemoryMb: 122_880); + await SeedMemoryStatsAsync(totalPhysicalMb: 131_072, bufferPoolMb: 100_000, targetMb: 122_880); + await SeedFileSizeAsync(totalDataSizeMb: 102_400); + await SeedServerEditionAsync(edition: 3, majorVersion: 16); + await SeedServerPropertiesAsync(cpuCount: 8, htRatio: 1, physicalMemMb: 131_072); + await SeedDatabaseConfigAsync( + ("AppDB1", false, false, false, "CHECKSUM"), + ("AppDB2", false, false, false, "CHECKSUM")); + } + + /// + /// Wait spike anomaly: normally low waits, then sudden PAGEIOLATCH flood. + /// Baseline: 24h with minimal PAGEIOLATCH. + /// Analysis window: 4h with massive PAGEIOLATCH. + /// + /// Expected: ANOMALY_WAIT_PAGEIOLATCH_SH with high ratio. 
+ /// + public async Task SeedWaitSpikeAnomalyAsync() + { + await ClearTestDataAsync(); + await SeedTestServerAsync(); + + // Baseline: 24h with minimal PAGEIOLATCH + await SeedWaitStatsInRangeAsync(BaselineStart, BaselineEnd, + new Dictionary + { + ["PAGEIOLATCH_SH"] = (50_000, 25_000, 1_000), // 50 seconds over 24h = noise + ["SOS_SCHEDULER_YIELD"] = (100_000, 500_000, 0), + }, samples: 24); + + // Analysis window: massive PAGEIOLATCH spike + var waits = new Dictionary + { + ["PAGEIOLATCH_SH"] = (8_000_000, 4_000_000, 100_000), // 8 million ms in 4h + ["SOS_SCHEDULER_YIELD"] = (100_000, 500_000, 0), // Normal + }; + await SeedWaitStatsAsync(waits); + + await SeedServerConfigAsync(ctfp: 50, maxdop: 8, maxMemoryMb: 122_880); + await SeedMemoryStatsAsync(totalPhysicalMb: 131_072, bufferPoolMb: 100_000, targetMb: 122_880); + await SeedFileSizeAsync(totalDataSizeMb: 102_400); + await SeedServerEditionAsync(edition: 3, majorVersion: 16); + await SeedServerPropertiesAsync(cpuCount: 8, htRatio: 1, physicalMemMb: 131_072); + } + + /// + /// Seeds CPU utilization data in a specific time range with variance. + /// Used for baseline + spike anomaly scenarios. 
+    ///
+    /// <param name="start">Timestamp of the first row; row i is written at start plus i intervals.</param>
+    /// <param name="end">End of the range. The range is divided into <paramref name="samples"/> equal intervals, so no row is written at this instant itself.</param>
+    /// <param name="avgCpu">Centre value for the generated sqlserver_cpu_utilization column.</param>
+    /// <param name="variance">Largest offset, in either direction, added to <paramref name="avgCpu"/> before clamping to 0..100.</param>
+    /// <param name="samples">Number of rows to insert. Zero or a negative value inserts nothing.</param>
+    internal async Task SeedCpuUtilizationInRangeAsync(DateTime start, DateTime end,
+        int avgCpu, int variance, int samples)
+    {
+        const string insertSql = @"
+INSERT INTO cpu_utilization_stats
+    (collection_id, collection_time, server_id, server_name,
+    sample_time, sqlserver_cpu_utilization, other_process_cpu_utilization)
+VALUES ($1, $2, $3, $4, $5, $6, $7)";
+
+        // Every row reports the same value for other_process_cpu_utilization.
+        const int otherProcessCpu = 2;
+
+        using var readLock = _duckDb.AcquireReadLock();
+        using var connection = _duckDb.CreateConnection();
+        await connection.OpenAsync();
+
+        var minutesPerSample = (end - start).TotalMinutes / samples;
+
+        // Fixed seed: the same sequence of offsets is produced on every call.
+        var offsetSource = new Random(42);
+
+        var sampleIndex = 0;
+        while (sampleIndex < samples)
+        {
+            var sampleTime = start.AddMinutes(sampleIndex * minutesPerSample);
+
+            // Next's upper bound is exclusive, hence variance + 1 to make the range symmetric.
+            var offset = offsetSource.Next(-variance, variance + 1);
+            var cpuPercent = Math.Clamp(avgCpu + offset, 0, 100);
+
+            using var insert = connection.CreateCommand();
+            insert.CommandText = insertSql;
+
+            // Positional values for $1..$7, in column order. The same timestamp is used for
+            // collection_time and sample_time, and _nextId is decremented once per row.
+            var columnValues = new object[]
+            {
+                _nextId--,
+                sampleTime,
+                TestServerId,
+                TestServerName,
+                sampleTime,
+                cpuPercent,
+                otherProcessCpu,
+            };
+
+            foreach (var columnValue in columnValues)
+            {
+                insert.Parameters.Add(new DuckDBParameter { Value = columnValue });
+            }
+
+            await insert.ExecuteNonQueryAsync();
+            sampleIndex++;
+        }
+    }
+
+    ///
+    /// Seeds wait stats in a specific time range. Used for baseline periods
+    /// in anomaly detection scenarios.
+    ///
+    /// <param name="start">Timestamp of the first row; row i is written at start plus i intervals.</param>
+    /// <param name="end">End of the range. The range is divided into <paramref name="samples"/> equal intervals, so no row is written at this instant itself.</param>
+    /// <param name="waits">Wait type name mapped to a (total wait ms, total waiting tasks, total signal wait ms) triple covering the whole range.</param>
+    /// <param name="samples">Number of rows to insert per wait type. Must be greater than zero.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="waits"/> is null.</exception>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="samples"/> is zero or negative.</exception>
+    internal async Task SeedWaitStatsInRangeAsync(DateTime start, DateTime end,
+        Dictionary waits, int samples)
+    {
+        if (waits is null)
+        {
+            throw new ArgumentNullException(nameof(waits));
+        }
+
+        // Reject bad sample counts before taking the lock or opening a connection.
+        // The per-sample values below use integer division, so zero would otherwise surface
+        // as a DivideByZeroException part-way through, and a negative count would skip the
+        // insert loop and silently seed nothing.
+        if (samples <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(samples), samples,
+                "Sample count must be greater than zero.");
+        }
+
+        const string insertSql = @"
+INSERT INTO wait_stats
+    (collection_id, collection_time, server_id, server_name,
+    wait_type, waiting_tasks_count, wait_time_ms, signal_wait_time_ms,
+    delta_waiting_tasks, delta_wait_time_ms, delta_signal_wait_time_ms)
+VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)";
+
+        using var readLock = _duckDb.AcquireReadLock();
+        using var connection = _duckDb.CreateConnection();
+        await connection.OpenAsync();
+
+        var interval = (end - start).TotalMinutes / samples;
+
+        foreach (var (waitType, (totalWaitTimeMs, totalWaitingTasks, totalSignalMs)) in waits)
+        {
+            // Even split of each total across the samples. Integer division truncates, so the
+            // deltas can add up to slightly less than the requested total.
+            var perSampleWaitMs = totalWaitTimeMs / samples;
+            var perSampleTasks = totalWaitingTasks / samples;
+            var perSampleSignal = totalSignalMs / samples;
+
+            for (var i = 0; i < samples; i++)
+            {
+                var t = start.AddMinutes(i * interval);
+
+                using var cmd = connection.CreateCommand();
+                cmd.CommandText = insertSql;
+
+                cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- });
+                cmd.Parameters.Add(new DuckDBParameter { Value = t });
+                cmd.Parameters.Add(new DuckDBParameter { Value = TestServerId });
+                cmd.Parameters.Add(new DuckDBParameter { Value = TestServerName });
+                cmd.Parameters.Add(new DuckDBParameter { Value = waitType });
+
+                // Cumulative columns: every row carries the full-range totals unchanged.
+                cmd.Parameters.Add(new DuckDBParameter { Value = totalWaitingTasks });
+                cmd.Parameters.Add(new DuckDBParameter { Value = totalWaitTimeMs });
+                cmd.Parameters.Add(new DuckDBParameter { Value = totalSignalMs });
+
+                // Delta columns: the per-sample share computed above.
+                cmd.Parameters.Add(new DuckDBParameter { Value = perSampleTasks });
+                cmd.Parameters.Add(new DuckDBParameter { Value = perSampleWaitMs });
+                cmd.Parameters.Add(new DuckDBParameter { Value = perSampleSignal });
+
+                await cmd.ExecuteNonQueryAsync();
+            }
+        }
+    }
+
     ///
     /// Removes all test data across all tables.
/// diff --git a/Lite/Mcp/McpAnalysisTools.cs b/Lite/Mcp/McpAnalysisTools.cs index 48c6acbf..9a194fc4 100644 --- a/Lite/Mcp/McpAnalysisTools.cs +++ b/Lite/Mcp/McpAnalysisTools.cs @@ -742,6 +742,30 @@ internal static class ToolRecommendations new("get_running_jobs", "See currently running jobs with duration vs historical"), new("get_cpu_utilization", "Check if long-running jobs are consuming CPU") ], + ["ANOMALY_CPU"] = + [ + new("get_cpu_utilization", "See CPU trend to identify when the spike occurred"), + new("get_active_queries", "Find what queries were running during the spike"), + new("get_top_queries_by_cpu", "Find the most CPU-expensive queries in the period") + ], + ["ANOMALY_WAIT"] = + [ + new("get_wait_stats", "See full wait stats breakdown"), + new("get_wait_trend", "Track the anomalous wait type over time"), + new("compare_analysis", "Compare current vs baseline to see what changed") + ], + ["ANOMALY_BLOCKING"] = + [ + new("get_blocked_process_reports", "Get detailed blocking event reports"), + new("get_deadlocks", "Get recent deadlock events"), + new("get_blocking_trend", "Track blocking frequency over time") + ], + ["ANOMALY_IO"] = + [ + new("get_file_io_stats", "Check per-file I/O latency"), + new("get_file_io_trend", "Track I/O latency over time"), + new("get_memory_stats", "Check if buffer pool is undersized") + ], ["BAD_ACTOR"] = [ new("get_top_queries_by_cpu", "See full query stats for this query"), @@ -769,9 +793,17 @@ public static List GetForStoryPath(string storyPath) { if (!ByFactKey.TryGetValue(key, out var recommendations)) { - // Handle dynamic keys like BAD_ACTOR_0x... 
by checking prefix + // Handle dynamic keys by checking prefix if (key.StartsWith("BAD_ACTOR_")) ByFactKey.TryGetValue("BAD_ACTOR", out recommendations); + else if (key.StartsWith("ANOMALY_CPU")) + ByFactKey.TryGetValue("ANOMALY_CPU", out recommendations); + else if (key.StartsWith("ANOMALY_WAIT_")) + ByFactKey.TryGetValue("ANOMALY_WAIT", out recommendations); + else if (key.StartsWith("ANOMALY_BLOCKING") || key.StartsWith("ANOMALY_DEADLOCK")) + ByFactKey.TryGetValue("ANOMALY_BLOCKING", out recommendations); + else if (key.StartsWith("ANOMALY_READ") || key.StartsWith("ANOMALY_WRITE")) + ByFactKey.TryGetValue("ANOMALY_IO", out recommendations); if (recommendations == null) continue; }