diff --git a/Lite.Tests/ScenarioTests.cs b/Lite.Tests/ScenarioTests.cs
index faf2ae51..dcadf1af 100644
--- a/Lite.Tests/ScenarioTests.cs
+++ b/Lite.Tests/ScenarioTests.cs
@@ -359,6 +359,127 @@ public async Task EverythingOnFire_BlockingAndDeadlocksPresent()
         return (stories, factsByKey);
     }
 
+    /* ── Anomaly Detection: CPU Spike ── */
+
+    [Fact]
+    public async Task CpuSpikeAnomaly_DetectsCpuDeviation()
+    {
+        var (stories, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedCpuSpikeAnomalyAsync());
+        PrintStories("CPU SPIKE ANOMALY", stories);
+
+        var anomaly = RequireFact(facts, "ANOMALY_CPU_SPIKE", "Should detect CPU anomaly");
+        Assert.True(anomaly.Severity >= 0.5, "CPU anomaly severity should be significant");
+    }
+
+    [Fact]
+    public async Task CpuSpikeAnomaly_HighDeviation()
+    {
+        var (_, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedCpuSpikeAnomalyAsync());
+
+        var anomaly = RequireFact(facts, "ANOMALY_CPU_SPIKE", "Should detect CPU anomaly");
+        var deviation = anomaly.Metadata["deviation_sigma"];
+        Assert.True(deviation > 5.0, $"Expected large deviation (>5σ), got {deviation:F1}σ");
+    }
+
+    [Fact]
+    public async Task CpuSpikeAnomaly_AppearsAsStory()
+    {
+        var (stories, _) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedCpuSpikeAnomalyAsync());
+
+        Assert.Contains(stories, s => s.RootFactKey == "ANOMALY_CPU_SPIKE");
+    }
+
+    /* ── Anomaly Detection: Blocking Spike ── */
+
+    [Fact]
+    public async Task BlockingSpikeAnomaly_DetectsBlockingBurst()
+    {
+        var (stories, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedBlockingSpikeAnomalyAsync());
+        PrintStories("BLOCKING SPIKE ANOMALY", stories);
+
+        var anomaly = RequireFact(facts, "ANOMALY_BLOCKING_SPIKE", "Should detect blocking spike");
+        Assert.True(anomaly.Severity >= 0.5, "Blocking spike should be significant");
+    }
+
+    [Fact]
+    public async Task BlockingSpikeAnomaly_DetectsDeadlockSpike()
+    {
+        var (_, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedBlockingSpikeAnomalyAsync());
+
+        Assert.True(facts.ContainsKey("ANOMALY_DEADLOCK_SPIKE"), "Should detect deadlock spike");
+    }
+
+    /* ── Anomaly Detection: Wait Spike ── */
+
+    [Fact]
+    public async Task WaitSpikeAnomaly_DetectsPageiolatchFlood()
+    {
+        var (stories, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedWaitSpikeAnomalyAsync());
+        PrintStories("WAIT SPIKE ANOMALY", stories);
+
+        var anomaly = RequireFact(facts, "ANOMALY_WAIT_PAGEIOLATCH_SH", "Should detect PAGEIOLATCH spike");
+        Assert.True(anomaly.Severity >= 0.5, "PAGEIOLATCH anomaly should be significant");
+    }
+
+    [Fact]
+    public async Task WaitSpikeAnomaly_HighRatio()
+    {
+        var (_, facts) = await RunFullPipelineWithAnomaliesAsync(s => s.SeedWaitSpikeAnomalyAsync());
+
+        var anomaly = RequireFact(facts, "ANOMALY_WAIT_PAGEIOLATCH_SH", "Should detect PAGEIOLATCH spike");
+        var ratio = anomaly.Metadata["ratio"];
+        Assert.True(ratio >= 5.0, $"Expected >= 5x increase, got {ratio:F1}x");
+    }
+
+    /* ── Helpers ── */
+
+    /// <summary>
+    /// Returns the fact stored under <paramref name="key"/>, failing the test with
+    /// <paramref name="because"/> (rather than a KeyNotFoundException) when it is missing.
+    /// </summary>
+    private static Fact RequireFact(Dictionary<string, Fact> facts, string key, string because)
+    {
+        Assert.True(facts.TryGetValue(key, out var fact), because);
+        return fact!;
+    }
+
+    /// <summary>
+    /// Seeds a scenario, collects facts, appends detected anomalies, scores everything and builds stories.
+    /// </summary>
+    /// <param name="seedAction">Seeds the scenario data through the supplied seeder.</param>
+    /// <returns>The built stories plus every fact with Severity above zero, keyed by fact key.</returns>
+    private async Task<(List<AnalysisStory> Stories, Dictionary<string, Fact> Facts)> RunFullPipelineWithAnomaliesAsync(
+        Func<TestDataSeeder, Task> seedAction)
+    {
+        await _duckDb.InitializeAsync();
+        await _duckDb.InitializeAnalysisSchemaAsync();
+
+        var seeder = new TestDataSeeder(_duckDb);
+        await seedAction(seeder);
+
+        var collector = new DuckDbFactCollector(_duckDb);
+        var context = TestDataSeeder.CreateTestContext();
+        var facts = await collector.CollectFactsAsync(context);
+
+        // Run anomaly detection (compares analysis window against baseline)
+        var anomalyDetector = new AnomalyDetector(_duckDb);
+        var anomalies = await anomalyDetector.DetectAnomaliesAsync(context);
+        facts.AddRange(anomalies);
+
+        var scorer = new FactScorer();
+        scorer.ScoreAll(facts);
+
+        var graph = new RelationshipGraph();
+        var engine = new InferenceEngine(graph);
+        var stories = engine.BuildStories(facts);
+
+        var factsByKey = facts
+            .Where(f => f.Severity > 0)
+            .ToDictionary(f => f.Key, f => f);
+
+        return (stories, factsByKey);
+    }
+
     private static void PrintStories(string scenario, List<AnalysisStory> stories)
     {
         var output = 
TestContext.Current.TestOutputHelper!;
diff --git a/Lite/Analysis/AnalysisService.cs b/Lite/Analysis/AnalysisService.cs
index b09b04be..ad79a4e5 100644
--- a/Lite/Analysis/AnalysisService.cs
+++ b/Lite/Analysis/AnalysisService.cs
@@ -22,6 +22,7 @@ public class AnalysisService
     private readonly RelationshipGraph _graph;
     private readonly InferenceEngine _engine;
     private readonly DrillDownCollector _drillDown;
+    private readonly AnomalyDetector _anomalyDetector;
     /// <summary>
     /// Minimum hours of collected data required before analysis will run.
     /// Short collection windows distort fraction-of-period calculations —
@@ -59,6 +60,7 @@ public AnalysisService(DuckDbInitializer duckDb)
         _graph = new RelationshipGraph();
         _engine = new InferenceEngine(_graph);
         _drillDown = new DrillDownCollector(duckDb);
+        _anomalyDetector = new AnomalyDetector(duckDb);
     }
 
     /// <summary>
@@ -126,6 +128,10 @@ public async Task> AnalyzeAsync(AnalysisContext context)
             return [];
         }
 
+        // 1.5. Detect anomalies (compare analysis window against baseline)
+        var anomalies = await _anomalyDetector.DetectAnomaliesAsync(context);
+        facts.AddRange(anomalies);
+
         // 2. Score facts (base severity + amplifiers)
         _scorer.ScoreAll(facts);
 
diff --git a/Lite/Analysis/AnomalyDetector.cs b/Lite/Analysis/AnomalyDetector.cs
new file mode 100644
index 00000000..73c728ef
--- /dev/null
+++ b/Lite/Analysis/AnomalyDetector.cs
@@ -0,0 +1,507 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using DuckDB.NET.Data;
+using PerformanceMonitorLite.Database;
+using PerformanceMonitorLite.Services;
+
+namespace PerformanceMonitorLite.Analysis;
+
+/// <summary>
+/// Detects anomalies by comparing the analysis window's metrics against a
+/// baseline period. When a metric deviates significantly from baseline
+/// (mean + standard deviation), an ANOMALY fact is emitted.
+///
+/// This is the "oh shit" mode — detecting acute deviations that don't show
+/// up in aggregate analysis because they're brief. A 5-minute CPU spike
+/// that averages out over 4 hours is invisible to aggregate scoring but
+/// obvious when compared against "what was this metric doing before?"
+///
+/// Baseline selection: uses the 24 hours preceding the analysis window.
+/// If less data is available, uses whatever exists with lower confidence.
+/// </summary>
+public class AnomalyDetector
+{
+    private readonly DuckDbInitializer _duckDb;
+
+    /// <summary>
+    /// Minimum number of baseline samples needed for reliable detection.
+    /// Below this, anomalies are still detected but with reduced confidence.
+    /// </summary>
+    private const int MinBaselineSamples = 10;
+
+    /// <summary>
+    /// Number of standard deviations above baseline mean to flag as anomalous.
+    /// </summary>
+    private const double DeviationThreshold = 2.0;
+
+    /// <summary>
+    /// Creates a detector that reads through the given DuckDB initializer.
+    /// </summary>
+    /// <param name="duckDb">Provides the read lock and connections used by every detection query.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="duckDb"/> is null.</exception>
+    public AnomalyDetector(DuckDbInitializer duckDb)
+    {
+        // Fail at construction instead of with a NullReferenceException on the first query.
+        _duckDb = duckDb ?? throw new ArgumentNullException(nameof(duckDb));
+    }
+
+    /// <summary>
+    /// Detects anomalies by comparing the analysis window against a baseline period.
+    /// Returns anomaly facts to be merged into the main fact list.
+    /// </summary>
+    /// <param name="context">Supplies the server id and the analysis window start.</param>
+    /// <returns>
+    /// The detected anomaly facts; empty when the baseline period holds no data.
+    /// </returns>
+    public async Task<List<Fact>> DetectAnomaliesAsync(AnalysisContext context)
+    {
+        var anomalies = new List<Fact>();
+
+        // Baseline: 24 hours preceding the analysis window
+        var baselineEnd = context.TimeRangeStart;
+        var baselineStart = baselineEnd.AddHours(-24);
+
+        // Check if baseline period has any data at all — if not, skip all anomaly detection.
+        // Without baseline data, everything looks anomalous.
+        if (!await HasBaselineDataAsync(context.ServerId, baselineStart, baselineEnd))
+            return anomalies;
+
+        // Each detector appends to the shared list and handles its own failures.
+        await DetectCpuAnomalies(context, baselineStart, baselineEnd, anomalies);
+        await DetectWaitAnomalies(context, baselineStart, baselineEnd, anomalies);
+        await DetectBlockingAnomalies(context, baselineStart, baselineEnd, anomalies);
+        await DetectIoAnomalies(context, baselineStart, baselineEnd, anomalies);
+
+        return anomalies;
+    }
+
+    /// <summary>
+    /// Checks if the baseline period has any collected data.
+    /// Counts rows in v_wait_stats and v_cpu_utilization_stats as canaries — if either is
+    /// being collected, the other collectors are assumed to be running too.
+    /// </summary>
+    /// <param name="serverId">Server whose rows are counted.</param>
+    /// <param name="baselineStart">Inclusive start of the baseline period.</param>
+    /// <param name="baselineEnd">Exclusive end of the baseline period.</param>
+    /// <returns>
+    /// True when at least one row exists in the period; false when none exist or the check fails.
+    /// </returns>
+    private async Task<bool> HasBaselineDataAsync(int serverId, DateTime baselineStart, DateTime baselineEnd)
+    {
+        try
+        {
+            using var readLock = _duckDb.AcquireReadLock();
+            using var connection = _duckDb.CreateConnection();
+            await connection.OpenAsync();
+
+            using var cmd = connection.CreateCommand();
+            cmd.CommandText = @"
+SELECT (SELECT COUNT(*) FROM v_wait_stats
+        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)
+     + (SELECT COUNT(*) FROM v_cpu_utilization_stats
+        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)";
+            cmd.Parameters.Add(new DuckDBParameter { Value = serverId });
+            cmd.Parameters.Add(new DuckDBParameter { Value = baselineStart });
+            cmd.Parameters.Add(new DuckDBParameter { Value = baselineEnd });
+
+            var count = Convert.ToInt64(await cmd.ExecuteScalarAsync() ?? 0);
+            return count > 0;
+        }
+        catch (Exception ex)
+        {
+            // Still best-effort (a failed check means "no baseline"), but leave a trace:
+            // returning false here switches off every anomaly detector for this run.
+            AppLogger.Error("AnomalyDetector", $"Baseline data check failed: {ex.Message}");
+            return false;
+        }
+    }
+
+    /// <summary>
+    /// Detects CPU utilization anomalies by comparing per-sample values
+    /// against the baseline distribution.
+    /// </summary>
+    /// <param name="context">Supplies the server id and the analysis window bounds.</param>
+    /// <param name="baselineStart">Inclusive start of the baseline period.</param>
+    /// <param name="baselineEnd">Exclusive end of the baseline period.</param>
+    /// <param name="anomalies">Receives an ANOMALY_CPU_SPIKE fact when the window peak qualifies.</param>
+    private async Task DetectCpuAnomalies(AnalysisContext context,
+        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
+    {
+        try
+        {
+            using var readLock = _duckDb.AcquireReadLock();
+            using var connection = _duckDb.CreateConnection();
+            await connection.OpenAsync();
+
+            // Get baseline stats
+            using var baselineCmd = connection.CreateCommand();
+            baselineCmd.CommandText = @"
+SELECT AVG(sqlserver_cpu_utilization) AS mean_cpu,
+       STDDEV_SAMP(sqlserver_cpu_utilization) AS stddev_cpu,
+       COUNT(*) AS sample_count
+FROM v_cpu_utilization_stats
+WHERE server_id = $1
+AND collection_time >= $2 AND collection_time < $3";
+
+            AddParameters(baselineCmd, context.ServerId, baselineStart, baselineEnd);
+
+            double baselineMean = 0, baselineStdDev = 0;
+            long baselineSamples = 0;
+
+            using (var reader = await baselineCmd.ExecuteReaderAsync())
+            {
+                if (await reader.ReadAsync())
+                {
+                    baselineMean = ReadDouble(reader, 0);
+                    baselineStdDev = ReadDouble(reader, 1);
+                    baselineSamples = ReadInt64(reader, 2);
+                }
+            }
+
+            // Too few samples, or no spread at all, leaves the sigma distance undefined.
+            if (baselineSamples < 3 || baselineStdDev <= 0) return;
+
+            // Get peak and average in the analysis window
+            using var windowCmd = connection.CreateCommand();
+            windowCmd.CommandText = @"
+SELECT MAX(sqlserver_cpu_utilization) AS peak_cpu,
+       AVG(sqlserver_cpu_utilization) AS avg_cpu,
+       COUNT(*) AS sample_count,
+       (SELECT collection_time FROM v_cpu_utilization_stats
+        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3
+        ORDER BY sqlserver_cpu_utilization DESC LIMIT 1) AS peak_time
+FROM v_cpu_utilization_stats
+WHERE server_id = $1
+AND collection_time >= $2 AND collection_time < $3";
+
+            AddParameters(windowCmd, context.ServerId, context.TimeRangeStart, context.TimeRangeEnd);
+
+            using var windowReader = await windowCmd.ExecuteReaderAsync();
+            if (!await windowReader.ReadAsync()) return;
+
+            var peakCpu = ReadDouble(windowReader, 0);
+            var avgCpu = ReadDouble(windowReader, 1);
+            var windowSamples = ReadInt64(windowReader, 2);
+            var peakTime = windowReader.IsDBNull(3) ? (DateTime?)null : windowReader.GetDateTime(3);
+
+            if (windowSamples == 0) return;
+
+            // Check if peak deviates significantly from baseline
+            var deviation = (peakCpu - baselineMean) / baselineStdDev;
+            if (deviation < DeviationThreshold || peakCpu < 50) return; // Don't flag low absolute values
+
+            anomalies.Add(new Fact
+            {
+                Source = "anomaly",
+                Key = "ANOMALY_CPU_SPIKE",
+                Value = peakCpu,
+                ServerId = context.ServerId,
+                Metadata = new Dictionary<string, double>
+                {
+                    ["peak_cpu"] = peakCpu,
+                    ["avg_cpu_in_window"] = avgCpu,
+                    ["baseline_mean"] = baselineMean,
+                    ["baseline_stddev"] = baselineStdDev,
+                    ["deviation_sigma"] = deviation,
+                    ["baseline_samples"] = baselineSamples,
+                    ["window_samples"] = windowSamples,
+                    ["confidence"] = ConfidenceFor(baselineSamples),
+                    ["peak_time_ticks"] = peakTime?.Ticks ?? 0
+                }
+            });
+        }
+        catch (Exception ex)
+        {
+            AppLogger.Error("AnomalyDetector", $"CPU anomaly detection failed: {ex.Message}");
+        }
+    }
+
+    /// <summary>
+    /// Detects wait stat anomalies — significant waits in the analysis window
+    /// that were absent or much lower in the baseline.
+    /// </summary>
+    /// <param name="context">Supplies the server id and the analysis window bounds.</param>
+    /// <param name="baselineStart">Inclusive start of the baseline period.</param>
+    /// <param name="baselineEnd">Exclusive end of the baseline period.</param>
+    /// <param name="anomalies">Receives one ANOMALY_WAIT_{wait_type} fact per qualifying wait type.</param>
+    private async Task DetectWaitAnomalies(AnalysisContext context,
+        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
+    {
+        try
+        {
+            using var readLock = _duckDb.AcquireReadLock();
+            using var connection = _duckDb.CreateConnection();
+            await connection.OpenAsync();
+
+            // Check if baseline has any wait data at all — if not, skip
+            using var checkCmd = connection.CreateCommand();
+            checkCmd.CommandText = @"
+SELECT COUNT(*) FROM v_wait_stats
+WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3";
+            AddParameters(checkCmd, context.ServerId, baselineStart, baselineEnd);
+            var baselineCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
+            if (baselineCount == 0) return;
+
+            // Get per-wait-type totals in both windows
+            using var cmd = connection.CreateCommand();
+            cmd.CommandText = @"
+WITH baseline AS (
+    SELECT wait_type,
+           SUM(delta_wait_time_ms)::BIGINT AS total_ms
+    FROM v_wait_stats
+    WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3
+    AND delta_wait_time_ms > 0
+    GROUP BY wait_type
+),
+current_window AS (
+    SELECT wait_type,
+           SUM(delta_wait_time_ms)::BIGINT AS total_ms
+    FROM v_wait_stats
+    WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5
+    AND delta_wait_time_ms > 0
+    GROUP BY wait_type
+)
+SELECT c.wait_type,
+       c.total_ms AS current_ms,
+       COALESCE(b.total_ms, 0) AS baseline_ms
+FROM current_window c
+LEFT JOIN baseline b ON c.wait_type = b.wait_type
+WHERE c.total_ms > 10000  -- At least 10 seconds of wait time
+ORDER BY c.total_ms DESC
+LIMIT 10";
+
+            AddParameters(cmd, context.ServerId, baselineStart, baselineEnd,
+                context.TimeRangeStart, context.TimeRangeEnd);
+
+            using var reader = await cmd.ExecuteReaderAsync();
+            while (await reader.ReadAsync())
+            {
+                var waitType = reader.GetString(0);
+                var currentMs = Convert.ToInt64(reader.GetValue(1));
+                var baselineMs = Convert.ToInt64(reader.GetValue(2));
+
+                // New wait (absent in baseline) or 5x+ increase.
+                // NOTE(review): the ratio divides the window total by the baseline total as-is;
+                // the two periods are passed in separately and may differ in length, so this is
+                // not a rate comparison — confirm that is intended.
+                var isNew = baselineMs == 0;
+                double ratio;
+                if (isNew)
+                    ratio = currentMs > 60_000 ? 100.0 : 0; // Only flag if > 1 minute total
+                else
+                    ratio = (double)currentMs / baselineMs;
+
+                if (ratio < 5.0) continue; // Need at least 5x increase
+
+                anomalies.Add(new Fact
+                {
+                    Source = "anomaly",
+                    Key = $"ANOMALY_WAIT_{waitType}",
+                    Value = currentMs,
+                    ServerId = context.ServerId,
+                    Metadata = new Dictionary<string, double>
+                    {
+                        ["current_ms"] = currentMs,
+                        ["baseline_ms"] = baselineMs,
+                        ["ratio"] = ratio,
+                        ["is_new"] = isNew ? 1 : 0
+                    }
+                });
+            }
+        }
+        catch (Exception ex)
+        {
+            AppLogger.Error("AnomalyDetector", $"Wait anomaly detection failed: {ex.Message}");
+        }
+    }
+
+    /// <summary>
+    /// Detects blocking/deadlock anomalies — events in the analysis window
+    /// that are significantly above baseline rates.
+    /// </summary>
+    /// <param name="context">Supplies the server id and the analysis window bounds.</param>
+    /// <param name="baselineStart">Inclusive start of the baseline period.</param>
+    /// <param name="baselineEnd">Exclusive end of the baseline period.</param>
+    /// <param name="anomalies">Receives ANOMALY_BLOCKING_SPIKE and/or ANOMALY_DEADLOCK_SPIKE facts.</param>
+    private async Task DetectBlockingAnomalies(AnalysisContext context,
+        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
+    {
+        try
+        {
+            using var readLock = _duckDb.AcquireReadLock();
+            using var connection = _duckDb.CreateConnection();
+            await connection.OpenAsync();
+
+            // Check if baseline period has any data at all
+            using var checkCmd = connection.CreateCommand();
+            checkCmd.CommandText = @"
+SELECT (SELECT COUNT(*) FROM v_blocked_process_reports
+        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)
+     + (SELECT COUNT(*) FROM v_deadlocks
+        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)
+     + (SELECT COUNT(*) FROM v_wait_stats
+        WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3)";
+            AddParameters(checkCmd, context.ServerId, baselineStart, baselineEnd);
+            var baselineDataCount = Convert.ToInt64(await checkCmd.ExecuteScalarAsync() ?? 0);
+            if (baselineDataCount == 0) return; // No baseline data = can't detect anomaly
+
+            using var cmd = connection.CreateCommand();
+            cmd.CommandText = @"
+SELECT
+    (SELECT COUNT(*) FROM v_blocked_process_reports
+     WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3) AS baseline_blocking,
+    (SELECT COUNT(*) FROM v_blocked_process_reports
+     WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5) AS current_blocking,
+    (SELECT COUNT(*) FROM v_deadlocks
+     WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3) AS baseline_deadlocks,
+    (SELECT COUNT(*) FROM v_deadlocks
+     WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5) AS current_deadlocks";
+
+            AddParameters(cmd, context.ServerId, baselineStart, baselineEnd,
+                context.TimeRangeStart, context.TimeRangeEnd);
+
+            using var reader = await cmd.ExecuteReaderAsync();
+            if (!await reader.ReadAsync()) return;
+
+            var baselineBlocking = Convert.ToInt64(reader.GetValue(0));
+            var currentBlocking = Convert.ToInt64(reader.GetValue(1));
+            var baselineDeadlocks = Convert.ToInt64(reader.GetValue(2));
+            var currentDeadlocks = Convert.ToInt64(reader.GetValue(3));
+
+            // Blocking spike: at least 5 events AND 3x baseline (or new)
+            AddCountSpike(anomalies, context.ServerId, "ANOMALY_BLOCKING_SPIKE",
+                currentBlocking, baselineBlocking, minimumEvents: 5);
+
+            // Deadlock spike: at least 3 events AND 3x baseline (or new)
+            AddCountSpike(anomalies, context.ServerId, "ANOMALY_DEADLOCK_SPIKE",
+                currentDeadlocks, baselineDeadlocks, minimumEvents: 3);
+        }
+        catch (Exception ex)
+        {
+            AppLogger.Error("AnomalyDetector", $"Blocking anomaly detection failed: {ex.Message}");
+        }
+    }
+
+    /// <summary>
+    /// Detects I/O latency anomalies — significant increase in read/write latency
+    /// compared to baseline.
+    /// </summary>
+    /// <param name="context">Supplies the server id and the analysis window bounds.</param>
+    /// <param name="baselineStart">Inclusive start of the baseline period.</param>
+    /// <param name="baselineEnd">Exclusive end of the baseline period.</param>
+    /// <param name="anomalies">Receives ANOMALY_READ_LATENCY and/or ANOMALY_WRITE_LATENCY facts.</param>
+    private async Task DetectIoAnomalies(AnalysisContext context,
+        DateTime baselineStart, DateTime baselineEnd, List<Fact> anomalies)
+    {
+        try
+        {
+            using var readLock = _duckDb.AcquireReadLock();
+            using var connection = _duckDb.CreateConnection();
+            await connection.OpenAsync();
+
+            using var cmd = connection.CreateCommand();
+            cmd.CommandText = @"
+WITH baseline AS (
+    SELECT AVG(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS avg_read_lat,
+           AVG(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS avg_write_lat,
+           STDDEV_SAMP(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS stddev_read,
+           STDDEV_SAMP(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS stddev_write,
+           COUNT(*) AS samples
+    FROM v_file_io_stats
+    WHERE server_id = $1 AND collection_time >= $2 AND collection_time < $3
+    AND (delta_reads > 0 OR delta_writes > 0)
+),
+current_window AS (
+    SELECT AVG(delta_stall_read_ms * 1.0 / NULLIF(delta_reads, 0)) AS avg_read_lat,
+           AVG(delta_stall_write_ms * 1.0 / NULLIF(delta_writes, 0)) AS avg_write_lat
+    FROM v_file_io_stats
+    WHERE server_id = $1 AND collection_time >= $4 AND collection_time <= $5
+    AND (delta_reads > 0 OR delta_writes > 0)
+)
+SELECT b.avg_read_lat, b.stddev_read, c.avg_read_lat,
+       b.avg_write_lat, b.stddev_write, c.avg_write_lat,
+       b.samples
+FROM baseline b, current_window c";
+
+            AddParameters(cmd, context.ServerId, baselineStart, baselineEnd,
+                context.TimeRangeStart, context.TimeRangeEnd);
+
+            using var reader = await cmd.ExecuteReaderAsync();
+            if (!await reader.ReadAsync()) return;
+
+            var baselineReadLat = ReadDouble(reader, 0);
+            var stddevRead = ReadDouble(reader, 1);
+            var currentReadLat = ReadDouble(reader, 2);
+            var baselineWriteLat = ReadDouble(reader, 3);
+            var stddevWrite = ReadDouble(reader, 4);
+            var currentWriteLat = ReadDouble(reader, 5);
+            var samples = ReadInt64(reader, 6);
+
+            if (samples < 3) return;
+
+            // Read latency anomaly: at least 10ms to matter
+            AddLatencyAnomaly(anomalies, context.ServerId, "ANOMALY_READ_LATENCY",
+                currentReadLat, baselineReadLat, stddevRead, minimumLatencyMs: 10, samples);
+
+            // Write latency anomaly: at least 5ms to matter
+            AddLatencyAnomaly(anomalies, context.ServerId, "ANOMALY_WRITE_LATENCY",
+                currentWriteLat, baselineWriteLat, stddevWrite, minimumLatencyMs: 5, samples);
+        }
+        catch (Exception ex)
+        {
+            AppLogger.Error("AnomalyDetector", $"I/O anomaly detection failed: {ex.Message}");
+        }
+    }
+
+    /// <summary>Appends positional parameters ($1, $2, …) to <paramref name="command"/> in order.</summary>
+    private static void AddParameters(System.Data.Common.DbCommand command, params object[] values)
+    {
+        foreach (var value in values)
+        {
+            command.Parameters.Add(new DuckDBParameter { Value = value });
+        }
+    }
+
+    /// <summary>Reads column <paramref name="ordinal"/> as a double, mapping NULL to 0.</summary>
+    private static double ReadDouble(System.Data.Common.DbDataReader reader, int ordinal) =>
+        reader.IsDBNull(ordinal) ? 0.0 : Convert.ToDouble(reader.GetValue(ordinal));
+
+    /// <summary>Reads column <paramref name="ordinal"/> as a long, mapping NULL to 0.</summary>
+    private static long ReadInt64(System.Data.Common.DbDataReader reader, int ordinal) =>
+        reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));
+
+    /// <summary>
+    /// Confidence multiplier for deviation-based anomalies: 1.0 once the baseline holds
+    /// MinBaselineSamples samples, otherwise the fraction of that minimum actually present.
+    /// </summary>
+    private static double ConfidenceFor(long baselineSamples) =>
+        baselineSamples >= MinBaselineSamples ? 1.0 : (double)baselineSamples / MinBaselineSamples;
+
+    /// <summary>
+    /// Adds a count-spike fact when <paramref name="currentCount"/> reaches
+    /// <paramref name="minimumEvents"/> and is either new (baseline of zero) or at least 3x the baseline.
+    /// A baseline of zero is reported with a ratio of 100.
+    /// </summary>
+    private static void AddCountSpike(List<Fact> anomalies, int serverId, string key,
+        long currentCount, long baselineCount, long minimumEvents)
+    {
+        if (currentCount < minimumEvents) return;
+
+        var ratio = baselineCount > 0 ? (double)currentCount / baselineCount : 100;
+        if (baselineCount != 0 && ratio < 3) return;
+
+        anomalies.Add(new Fact
+        {
+            Source = "anomaly",
+            Key = key,
+            Value = currentCount,
+            ServerId = serverId,
+            Metadata = new Dictionary<string, double>
+            {
+                ["current_count"] = currentCount,
+                ["baseline_count"] = baselineCount,
+                ["ratio"] = ratio
+            }
+        });
+    }
+
+    /// <summary>
+    /// Adds a latency fact when the baseline has spread, the window latency exceeds
+    /// <paramref name="minimumLatencyMs"/>, and it sits at least DeviationThreshold sigmas above the baseline mean.
+    /// The fact carries a "confidence" value so a thin baseline lowers its score, as it does for CPU.
+    /// </summary>
+    private static void AddLatencyAnomaly(List<Fact> anomalies, int serverId, string key,
+        double currentLatency, double baselineMean, double baselineStdDev,
+        double minimumLatencyMs, long baselineSamples)
+    {
+        if (baselineStdDev <= 0 || currentLatency <= minimumLatencyMs) return;
+
+        var deviation = (currentLatency - baselineMean) / baselineStdDev;
+        if (deviation < DeviationThreshold) return;
+
+        anomalies.Add(new Fact
+        {
+            Source = "anomaly",
+            Key = key,
+            Value = currentLatency,
+            ServerId = serverId,
+            Metadata = new Dictionary<string, double>
+            {
+                ["current_latency_ms"] = currentLatency,
+                ["baseline_mean_ms"] = baselineMean,
+                ["baseline_stddev_ms"] = baselineStdDev,
+                ["deviation_sigma"] = deviation,
+                ["baseline_samples"] = baselineSamples,
+                ["confidence"] = ConfidenceFor(baselineSamples)
+            }
+        });
+    }
+}
diff --git a/Lite/Analysis/FactScorer.cs b/Lite/Analysis/FactScorer.cs
index 4c35efe1..a9fc97e5 100644
--- a/Lite/Analysis/FactScorer.cs
+++ b/Lite/Analysis/FactScorer.cs
@@ -37,6 +37,7 @@ public void ScoreAll(List<Fact> facts)
                 "jobs" => ScoreJobFact(fact),
                 "disk" => ScoreDiskFact(fact),
                 "bad_actor" => ScoreBadActorFact(fact),
+                "anomaly" => ScoreAnomalyFact(fact),
                 _ => 0.0
             };
         }
@@ -44,7 +45,7 @@ public void ScoreAll(List<Fact> facts)
 
         // Build lookup for amplifier evaluation (include context facts that amplifiers reference)
         var contextSources = new HashSet<string> { "config", "cpu", "io", "tempdb", "memory", "queries", "perfmon",
-            "database_config", "jobs", "sessions", "disk", "bad_actor" };
+            "database_config", "jobs", "sessions", "disk", "bad_actor", "anomaly" };
         var factsByKey = facts
             .Where(f => f.BaseSeverity > 0 || contextSources.Contains(f.Source))
             .ToDictionary(f => f.Key, f => f);
@@ -300,6 +301,53 @@ private static double ScoreBadActorFact(Fact fact)
         return tierBase * impact;
     }
 
+    /// <summary>
+    /// Scores anomaly facts based on deviation from baseline.
+    /// Deviation-based keys (CPU spike, read/write latency): 2σ → 0.5, 4σ and above → 1.0,
+    /// multiplied by the fact's "confidence" metadata (1.0 when absent).
+    /// Wait anomalies use "ratio": 5x → 0.5, 20x and above → 1.0.
+    /// Blocking/deadlock spikes use "ratio": 3x → 0.5, 10x and above → 1.0.
+    /// </summary>
+    /// <param name="fact">Anomaly fact; its key picks the scale and its metadata supplies the input.</param>
+    /// <returns>0.0 below the scale's floor or for an unrecognized key; otherwise the scaled severity.</returns>
+    private static double ScoreAnomalyFact(Fact fact)
+    {
+        var key = fact.Key;
+
+        // Keys are fixed ASCII identifiers, so compare ordinally rather than with culture rules.
+        if (key.StartsWith("ANOMALY_CPU_SPIKE", StringComparison.Ordinal)
+            || key.StartsWith("ANOMALY_READ_LATENCY", StringComparison.Ordinal)
+            || key.StartsWith("ANOMALY_WRITE_LATENCY", StringComparison.Ordinal))
+        {
+            var deviation = fact.Metadata.GetValueOrDefault("deviation_sigma");
+            var confidence = fact.Metadata.GetValueOrDefault("confidence", 1.0);
+            return RampAnomalySeverity(deviation, floor: 2.0, ceiling: 4.0) * confidence;
+        }
+
+        if (key.StartsWith("ANOMALY_WAIT_", StringComparison.Ordinal))
+        {
+            return RampAnomalySeverity(fact.Metadata.GetValueOrDefault("ratio"), floor: 5.0, ceiling: 20.0);
+        }
+
+        if (key.StartsWith("ANOMALY_BLOCKING_SPIKE", StringComparison.Ordinal)
+            || key.StartsWith("ANOMALY_DEADLOCK_SPIKE", StringComparison.Ordinal))
+        {
+            return RampAnomalySeverity(fact.Metadata.GetValueOrDefault("ratio"), floor: 3.0, ceiling: 10.0);
+        }
+
+        return 0.0;
+    }
+
+    /// <summary>
+    /// Linear ramp shared by the anomaly scales: 0.0 below <paramref name="floor"/>,
+    /// 0.5 at the floor, rising to 1.0 at <paramref name="ceiling"/> and capped there.
+    /// </summary>
+    private static double RampAnomalySeverity(double value, double floor, double ceiling)
+    {
+        if (value < floor) return 0.0;
+        return 0.5 + 0.5 * Math.Min((value - floor) / (ceiling - floor), 1.0);
+    }
+
     /// <summary>
     /// Generic threshold formula used by waits, latency, and count-based metrics.
     /// Critical == null means "concerning only" — hitting concerning = 1.0.
diff --git a/Lite/Analysis/TestDataSeeder.cs b/Lite/Analysis/TestDataSeeder.cs
index f47b82f7..1b29fe91 100644
--- a/Lite/Analysis/TestDataSeeder.cs
+++ b/Lite/Analysis/TestDataSeeder.cs
@@ -32,6 +32,12 @@ public class TestDataSeeder
     public static DateTime TestPeriodStart => _periodEnd.AddHours(-4);
     public static double TestPeriodDurationMs => (TestPeriodEnd - TestPeriodStart).TotalMilliseconds;
 
+    /// <summary>
+    /// Baseline period for anomaly detection: 24 hours before the analysis window.
+    /// </summary>
+    public static DateTime BaselineStart => TestPeriodStart.AddHours(-24);
+    public static DateTime BaselineEnd => TestPeriodStart;
+
     private long _nextId = -1_000_000;
 
     public TestDataSeeder(DuckDbInitializer duckDb)
@@ -503,6 +509,194 @@ await SeedDiskSpaceAsync(
             ("D:\\", 2_000_000, 140_000)); // 7% free — critical
     }
 
+    /// <summary>
+    /// CPU spike anomaly: server normally runs at 10% CPU, then spikes to 95%.
+    /// Baseline: 24h of steady ~10% CPU.
+    /// Analysis window: 4h with 95% peak CPU.
+    ///
+    /// Expected: ANOMALY_CPU_SPIKE with high deviation (~10σ+).
+    /// </summary>
+    public async Task SeedCpuSpikeAnomalyAsync()
+    {
+        // Start from a clean slate so earlier scenarios cannot leak rows into this one.
+        await ClearTestDataAsync();
+        await SeedTestServerAsync();
+
+        // Baseline: 24h of steady low CPU (10% avg, small variance)
+        await SeedCpuUtilizationInRangeAsync(BaselineStart, BaselineEnd, avgCpu: 10, variance: 3, samples: 96);
+
+        // Analysis window: spike to 95%
+        await SeedCpuUtilizationInRangeAsync(TestPeriodStart, TestPeriodEnd, avgCpu: 95, variance: 5, samples: 16);
+
+        // Need basic config for the analysis to run
+        await SeedServerConfigAsync(ctfp: 50, maxdop: 8, maxMemoryMb: 122_880);
+        await SeedMemoryStatsAsync(totalPhysicalMb: 131_072, bufferPoolMb: 100_000, targetMb: 122_880);
+        await SeedFileSizeAsync(totalDataSizeMb: 102_400);
+        await SeedServerEditionAsync(edition: 3, majorVersion: 16);
+        await SeedServerPropertiesAsync(cpuCount: 8, htRatio: 1, physicalMemMb: 131_072);
+    }
+
+    /// <summary>
+    /// Blocking spike anomaly: normally no blocking, then sudden burst.
+    /// Baseline: 24h with 0 blocking events.
+    /// Analysis window: 4h with 50 blocking events and 10 deadlocks.
+    ///
+    /// Expected: ANOMALY_BLOCKING_SPIKE, ANOMALY_DEADLOCK_SPIKE.
+    /// </summary>
+    public async Task SeedBlockingSpikeAnomalyAsync()
+    {
+        await ClearTestDataAsync();
+        await SeedTestServerAsync();
+
+        // Baseline: some normal wait activity (no blocking/deadlocks).
+        // The baseline must contain at least some rows, otherwise anomaly detection is skipped entirely.
+        await SeedWaitStatsInRangeAsync(BaselineStart, BaselineEnd,
+            new Dictionary<string, (long, long, long)>
+            {
+                ["SOS_SCHEDULER_YIELD"] = (100_000, 500_000, 0),
+            }, samples: 24);
+
+        // Analysis window: sudden blocking burst
+        await SeedBlockingEventsAsync(50, avgWaitTimeMs: 15_000, sleepingBlockerCount: 5);
+        await SeedDeadlocksAsync(10);
+
+        // Some lock waits to corroborate
+        var waits = new Dictionary<string, (long, long, long)>
+        {
+            ["LCK_M_X"] = (5_000_000, 200_000, 50_000),
+            ["LCK_M_S"] = (1_000_000, 100_000, 10_000),
+        };
+        await SeedWaitStatsAsync(waits);
+
+        await SeedServerConfigAsync(ctfp: 50, maxdop: 8, maxMemoryMb: 122_880);
+        await SeedMemoryStatsAsync(totalPhysicalMb: 131_072, bufferPoolMb: 100_000, targetMb: 122_880);
+        await SeedFileSizeAsync(totalDataSizeMb: 102_400);
+        await SeedServerEditionAsync(edition: 3, majorVersion: 16);
+        await SeedServerPropertiesAsync(cpuCount: 8, htRatio: 1, physicalMemMb: 131_072);
+        await SeedDatabaseConfigAsync(
+            ("AppDB1", false, false, false, "CHECKSUM"),
+            ("AppDB2", false, false, false, "CHECKSUM"));
+    }
+
+    /// <summary>
+    /// Wait spike anomaly: normally low waits, then sudden PAGEIOLATCH flood.
+    /// Baseline: 24h with minimal PAGEIOLATCH.
+    /// Analysis window: 4h with massive PAGEIOLATCH.
+    ///
+    /// Expected: ANOMALY_WAIT_PAGEIOLATCH_SH with high ratio.
+    /// </summary>
+    public async Task SeedWaitSpikeAnomalyAsync()
+    {
+        await ClearTestDataAsync();
+        await SeedTestServerAsync();
+
+        // Baseline: 24h with minimal PAGEIOLATCH
+        await SeedWaitStatsInRangeAsync(BaselineStart, BaselineEnd,
+            new Dictionary<string, (long, long, long)>
+            {
+                ["PAGEIOLATCH_SH"] = (50_000, 25_000, 1_000),       // 50 seconds over 24h = noise
+                ["SOS_SCHEDULER_YIELD"] = (100_000, 500_000, 0),
+            }, samples: 24);
+
+        // Analysis window: massive PAGEIOLATCH spike
+        var waits = new Dictionary<string, (long, long, long)>
+        {
+            ["PAGEIOLATCH_SH"] = (8_000_000, 4_000_000, 100_000),  // 8 million ms in 4h
+            ["SOS_SCHEDULER_YIELD"] = (100_000, 500_000, 0),       // Normal
+        };
+        await SeedWaitStatsAsync(waits);
+
+        // Basic server configuration, seeded the same way as the other anomaly scenarios.
+        await SeedServerConfigAsync(ctfp: 50, maxdop: 8, maxMemoryMb: 122_880);
+        await SeedMemoryStatsAsync(totalPhysicalMb: 131_072, bufferPoolMb: 100_000, targetMb: 122_880);
+        await SeedFileSizeAsync(totalDataSizeMb: 102_400);
+        await SeedServerEditionAsync(edition: 3, majorVersion: 16);
+        await SeedServerPropertiesAsync(cpuCount: 8, htRatio: 1, physicalMemMb: 131_072);
+    }
+
+    /// <summary>
+    /// Seeds CPU utilization data in a specific time range with variance.
+    /// Used for baseline + spike anomaly scenarios.
+    /// </summary>
+    /// <param name="start">Timestamp of the first sample.</param>
+    /// <param name="end">End of the range; samples are spaced evenly and the last one falls before it.</param>
+    /// <param name="avgCpu">Centre of the generated SQL Server CPU percentage.</param>
+    /// <param name="variance">Maximum distance from <paramref name="avgCpu"/>, in percentage points.</param>
+    /// <param name="samples">Number of rows to insert.</param>
+    internal async Task SeedCpuUtilizationInRangeAsync(DateTime start, DateTime end,
+        int avgCpu, int variance, int samples)
+    {
+        using var readLock = _duckDb.AcquireReadLock();
+        using var connection = _duckDb.CreateConnection();
+        await connection.OpenAsync();
+
+        var stepMinutes = (end - start).TotalMinutes / samples;
+        var random = new Random(42); // Deterministic for reproducibility
+
+        for (var index = 0; index < samples; index++)
+        {
+            var sampleTime = start.AddMinutes(index * stepMinutes);
+            var jitter = random.Next(-variance, variance + 1);
+            var cpuPercent = Math.Clamp(avgCpu + jitter, 0, 100);
+
+            using var insert = connection.CreateCommand();
+            insert.CommandText = @"
+INSERT INTO cpu_utilization_stats
+    (collection_id, collection_time, server_id, server_name,
+     sample_time, sqlserver_cpu_utilization, other_process_cpu_utilization)
+VALUES ($1, $2, $3, $4, $5, $6, $7)";
+
+            // Values in positional order $1..$7; the trailing 2 is the other-process CPU percentage.
+            object[] values =
+            [
+                _nextId--, sampleTime, TestServerId, TestServerName,
+                sampleTime, cpuPercent, 2
+            ];
+            foreach (var value in values)
+            {
+                insert.Parameters.Add(new DuckDBParameter { Value = value });
+            }
+
+            await insert.ExecuteNonQueryAsync();
+        }
+    }
+
+    /// <summary>
+    /// Seeds wait stats in a specific time range. Used for baseline periods
+    /// in anomaly detection scenarios.
+    /// </summary>
+    /// <param name="start">Timestamp of the first sample.</param>
+    /// <param name="end">End of the range; samples are spaced evenly and the last one falls before it.</param>
+    /// <param name="waits">
+    /// Per wait type: total wait time (ms), total waiting tasks and total signal wait (ms) for the range.
+    /// Each total is split evenly across the samples using integer division.
+    /// </param>
+    /// <param name="samples">Number of rows to insert per wait type; must be positive.</param>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="samples"/> is zero or negative.</exception>
+    internal async Task SeedWaitStatsInRangeAsync(DateTime start, DateTime end,
+        Dictionary<string, (long, long, long)> waits, int samples)
+    {
+        // The per-sample split divides by samples; reject bad counts up front instead of
+        // surfacing a DivideByZeroException from the middle of the loop.
+        if (samples <= 0)
+            throw new ArgumentOutOfRangeException(nameof(samples), samples, "Sample count must be positive.");
+
+        using var readLock = _duckDb.AcquireReadLock();
+        using var connection = _duckDb.CreateConnection();
+        await connection.OpenAsync();
+
+        var interval = (end - start).TotalMinutes / samples;
+
+        foreach (var (waitType, (totalWaitTimeMs, totalWaitingTasks, totalSignalMs)) in waits)
+        {
+            var perSampleWaitMs = totalWaitTimeMs / samples;
+            var perSampleTasks = totalWaitingTasks / samples;
+            var perSampleSignal = totalSignalMs / samples;
+
+            for (var i = 0; i < samples; i++)
+            {
+                var t = start.AddMinutes(i * interval);
+
+                using var cmd = connection.CreateCommand();
+                cmd.CommandText = @"
+INSERT INTO wait_stats
+    (collection_id, collection_time, server_id, server_name,
+     wait_type, waiting_tasks_count, wait_time_ms, signal_wait_time_ms,
+     delta_waiting_tasks, delta_wait_time_ms, delta_signal_wait_time_ms)
+VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)";
+
+                cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- });
+                cmd.Parameters.Add(new DuckDBParameter { Value = t });
+                cmd.Parameters.Add(new DuckDBParameter { Value = TestServerId });
+                cmd.Parameters.Add(new DuckDBParameter { Value = TestServerName });
+                cmd.Parameters.Add(new DuckDBParameter { Value = waitType });
+                cmd.Parameters.Add(new DuckDBParameter { Value = totalWaitingTasks });  // cumulative
+                cmd.Parameters.Add(new DuckDBParameter { Value = totalWaitTimeMs });    // cumulative
+                cmd.Parameters.Add(new DuckDBParameter { Value = totalSignalMs });      // cumulative
+                cmd.Parameters.Add(new DuckDBParameter { Value = perSampleTasks });     // delta
+                cmd.Parameters.Add(new DuckDBParameter { Value = perSampleWaitMs });    // delta
+                cmd.Parameters.Add(new DuckDBParameter { Value = perSampleSignal });    // delta
+
+                await cmd.ExecuteNonQueryAsync();
+            }
+        }
+    }
+
     /// <summary>
     /// Removes all test data across all tables.
/// diff --git a/Lite/Mcp/McpAnalysisTools.cs b/Lite/Mcp/McpAnalysisTools.cs index 48c6acbf..9a194fc4 100644 --- a/Lite/Mcp/McpAnalysisTools.cs +++ b/Lite/Mcp/McpAnalysisTools.cs @@ -742,6 +742,30 @@ internal static class ToolRecommendations new("get_running_jobs", "See currently running jobs with duration vs historical"), new("get_cpu_utilization", "Check if long-running jobs are consuming CPU") ], + ["ANOMALY_CPU"] = + [ + new("get_cpu_utilization", "See CPU trend to identify when the spike occurred"), + new("get_active_queries", "Find what queries were running during the spike"), + new("get_top_queries_by_cpu", "Find the most CPU-expensive queries in the period") + ], + ["ANOMALY_WAIT"] = + [ + new("get_wait_stats", "See full wait stats breakdown"), + new("get_wait_trend", "Track the anomalous wait type over time"), + new("compare_analysis", "Compare current vs baseline to see what changed") + ], + ["ANOMALY_BLOCKING"] = + [ + new("get_blocked_process_reports", "Get detailed blocking event reports"), + new("get_deadlocks", "Get recent deadlock events"), + new("get_blocking_trend", "Track blocking frequency over time") + ], + ["ANOMALY_IO"] = + [ + new("get_file_io_stats", "Check per-file I/O latency"), + new("get_file_io_trend", "Track I/O latency over time"), + new("get_memory_stats", "Check if buffer pool is undersized") + ], ["BAD_ACTOR"] = [ new("get_top_queries_by_cpu", "See full query stats for this query"), @@ -769,9 +793,17 @@ public static List GetForStoryPath(string storyPath) { if (!ByFactKey.TryGetValue(key, out var recommendations)) { - // Handle dynamic keys like BAD_ACTOR_0x... 
by checking prefix + // Handle dynamic keys by checking prefix if (key.StartsWith("BAD_ACTOR_")) ByFactKey.TryGetValue("BAD_ACTOR", out recommendations); + else if (key.StartsWith("ANOMALY_CPU")) + ByFactKey.TryGetValue("ANOMALY_CPU", out recommendations); + else if (key.StartsWith("ANOMALY_WAIT_")) + ByFactKey.TryGetValue("ANOMALY_WAIT", out recommendations); + else if (key.StartsWith("ANOMALY_BLOCKING") || key.StartsWith("ANOMALY_DEADLOCK")) + ByFactKey.TryGetValue("ANOMALY_BLOCKING", out recommendations); + else if (key.StartsWith("ANOMALY_READ") || key.StartsWith("ANOMALY_WRITE")) + ByFactKey.TryGetValue("ANOMALY_IO", out recommendations); if (recommendations == null) continue; }