From b7b03b77f7c618baa6f87defdeae70fa0f80b40a Mon Sep 17 00:00:00 2001
From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com>
Date: Thu, 21 May 2026 11:13:49 -0400
Subject: [PATCH] Add plan-regression and parameter-sensitivity triage detectors (Lite)

Adds two automated-triage facts to the Lite analysis engine:

- PARAMETER_SENSITIVITY: detects a single cached plan whose per-execution
  worker time varies wildly (classic parameter sniffing), sourced from
  v_query_stats.
- PLAN_REGRESSION: detects a query whose current plan is materially worse
  than a better plan it previously used, sourced from Query Store
  (v_query_store_stats).

Both emit one aggregate fact scored by magnitude (a lone catastrophic
offender scores high on its own), join the inference graph with forward
and reverse edges so CPU/memory stories reach them as leaf causes, and
have drill-down enrichment plus TestDataSeeder scenarios.

Stage 1 of 2 (detect + triage). Wiring AnalysisFindings into the
notification channels is the committed follow-on workstream.

266/266 Lite tests pass (260 existing + 6 new scenario tests).

Co-Authored-By: Claude Opus 4.7 (1M context) --- Lite.Tests/ScenarioTests.cs | 66 ++++++ Lite/Analysis/DrillDownCollector.cs | 234 +++++++++++++++++++++ Lite/Analysis/DuckDbFactCollector.cs | 294 +++++++++++++++++++++++++++ Lite/Analysis/FactScorer.cs | 77 +++++++ Lite/Analysis/RelationshipGraph.cs | 44 ++++ Lite/Analysis/TestDataSeeder.cs | 116 +++++++++++ 6 files changed, 831 insertions(+) diff --git a/Lite.Tests/ScenarioTests.cs b/Lite.Tests/ScenarioTests.cs index bc764814..8d8856f9 100644 --- a/Lite.Tests/ScenarioTests.cs +++ b/Lite.Tests/ScenarioTests.cs @@ -329,6 +329,72 @@ public async Task EverythingOnFire_BlockingAndDeadlocksPresent() Assert.True(facts["DEADLOCKS"].Severity > 0, "Deadlocks severity should be non-zero"); } + /* ── Parameter Sensitivity ── */ + + [Fact] + public async Task ParameterSensitive_FactCollectedAtHighSeverity() + { + var (stories, facts) = await RunFullPipelineAsync(s => s.SeedParameterSensitiveServerAsync()); + PrintStories("PARAMETER SENSITIVITY", stories); + + Assert.True(facts.ContainsKey("PARAMETER_SENSITIVITY"), "PARAMETER_SENSITIVITY should be collected"); + // Worst ratio ~1000x → base severity 1.0; grant/spill/systemic amplifiers push it higher. + Assert.True(facts["PARAMETER_SENSITIVITY"].Severity >= 1.0, + $"Expected high severity, got {facts["PARAMETER_SENSITIVITY"].Severity:F2}"); + } + + [Fact] + public async Task ParameterSensitive_AppearsInStories() + { + var (stories, _) = await RunFullPipelineAsync(s => s.SeedParameterSensitiveServerAsync()); + Assert.Contains(stories, s => s.Path.Contains("PARAMETER_SENSITIVITY")); + } + + [Fact] + public async Task ParameterSensitive_ThreeOffendersWithDivergenceFlags() + { + var (_, facts) = await RunFullPipelineAsync(s => s.SeedParameterSensitiveServerAsync()); + + var fact = facts["PARAMETER_SENSITIVITY"]; + Assert.Equal(3.0, fact.Metadata["offender_count"]); + // Worst offender: grant ratio ~1024x and spills on some parameter values only. 
+ Assert.Equal(1.0, fact.Metadata["grant_divergence"]); + Assert.Equal(1.0, fact.Metadata["spill_divergence"]); + } + + /* ── Plan Regression ── */ + + [Fact] + public async Task PlanRegression_FactCollectedAtHighSeverity() + { + var (stories, facts) = await RunFullPipelineAsync(s => s.SeedPlanRegressionServerAsync()); + PrintStories("PLAN REGRESSION", stories); + + Assert.True(facts.ContainsKey("PLAN_REGRESSION"), "PLAN_REGRESSION should be collected"); + // Worst factor ~12x is past the critical threshold (10x) → base severity 1.0. + Assert.True(facts["PLAN_REGRESSION"].Severity >= 1.0, + $"Expected high severity, got {facts["PLAN_REGRESSION"].Severity:F2}"); + } + + [Fact] + public async Task PlanRegression_AppearsInStories() + { + var (stories, _) = await RunFullPipelineAsync(s => s.SeedPlanRegressionServerAsync()); + Assert.Contains(stories, s => s.Path.Contains("PLAN_REGRESSION")); + } + + [Fact] + public async Task PlanRegression_WorstFactorIsCpuDriven() + { + var (_, facts) = await RunFullPipelineAsync(s => s.SeedPlanRegressionServerAsync()); + + var fact = facts["PLAN_REGRESSION"]; + // 1.2s vs 100ms CPU = 12x; 1.35s vs 120ms duration = 11.25x → CPU dimension wins. 
+ Assert.Equal(12.0, fact.Metadata["worst_regression_factor"], precision: 1); + Assert.Equal(1.0, fact.Metadata["regressed_dimension"]); // 1 = cpu + Assert.Equal(1.0, fact.Metadata["offender_count"]); + } + /* ── Helper ── */ diff --git a/Lite/Analysis/DrillDownCollector.cs b/Lite/Analysis/DrillDownCollector.cs index b3d515ed..a067428b 100644 --- a/Lite/Analysis/DrillDownCollector.cs +++ b/Lite/Analysis/DrillDownCollector.cs @@ -77,6 +77,12 @@ public async Task EnrichFindingsAsync(List findings, AnalysisCo if (pathKeys.Any(k => k.StartsWith("BAD_ACTOR_", StringComparison.OrdinalIgnoreCase))) await CollectBadActorDetail(finding, context); + if (pathKeys.Contains("PARAMETER_SENSITIVITY")) + await CollectParameterSensitiveQueries(finding, context); + + if (pathKeys.Contains("PLAN_REGRESSION")) + await CollectRegressedQueries(finding, context); + // Plan analysis: for findings with top queries, analyze their cached plans await CollectPlanAnalysis(finding, context); @@ -171,6 +177,234 @@ ORDER BY wait_time_ms DESC finding.DrillDown!["top_blocking_chains"] = items; } + /// + /// Top parameter-sensitive plans behind a PARAMETER_SENSITIVITY finding. + /// Re-runs Detector A's detection (standard analysis window) for the top 5 offenders. 
+ /// + private async Task CollectParameterSensitiveQueries(AnalysisFinding finding, AnalysisContext context) + { + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +WITH latest AS +( + SELECT + database_name, + query_hash, + query_plan_hash, + execution_count, + creation_time, + min_worker_time, + max_worker_time, + min_grant_kb, + max_grant_kb, + min_spills, + max_spills, + query_text, + ROW_NUMBER() OVER + ( + PARTITION BY database_name, query_hash, query_plan_hash + ORDER BY collection_time DESC + ) AS rn + FROM v_query_stats + WHERE server_id = $1 + AND collection_time >= $2 + AND collection_time <= $3 + AND delta_execution_count > 0 +) +SELECT + database_name, + query_hash, + query_plan_hash, + execution_count, + min_worker_time, + max_worker_time, + max_worker_time::DOUBLE / NULLIF(min_worker_time, 0) AS worker_ratio, + max_grant_kb::DOUBLE / NULLIF(min_grant_kb, 0) AS grant_ratio, + CASE WHEN max_spills > 0 AND min_spills = 0 THEN 1 ELSE 0 END AS spill_divergence, + LEFT(query_text, 500) AS query_text +FROM latest +WHERE rn = 1 +AND min_worker_time >= 10000 +AND max_worker_time >= 250000 +AND execution_count >= 20 +AND creation_time <= $2 +AND max_worker_time::DOUBLE / NULLIF(min_worker_time, 0) >= 10 +ORDER BY worker_ratio DESC +LIMIT 5"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + var items = new List(); + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + database = reader.IsDBNull(0) ? "" : reader.GetString(0), + query_hash = reader.IsDBNull(1) ? "" : reader.GetString(1), + query_plan_hash = reader.IsDBNull(2) ? 
"" : reader.GetString(2), + execution_count = reader.IsDBNull(3) ? 0L : Convert.ToInt64(reader.GetValue(3)), + min_worker_time_us = reader.IsDBNull(4) ? 0L : Convert.ToInt64(reader.GetValue(4)), + max_worker_time_us = reader.IsDBNull(5) ? 0L : Convert.ToInt64(reader.GetValue(5)), + worker_ratio = reader.IsDBNull(6) ? 0.0 : Convert.ToDouble(reader.GetValue(6)), + grant_ratio = reader.IsDBNull(7) ? 0.0 : Convert.ToDouble(reader.GetValue(7)), + spills_on_some_inputs = !reader.IsDBNull(8) && Convert.ToInt32(reader.GetValue(8)) == 1, + query_text = reader.IsDBNull(9) ? "" : reader.GetString(9) + }); + } + + if (items.Count > 0) + finding.DrillDown!["parameter_sensitive_queries"] = items; + } + + /// + /// Top regressed queries behind a PLAN_REGRESSION finding. + /// Re-runs Detector B's detection for the top 5 offenders. Uses the same 14-day + /// last_execution_time comparison window as the detector — NOT the standard analysis + /// window — so the days-old "best plan" baseline is present. + /// + private async Task CollectRegressedQueries(AnalysisFinding finding, AnalysisContext context) + { + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +WITH deduped AS +( + SELECT + database_name, + query_id, + plan_id, + query_plan_hash, + execution_count, + avg_cpu_time_us, + avg_duration_us, + last_execution_time, + query_text, + ROW_NUMBER() OVER + ( + PARTITION BY database_name, query_id, plan_id, first_execution_time + ORDER BY collection_time DESC + ) AS rn + FROM v_query_store_stats + WHERE server_id = $1 + AND execution_type_desc = 'Regular' + AND last_execution_time >= $2 +), +plan_agg AS +( + SELECT + database_name, + query_id, + plan_id, + any_value(query_plan_hash) AS query_plan_hash, + any_value(query_text) AS query_text, + SUM(execution_count) AS execs, + SUM(avg_cpu_time_us * execution_count) / 
NULLIF(SUM(execution_count), 0) AS cpu_per_exec, + SUM(avg_duration_us * execution_count) / NULLIF(SUM(execution_count), 0) AS dur_per_exec, + MAX(last_execution_time) AS last_exec + FROM deduped + WHERE rn = 1 + GROUP BY database_name, query_id, plan_id +), +plan_dedup AS +( + SELECT + database_name, + query_id, + query_plan_hash, + any_value(query_text) AS query_text, + SUM(execs) AS execs, + SUM(cpu_per_exec * execs) / NULLIF(SUM(execs), 0) AS cpu_per_exec, + SUM(dur_per_exec * execs) / NULLIF(SUM(execs), 0) AS dur_per_exec, + MAX(last_exec) AS last_exec + FROM plan_agg + GROUP BY database_name, query_id, query_plan_hash + HAVING SUM(execs) >= 25 +), +ranked AS +( + SELECT + *, + ROW_NUMBER() OVER (PARTITION BY database_name, query_id ORDER BY last_exec DESC) AS recency, + ROW_NUMBER() OVER (PARTITION BY database_name, query_id ORDER BY cpu_per_exec ASC) AS cheapness + FROM plan_dedup +), +compared AS +( + SELECT + l.database_name, + l.query_id, + l.query_plan_hash AS latest_plan_hash, + l.cpu_per_exec AS latest_cpu, + l.dur_per_exec AS latest_dur, + b.query_plan_hash AS best_plan_hash, + b.cpu_per_exec AS best_cpu, + b.dur_per_exec AS best_dur, + l.query_text, + GREATEST + ( + l.cpu_per_exec / NULLIF(b.cpu_per_exec, 0), + l.dur_per_exec / NULLIF(b.dur_per_exec, 0) + ) AS regression_factor + FROM ranked AS l + JOIN ranked AS b + ON b.database_name = l.database_name + AND b.query_id = l.query_id + AND b.cheapness = 1 + WHERE l.recency = 1 + AND l.query_plan_hash <> b.query_plan_hash +) +SELECT + database_name, + query_id, + latest_plan_hash, + latest_cpu, + latest_dur, + best_plan_hash, + best_cpu, + best_dur, + regression_factor, + LEFT(query_text, 500) AS query_text +FROM compared +WHERE regression_factor >= 2 +ORDER BY regression_factor DESC +LIMIT 5"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart.AddDays(-14) }); + + var items = new List(); + using var reader 
= await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + items.Add(new + { + database = reader.IsDBNull(0) ? "" : reader.GetString(0), + query_id = reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)), + latest_plan_hash = reader.IsDBNull(2) ? "" : reader.GetString(2), + latest_cpu_per_exec_us = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)), + latest_duration_per_exec_us = reader.IsDBNull(4) ? 0.0 : Convert.ToDouble(reader.GetValue(4)), + best_plan_hash = reader.IsDBNull(5) ? "" : reader.GetString(5), + best_cpu_per_exec_us = reader.IsDBNull(6) ? 0.0 : Convert.ToDouble(reader.GetValue(6)), + best_duration_per_exec_us = reader.IsDBNull(7) ? 0.0 : Convert.ToDouble(reader.GetValue(7)), + regression_factor = reader.IsDBNull(8) ? 0.0 : Convert.ToDouble(reader.GetValue(8)), + query_text = reader.IsDBNull(9) ? "" : reader.GetString(9) + }); + } + + if (items.Count > 0) + finding.DrillDown!["regressed_queries"] = items; + } + private async Task CollectQueriesAtSpike(AnalysisFinding finding, AnalysisContext context) { // Find the peak CPU time, then get queries active within 2 minutes of it diff --git a/Lite/Analysis/DuckDbFactCollector.cs b/Lite/Analysis/DuckDbFactCollector.cs index c366270d..8ab21150 100644 --- a/Lite/Analysis/DuckDbFactCollector.cs +++ b/Lite/Analysis/DuckDbFactCollector.cs @@ -39,6 +39,8 @@ public async Task> CollectFactsAsync(AnalysisContext context) await CollectTempDbFactsAsync(context, facts); await CollectMemoryGrantFactsAsync(context, facts); await CollectQueryStatsFactsAsync(context, facts); + await CollectParameterSensitivityFactsAsync(context, facts); + await CollectPlanRegressionFactsAsync(context, facts); await CollectBadActorFactsAsync(context, facts); await CollectPerfmonFactsAsync(context, facts); await CollectMemoryClerkFactsAsync(context, facts); @@ -754,6 +756,298 @@ FROM v_query_stats catch { /* Table may not exist or have no data */ } } + /// + /// Detects parameter-sensitive cached plans: 
a single query_plan_hash whose + /// per-execution worker time varies wildly — one plan serving very different + /// parameter values. Emits one aggregate PARAMETER_SENSITIVITY fact. + /// Note min_*/max_* are cumulative over the plan's cached lifetime, so the + /// finding means "this plan, active now, has a history of widely varying cost". + /// + private async Task CollectParameterSensitivityFactsAsync(AnalysisContext context, List facts) + { + try + { + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +WITH latest AS +( + SELECT + query_hash, + query_plan_hash, + database_name, + execution_count, + creation_time, + min_worker_time, + max_worker_time, + min_grant_kb, + max_grant_kb, + min_spills, + max_spills, + ROW_NUMBER() OVER + ( + PARTITION BY database_name, query_hash, query_plan_hash + ORDER BY collection_time DESC + ) AS rn + FROM v_query_stats + WHERE server_id = $1 + AND collection_time >= $2 + AND collection_time <= $3 + AND delta_execution_count > 0 +) +SELECT + min_worker_time, + max_worker_time, + max_worker_time::DOUBLE / NULLIF(min_worker_time, 0) AS worker_ratio, + max_grant_kb::DOUBLE / NULLIF(min_grant_kb, 0) AS grant_ratio, + CASE WHEN max_spills > 0 AND min_spills = 0 THEN 1 ELSE 0 END AS spill_divergence +FROM latest +WHERE rn = 1 +AND min_worker_time >= 10000 +AND max_worker_time >= 250000 +AND execution_count >= 20 +AND creation_time <= $2 +AND max_worker_time::DOUBLE / NULLIF(min_worker_time, 0) >= 10 +ORDER BY worker_ratio DESC +LIMIT 20"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeEnd }); + + var offenderCount = 0; + var worstRatio = 0.0; + var worstMinWorker = 0L; + var worstMaxWorker = 0L; + var 
worstGrantRatio = 0.0; + var worstSpillDivergence = 0; + + using var reader = await cmd.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + // Rows arrive ordered by worker_ratio DESC — the first row is the worst offender. + if (offenderCount == 0) + { + worstMinWorker = reader.IsDBNull(0) ? 0L : ToInt64(reader.GetValue(0)); + worstMaxWorker = reader.IsDBNull(1) ? 0L : ToInt64(reader.GetValue(1)); + worstRatio = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)); + worstGrantRatio = reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)); + worstSpillDivergence = reader.IsDBNull(4) ? 0 : Convert.ToInt32(reader.GetValue(4)); + } + offenderCount++; + } + + if (offenderCount == 0) return; + + facts.Add(new Fact + { + Source = "queries", + Key = "PARAMETER_SENSITIVITY", + Value = worstRatio, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["offender_count"] = offenderCount, + ["worst_ratio"] = worstRatio, + ["worst_min_worker_us"] = worstMinWorker, + ["worst_max_worker_us"] = worstMaxWorker, + ["worst_grant_ratio"] = worstGrantRatio, + ["grant_divergence"] = worstGrantRatio >= 5 ? 1 : 0, + ["spill_divergence"] = worstSpillDivergence + } + }); + } + catch { /* Table may not exist or have no data */ } + } + + /// + /// Detects plan regressions: a query whose currently-active plan has per-execution + /// cost >= 2x the best plan that query is known to perform well with. Emits one + /// aggregate PLAN_REGRESSION fact. Sourced from Query Store (v_query_store_stats); + /// no fact when Query Store is not enabled on the monitored databases. + /// Unlike other collectors this windows on last_execution_time (14-day comparison + /// window), NOT collection_time — see plan note. 
+ /// + private async Task CollectPlanRegressionFactsAsync(AnalysisContext context, List facts) + { + try + { + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +WITH deduped AS +( + -- Collapse incremental re-collections of the same open runtime-stats interval: + -- keep only the latest collection_time row per logical interval. + SELECT + database_name, + query_id, + plan_id, + query_plan_hash, + execution_count, + avg_cpu_time_us, + avg_duration_us, + last_execution_time, + is_forced_plan, + force_failure_count, + ROW_NUMBER() OVER + ( + PARTITION BY database_name, query_id, plan_id, first_execution_time + ORDER BY collection_time DESC + ) AS rn + FROM v_query_store_stats + WHERE server_id = $1 + AND execution_type_desc = 'Regular' + AND last_execution_time >= $2 +), +plan_agg AS +( + -- Execution-weighted per-exec cost per plan_id. + SELECT + database_name, + query_id, + plan_id, + any_value(query_plan_hash) AS query_plan_hash, + SUM(execution_count) AS execs, + SUM(avg_cpu_time_us * execution_count) / NULLIF(SUM(execution_count), 0) AS cpu_per_exec, + SUM(avg_duration_us * execution_count) / NULLIF(SUM(execution_count), 0) AS dur_per_exec, + MAX(last_execution_time) AS last_exec, + bool_or(is_forced_plan) AS is_forced_plan, + MAX(force_failure_count) AS force_failure_count + FROM deduped + WHERE rn = 1 + GROUP BY database_name, query_id, plan_id +), +plan_dedup AS +( + -- Collapse plan_ids that share a query_plan_hash (a recompile can produce an + -- identical plan under a new plan_id); keep only plans with enough executions. 
+ SELECT + database_name, + query_id, + query_plan_hash, + SUM(execs) AS execs, + SUM(cpu_per_exec * execs) / NULLIF(SUM(execs), 0) AS cpu_per_exec, + SUM(dur_per_exec * execs) / NULLIF(SUM(execs), 0) AS dur_per_exec, + MAX(last_exec) AS last_exec, + bool_or(is_forced_plan) AS is_forced_plan, + MAX(force_failure_count) AS force_failure_count + FROM plan_agg + GROUP BY database_name, query_id, query_plan_hash + HAVING SUM(execs) >= 25 +), +ranked AS +( + SELECT + *, + ROW_NUMBER() OVER (PARTITION BY database_name, query_id ORDER BY last_exec DESC) AS recency, + ROW_NUMBER() OVER (PARTITION BY database_name, query_id ORDER BY cpu_per_exec ASC) AS cheapness + FROM plan_dedup +), +compared AS +( + -- Latest active plan vs the best-performing plan for the same query. + SELECT + l.query_id, + l.cpu_per_exec AS latest_cpu, + l.dur_per_exec AS latest_dur, + l.is_forced_plan AS latest_is_forced, + l.force_failure_count AS force_failure_count, + b.cpu_per_exec AS best_cpu, + b.dur_per_exec AS best_dur, + GREATEST + ( + l.cpu_per_exec / NULLIF(b.cpu_per_exec, 0), + l.dur_per_exec / NULLIF(b.dur_per_exec, 0) + ) AS regression_factor + FROM ranked AS l + JOIN ranked AS b + ON b.database_name = l.database_name + AND b.query_id = l.query_id + AND b.cheapness = 1 + WHERE l.recency = 1 + AND l.query_plan_hash <> b.query_plan_hash +) +SELECT + query_id, + latest_cpu, + latest_dur, + latest_is_forced, + force_failure_count, + best_cpu, + best_dur, + regression_factor +FROM compared +WHERE regression_factor >= 2 +ORDER BY regression_factor DESC +LIMIT 20"; + + cmd.Parameters.Add(new DuckDBParameter { Value = context.ServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = context.TimeRangeStart.AddDays(-14) }); + + var offenderCount = 0; + var worstFactor = 0.0; + var worstQueryId = 0L; + var worstLatestCpu = 0.0; + var worstBestCpu = 0.0; + var worstDimension = 1; + var worstLatestForced = 0; + var worstForceFailures = 0L; + + using var reader = await cmd.ExecuteReaderAsync(); 
+ while (await reader.ReadAsync()) + { + // Rows arrive ordered by regression_factor DESC — the first row is the worst offender. + if (offenderCount == 0) + { + worstQueryId = reader.IsDBNull(0) ? 0L : ToInt64(reader.GetValue(0)); + var latestCpu = reader.IsDBNull(1) ? 0.0 : Convert.ToDouble(reader.GetValue(1)); + var latestDur = reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)); + worstLatestForced = (!reader.IsDBNull(3) && Convert.ToBoolean(reader.GetValue(3))) ? 1 : 0; + worstForceFailures = reader.IsDBNull(4) ? 0L : ToInt64(reader.GetValue(4)); + var bestCpu = reader.IsDBNull(5) ? 0.0 : Convert.ToDouble(reader.GetValue(5)); + var bestDur = reader.IsDBNull(6) ? 0.0 : Convert.ToDouble(reader.GetValue(6)); + worstFactor = reader.IsDBNull(7) ? 0.0 : Convert.ToDouble(reader.GetValue(7)); + + worstLatestCpu = latestCpu; + worstBestCpu = bestCpu; + var cpuRatio = bestCpu > 0 ? latestCpu / bestCpu : 0.0; + var durRatio = bestDur > 0 ? latestDur / bestDur : 0.0; + worstDimension = cpuRatio >= durRatio ? 1 : 2; // 1 = cpu, 2 = duration + } + offenderCount++; + } + + if (offenderCount == 0) return; + + facts.Add(new Fact + { + Source = "queries", + Key = "PLAN_REGRESSION", + Value = worstFactor, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["offender_count"] = offenderCount, + ["worst_regression_factor"] = worstFactor, + ["worst_query_id"] = worstQueryId, + ["latest_cpu_per_exec_us"] = worstLatestCpu, + ["best_cpu_per_exec_us"] = worstBestCpu, + ["regressed_dimension"] = worstDimension, + ["latest_is_forced"] = worstLatestForced, + ["force_failure_count"] = worstForceFailures + } + }); + } + catch { /* Table may not exist or have no data */ } + } + /// /// Identifies individual queries that are consistently terrible ("bad actors"). 
/// These queries don't necessarily cause server-level symptoms but waste resources diff --git a/Lite/Analysis/FactScorer.cs b/Lite/Analysis/FactScorer.cs index e42b4e9d..13c1044f 100644 --- a/Lite/Analysis/FactScorer.cs +++ b/Lite/Analysis/FactScorer.cs @@ -189,6 +189,11 @@ private static double ScoreQueryFact(Fact fact) "QUERY_SPILLS" => ApplyThresholdFormula(fact.Value, 100, 1000), // High DOP queries: concerning at 5, critical at 20 in the period "QUERY_HIGH_DOP" => ApplyThresholdFormula(fact.Value, 5, 20), + // Parameter sensitivity: worst max/min worker-time ratio. Magnitude-driven — + // concerning at 10x, critical at 100x — so a lone catastrophic plan still scores high. + "PARAMETER_SENSITIVITY" => ApplyThresholdFormula(fact.Value, 10, 100), + // Plan regression: worst per-exec cost factor vs the best plan. Concerning 2x, critical 10x. + "PLAN_REGRESSION" => ApplyThresholdFormula(fact.Value, 2, 10), _ => 0.0 }; } @@ -386,6 +391,8 @@ private static List GetAmplifiers(Fact fact) "IO_WRITE_LATENCY_MS" => IoWriteLatencyAmplifiers(), "MEMORY_GRANT_PENDING" => MemoryGrantAmplifiers(), "QUERY_SPILLS" => QuerySpillAmplifiers(), + "PARAMETER_SENSITIVITY" => ParameterSensitivityAmplifiers(), + "PLAN_REGRESSION" => PlanRegressionAmplifiers(), "PERFMON_PLE" => PleAmplifiers(), "DB_CONFIG" => DbConfigAmplifiers(), "DISK_SPACE" => DiskSpaceAmplifiers(), @@ -393,6 +400,76 @@ private static List GetAmplifiers(Fact fact) }; } + /// + /// PARAMETER_SENSITIVITY: a single plan with wildly varying per-execution cost. + /// Corroborated by grant/spill divergence and memory-grant pressure. 
+ /// + private static List ParameterSensitivityAmplifiers() => + [ + new() + { + Description = "Three or more sensitive plans — systemic parameter-sniffing problem", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("PARAMETER_SENSITIVITY", out var f) + && f.Metadata.GetValueOrDefault("offender_count") >= 3 + }, + new() + { + Description = "Memory grant varies with the plan — classic sniffing fingerprint", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("PARAMETER_SENSITIVITY", out var f) + && f.Metadata.GetValueOrDefault("grant_divergence") > 0 + }, + new() + { + Description = "Worst plan spills on some parameter values but not others", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("PARAMETER_SENSITIVITY", out var f) + && f.Metadata.GetValueOrDefault("spill_divergence") > 0 + }, + new() + { + Description = "Memory grant pressure present — sensitive plans competing for grants", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("MEMORY_GRANT_PENDING", out var f) && f.BaseSeverity > 0 + } + ]; + + /// + /// PLAN_REGRESSION: a query running a worse plan than one it performed well with. + /// Corroborated by a failing forced plan and by CPU pressure. 
+ /// + private static List PlanRegressionAmplifiers() => + [ + new() + { + Description = "Three or more regressed queries — systemic plan-choice instability", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("PLAN_REGRESSION", out var f) + && f.Metadata.GetValueOrDefault("offender_count") >= 3 + }, + new() + { + Description = "Worst regression is on a forced plan that is failing to apply", + Boost = 0.4, + Predicate = facts => facts.TryGetValue("PLAN_REGRESSION", out var f) + && f.Metadata.GetValueOrDefault("latest_is_forced") > 0 + && f.Metadata.GetValueOrDefault("force_failure_count") > 0 + }, + new() + { + Description = "CPU spike present — regressed plan likely driving it", + Boost = 0.25, + Predicate = facts => facts.TryGetValue("CPU_SPIKE", out var f) && f.BaseSeverity > 0 + }, + new() + { + Description = "SQL Server CPU elevated — regressed plan contributing", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("CPU_SQL_PERCENT", out var f) && f.BaseSeverity > 0 + } + ]; + /// /// SOS_SCHEDULER_YIELD: CPU starvation confirmed by parallelism waits. /// More amplifiers added when config and CPU utilization facts are available. 
diff --git a/Lite/Analysis/RelationshipGraph.cs b/Lite/Analysis/RelationshipGraph.cs index 11ab6a3a..b1c1d248 100644 --- a/Lite/Analysis/RelationshipGraph.cs +++ b/Lite/Analysis/RelationshipGraph.cs @@ -132,6 +132,21 @@ private void BuildCpuPressureEdges() AddEdge("CPU_SPIKE", "CXPACKET", "cpu_spike", "Parallelism waits — parallel queries contributing to CPU spike", facts => HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.3); + + // CPU_SPIKE → PLAN_REGRESSION (spike explained by a regressed plan) + AddEdge("CPU_SPIKE", "PLAN_REGRESSION", "cpu_spike", + "Plan regression present — a query is running a worse plan than before", + facts => HasFact(facts, "PLAN_REGRESSION") && facts["PLAN_REGRESSION"].BaseSeverity > 0); + + // CPU_SQL_PERCENT → PLAN_REGRESSION (sustained CPU explained by a regressed plan) + AddEdge("CPU_SQL_PERCENT", "PLAN_REGRESSION", "cpu_pressure", + "Plan regression present — a query is running a worse plan than before", + facts => HasFact(facts, "PLAN_REGRESSION") && facts["PLAN_REGRESSION"].BaseSeverity > 0); + + // CPU_SPIKE → PARAMETER_SENSITIVITY (spike explained by a parameter-sensitive plan) + AddEdge("CPU_SPIKE", "PARAMETER_SENSITIVITY", "cpu_spike", + "Parameter-sensitive plan present — one plan running expensively for some inputs", + facts => HasFact(facts, "PARAMETER_SENSITIVITY") && facts["PARAMETER_SENSITIVITY"].BaseSeverity > 0); } /* ── Memory Pressure ── */ @@ -182,6 +197,11 @@ private void BuildMemoryPressureEdges() AddEdge("PAGEIOLATCH_EX", "IO_READ_LATENCY_MS", "memory_pressure", "Read latency elevated — disk confirms buffer pool pressure", facts => HasFact(facts, "IO_READ_LATENCY_MS") && facts["IO_READ_LATENCY_MS"].BaseSeverity > 0); + + // MEMORY_GRANT_PENDING → PARAMETER_SENSITIVITY (grant pressure traced to a sensitive plan) + AddEdge("MEMORY_GRANT_PENDING", "PARAMETER_SENSITIVITY", "memory_grants", + "Parameter-sensitive plan present — its grant varies wildly with its inputs", + facts => HasFact(facts, 
"PARAMETER_SENSITIVITY") && facts["PARAMETER_SENSITIVITY"].BaseSeverity > 0); } /* ── Blocking & Deadlocking ── */ @@ -316,6 +336,30 @@ private void BuildQueryEdges() AddEdge("QUERY_HIGH_DOP", "SOS_SCHEDULER_YIELD", "query_performance", "Scheduler yields — high-DOP queries saturating CPU", facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Severity >= 0.5); + + // PARAMETER_SENSITIVITY → MEMORY_GRANT_PENDING (sensitive plan's grant varies — grant pressure) + AddEdge("PARAMETER_SENSITIVITY", "MEMORY_GRANT_PENDING", "query_performance", + "Memory grant waiters — a parameter-sensitive plan's grant varies with its inputs", + facts => HasFact(facts, "MEMORY_GRANT_PENDING") && facts["MEMORY_GRANT_PENDING"].BaseSeverity > 0 + && facts.TryGetValue("PARAMETER_SENSITIVITY", out var ps) + && ps.Metadata.GetValueOrDefault("grant_divergence") > 0); + + // PARAMETER_SENSITIVITY → QUERY_SPILLS (sensitive plan spills on some parameter values) + AddEdge("PARAMETER_SENSITIVITY", "QUERY_SPILLS", "query_performance", + "Query spills — a parameter-sensitive plan spills on some parameter values", + facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0 + && facts.TryGetValue("PARAMETER_SENSITIVITY", out var ps) + && ps.Metadata.GetValueOrDefault("spill_divergence") > 0); + + // PLAN_REGRESSION → CPU_SQL_PERCENT (regressed plan driving CPU load) + AddEdge("PLAN_REGRESSION", "CPU_SQL_PERCENT", "query_performance", + "SQL Server CPU elevated — the regressed plan is burning CPU", + facts => HasFact(facts, "CPU_SQL_PERCENT") && facts["CPU_SQL_PERCENT"].BaseSeverity > 0); + + // PLAN_REGRESSION → CPU_SPIKE (regressed plan caused a CPU spike) + AddEdge("PLAN_REGRESSION", "CPU_SPIKE", "query_performance", + "CPU spike — the regressed plan is burning CPU", + facts => HasFact(facts, "CPU_SPIKE") && facts["CPU_SPIKE"].BaseSeverity > 0); } private static bool HasFact(IReadOnlyDictionary facts, string key) diff --git 
a/Lite/Analysis/TestDataSeeder.cs b/Lite/Analysis/TestDataSeeder.cs index e4241a09..4dbf4653 100644 --- a/Lite/Analysis/TestDataSeeder.cs +++ b/Lite/Analysis/TestDataSeeder.cs @@ -429,6 +429,122 @@ await SeedIoLatencyAsync(totalReads: 800_000, stallReadMs: 28_000_000, // 35ms a await SeedServerPropertiesAsync(cpuCount: 16, htRatio: 2, physicalMemMb: 65_536); } + /// <summary> + /// Parameter-sensitive server: three cached plans whose per-execution worker time + /// varies wildly — one plan serving very different parameter values. The worst is a + /// ~1000x spread, also diverging on memory grant and on spills. + /// + /// Expected: a PARAMETER_SENSITIVITY finding. The worst ratio (1000x) drives + /// BaseSeverity to 1.0 on its own; offender_count = 3 fires the "systemic" amplifier. + /// </summary> + public async Task SeedParameterSensitiveServerAsync() + { + await ClearTestDataAsync(); + await SeedTestServerAsync(); + + // Hash, executions, min/max worker time (us), min/max grant (kb), min/max spills. + var plans = new (string Hash, long Execs, long MinWorker, long MaxWorker, + long MinGrant, long MaxGrant, long MinSpills, long MaxSpills)[] + { + ("0xSENS0001", 5_000, 20_000, 20_000_000, 1_024, 1_048_576, 0, 50), // ~1000x — catastrophic + ("0xSENS0002", 1_000, 50_000, 2_000_000, 4_096, 8_192, 0, 0), // ~40x + ("0xSENS0003", 200, 15_000, 300_000, 2_048, 2_048, 0, 0), // ~20x + }; + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + for (var i = 0; i < plans.Length; i++) + { + var p = plans[i]; + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +INSERT INTO query_stats + (collection_id, collection_time, server_id, server_name, database_name, + query_hash, query_plan_hash, creation_time, execution_count, + min_worker_time, max_worker_time, min_grant_kb, max_grant_kb, + min_spills, max_spills, query_text, delta_execution_count) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, 
$12, $13, $14, $15, $16, $17)"; + + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = TestPeriodStart.AddMinutes(30 + i * 20) }); + cmd.Parameters.Add(new DuckDBParameter { Value = TestServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = TestServerName }); + cmd.Parameters.Add(new DuckDBParameter { Value = "SensitiveDb" }); + cmd.Parameters.Add(new DuckDBParameter { Value = $"0xQH{i:D6}" }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.Hash }); + cmd.Parameters.Add(new DuckDBParameter { Value = TestPeriodStart.AddDays(-3) }); // compiled before window + cmd.Parameters.Add(new DuckDBParameter { Value = p.Execs }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.MinWorker }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.MaxWorker }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.MinGrant }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.MaxGrant }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.MinSpills }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.MaxSpills }); + cmd.Parameters.Add(new DuckDBParameter { Value = $"SELECT * FROM dbo.SensitiveQuery{i} WHERE col = @p" }); + cmd.Parameters.Add(new DuckDBParameter { Value = 500L }); // delta_execution_count > 0 — active in window + await cmd.ExecuteNonQueryAsync(); + } + } + + /// <summary> + /// Plan-regression server: one query (query_id 101) that switched from a fast plan + /// to a plan ~12x more expensive per execution. Sourced from Query Store. + /// + /// Expected: a PLAN_REGRESSION finding. The worst factor (12x) is past the critical + /// threshold so BaseSeverity reaches 1.0. + /// </summary> + public async Task SeedPlanRegressionServerAsync() + { + await ClearTestDataAsync(); + await SeedTestServerAsync(); + + // plan_id, query_plan_hash, avg cpu (us), avg duration (us), last exec, first exec. 
+ var plans = new (long PlanId, string Hash, long AvgCpu, long AvgDur, + DateTime LastExec, DateTime FirstExec)[] + { + (1, "0xGOODPLAN", 100_000, 120_000, TestPeriodStart.AddDays(-5), TestPeriodStart.AddDays(-6)), + (2, "0xBADPLAN", 1_200_000, 1_350_000, TestPeriodEnd, TestPeriodStart.AddDays(-1)), + }; + + using var readLock = _duckDb.AcquireReadLock(); + using var connection = _duckDb.CreateConnection(); + await connection.OpenAsync(); + + foreach (var p in plans) + { + using var cmd = connection.CreateCommand(); + cmd.CommandText = @" +INSERT INTO query_store_stats + (collection_id, collection_time, server_id, server_name, database_name, + query_id, plan_id, execution_type_desc, first_execution_time, last_execution_time, + query_text, query_hash, execution_count, avg_cpu_time_us, avg_duration_us, + query_plan_hash, is_forced_plan, force_failure_count) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)"; + + cmd.Parameters.Add(new DuckDBParameter { Value = _nextId-- }); + cmd.Parameters.Add(new DuckDBParameter { Value = TestPeriodEnd }); + cmd.Parameters.Add(new DuckDBParameter { Value = TestServerId }); + cmd.Parameters.Add(new DuckDBParameter { Value = TestServerName }); + cmd.Parameters.Add(new DuckDBParameter { Value = "RegressionDb" }); + cmd.Parameters.Add(new DuckDBParameter { Value = 101L }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.PlanId }); + cmd.Parameters.Add(new DuckDBParameter { Value = "Regular" }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.FirstExec }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.LastExec }); + cmd.Parameters.Add(new DuckDBParameter { Value = "SELECT * FROM dbo.Orders WHERE CustomerId = @id" }); + cmd.Parameters.Add(new DuckDBParameter { Value = "0xREGRESSQH" }); + cmd.Parameters.Add(new DuckDBParameter { Value = 100L }); // executions per plan — above the 25 floor + cmd.Parameters.Add(new DuckDBParameter { Value = p.AvgCpu }); + cmd.Parameters.Add(new 
DuckDBParameter { Value = p.AvgDur }); + cmd.Parameters.Add(new DuckDBParameter { Value = p.Hash }); + cmd.Parameters.Add(new DuckDBParameter { Value = false }); + cmd.Parameters.Add(new DuckDBParameter { Value = 0L }); + await cmd.ExecuteNonQueryAsync(); + } + } + /// /// Everything on fire: multiple high-severity categories competing. /// Memory pressure, CPU pressure, parallelism, lock contention, log writes.