From 6120c212eb1e29371cca029fbc63bf6cb530b09f Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Thu, 19 Feb 2026 11:21:58 -0500 Subject: [PATCH] Lite incremental collection: query store, CPU dedup, deadlock timing fix (#147) Query store collector now fetches only new runtime_stats intervals since last collection instead of scanning a 60-minute window with CTE/GROUP BY every cycle. CPU collector deduplicates ring buffer samples client-side to insert ~1 row/cycle instead of 60. Deadlock collector moves XML parsing out of DuckDB timing block for accurate metrics. Shared GetLastCollectedTimeAsync helper extracts watermark lookup pattern. Co-Authored-By: Claude Opus 4.6 --- Lite/Services/RemoteCollectorService.Cpu.cs | 43 +++++++++- .../RemoteCollectorService.Deadlocks.cs | 40 ++++------ .../RemoteCollectorService.QueryStore.cs | 79 +++++++------------ Lite/Services/RemoteCollectorService.cs | 25 ++++++ 4 files changed, 111 insertions(+), 76 deletions(-) diff --git a/Lite/Services/RemoteCollectorService.Cpu.cs b/Lite/Services/RemoteCollectorService.Cpu.cs index 85148702..09fd6df5 100644 --- a/Lite/Services/RemoteCollectorService.Cpu.cs +++ b/Lite/Services/RemoteCollectorService.Cpu.cs @@ -78,19 +78,50 @@ FROM sys.dm_os_ring_buffers AS dorb ORDER BY t.timestamp DESC OPTION(RECOMPILE);"; - string query = isAzureSqlDb ? azureSqlDbQuery : ringBufferQuery; - var serverId = GetServerId(server); var collectionTime = DateTime.UtcNow; var rowsCollected = 0; _lastSqlMs = 0; _lastDuckDbMs = 0; + /* Get the most recent sample_time we already have, to skip duplicates. + Ring buffer always returns TOP 60 (computed sample_time can't be filtered server-side). + For Azure SQL DB, we push the filter into the SQL query since end_time is a real column. */ + var lastSampleTime = await GetLastCollectedTimeAsync( + serverId, "cpu_utilization_stats", "sample_time", cancellationToken); + + string query; + if (isAzureSqlDb && lastSampleTime.HasValue) + { + /* Azure SQL DB: filter server-side since end_time is a real column */ + query = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +SELECT + sample_time = drs.end_time, + sqlserver_cpu_utilization = CONVERT(integer, drs.avg_cpu_percent), + other_process_cpu_utilization = 0 +FROM sys.dm_db_resource_stats AS drs +WHERE drs.end_time > @last_sample_time +ORDER BY + drs.end_time DESC +OPTION(RECOMPILE);"; + } + else + { + query = isAzureSqlDb ? azureSqlDbQuery : ringBufferQuery; + } + var sqlSw = Stopwatch.StartNew(); using var sqlConnection = await CreateConnectionAsync(server, cancellationToken); using var command = new SqlCommand(query, sqlConnection); command.CommandTimeout = CommandTimeoutSeconds; + if (isAzureSqlDb && lastSampleTime.HasValue) + { + command.Parameters.Add(new SqlParameter("@last_sample_time", System.Data.SqlDbType.DateTime2) { Value = lastSampleTime.Value }); + } + using var reader = await command.ExecuteReaderAsync(cancellationToken); sqlSw.Stop(); _lastSqlMs = sqlSw.ElapsedMilliseconds; @@ -104,12 +135,18 @@ ORDER BY t.timestamp DESC while (await reader.ReadAsync(cancellationToken)) { + var sampleTime = reader.GetDateTime(0); + + /* Client-side dedup for ring buffer (computed sample_time can't be filtered in SQL) */ + if (!isAzureSqlDb && lastSampleTime.HasValue && sampleTime <= lastSampleTime.Value) + continue; + var row = appender.CreateRow(); row.AppendValue(GenerateCollectionId()) .AppendValue(collectionTime) .AppendValue(serverId) .AppendValue(server.ServerName) - .AppendValue(reader.GetDateTime(0)) + .AppendValue(sampleTime) .AppendValue(reader.IsDBNull(1) ? 0 : reader.GetInt32(1)) .AppendValue(reader.IsDBNull(2) ? 0 : reader.GetInt32(2)) .EndRow(); diff --git a/Lite/Services/RemoteCollectorService.Deadlocks.cs b/Lite/Services/RemoteCollectorService.Deadlocks.cs index 1a76b726..2ff5e9f9 100644 --- a/Lite/Services/RemoteCollectorService.Deadlocks.cs +++ b/Lite/Services/RemoteCollectorService.Deadlocks.cs @@ -13,7 +13,6 @@ using System.Threading; using System.Threading.Tasks; using System.Xml.Linq; -using DuckDB.NET.Data; using Microsoft.Data.SqlClient; using Microsoft.Extensions.Logging; using PerformanceMonitorLite.Models; @@ -358,22 +357,8 @@ WHERE evt.value('(@timestamp)[1]', 'datetime2') > @cutoff_time Pass it to SQL Server so we only fetch events newer than what we've collected. This prevents the same deadlock from being inserted multiple times as it lingers in the ring buffer across collection cycles. */ - DateTime? lastCollectedTime = null; - try - { - using var duckConn = _duckDb.CreateConnection(); - await duckConn.OpenAsync(cancellationToken); - using var cmd = duckConn.CreateCommand(); - cmd.CommandText = "SELECT MAX(deadlock_time) FROM deadlocks WHERE server_id = $1"; - cmd.Parameters.Add(new DuckDBParameter { Value = serverId }); - var result = await cmd.ExecuteScalarAsync(cancellationToken); - if (result is DateTime dt) - lastCollectedTime = dt; - } - catch - { - /* If DuckDB query fails, fall back to default 10-minute window */ - } + var lastCollectedTime = await GetLastCollectedTimeAsync( + serverId, "deadlocks", "deadlock_time", cancellationToken); var sqlSw = Stopwatch.StartNew(); using var sqlConnection = await CreateConnectionAsync(server, cancellationToken); @@ -389,24 +374,33 @@ lingers in the ring buffer across collection cycles. */ sqlSw.Stop(); _lastSqlMs = sqlSw.ElapsedMilliseconds; + /* Read all rows and parse XML before starting DuckDB timing. + ExtractVictimSqlText does XElement.Parse which is expensive + and was previously misattributed as DuckDB time. */ + var deadlockRows = new System.Collections.Generic.List<(DateTime? deadlockTime, string? victimProcessId, string? victimSqlText, string? graphXml)>(); + while (await reader.ReadAsync(cancellationToken)) + { + var victimProcessId = reader.IsDBNull(1) ? null : reader.GetString(1); + var graphXml = reader.IsDBNull(2) ? null : reader.GetString(2); + var victimSqlText = ExtractVictimSqlText(graphXml, victimProcessId); + var deadlockTime = reader.IsDBNull(0) ? (DateTime?)null : reader.GetDateTime(0); + deadlockRows.Add((deadlockTime, victimProcessId, victimSqlText, graphXml)); + } + var duckSw = Stopwatch.StartNew(); using var duckConnection = _duckDb.CreateConnection(); await duckConnection.OpenAsync(cancellationToken); using var appender = duckConnection.CreateAppender("deadlocks"); - while (await reader.ReadAsync(cancellationToken)) + foreach (var (deadlockTime, victimProcessId, victimSqlText, graphXml) in deadlockRows) { - var victimProcessId = reader.IsDBNull(1) ? null : reader.GetString(1); - var graphXml = reader.IsDBNull(2) ? null : reader.GetString(2); - var victimSqlText = ExtractVictimSqlText(graphXml, victimProcessId); - var row = appender.CreateRow(); row.AppendValue(GenerateCollectionId()) .AppendValue(collectionTime) .AppendValue(serverId) .AppendValue(server.ServerName) - .AppendValue(reader.IsDBNull(0) ? (DateTime?)null : reader.GetDateTime(0)) + .AppendValue(deadlockTime) .AppendValue(victimProcessId) .AppendValue(victimSqlText) .AppendValue(graphXml) diff --git a/Lite/Services/RemoteCollectorService.QueryStore.cs b/Lite/Services/RemoteCollectorService.QueryStore.cs index 651643a1..127a3878 100644 --- a/Lite/Services/RemoteCollectorService.QueryStore.cs +++ b/Lite/Services/RemoteCollectorService.QueryStore.cs @@ -46,6 +46,11 @@ ORDER BY d.name _lastSqlMs = 0; _lastDuckDbMs = 0; + /* Incremental: only fetch runtime_stats intervals newer than what we already have */ + var lastCollectedTime = await GetLastCollectedTimeAsync( + serverId, "query_store_stats", "last_execution_time", cancellationToken); + var cutoffTime = lastCollectedTime ?? DateTime.UtcNow.AddMinutes(-60); + var sqlSw = Stopwatch.StartNew(); using var sqlConnection = await CreateConnectionAsync(server, cancellationToken); @@ -72,7 +77,7 @@ ORDER BY d.name using var duckConnection = _duckDb.CreateConnection(); await duckConnection.OpenAsync(cancellationToken); - /* For each database, collect top queries */ + /* For each database, collect new query store intervals since last collection */ foreach (var dbName in databases) { try @@ -81,62 +86,36 @@ ORDER BY d.name EXECUTE [{dbName.Replace("]", "]]")}].sys.sp_executesql N'SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; - ;WITH ranked AS - ( - SELECT TOP (100) - query_id = qsq.query_id, - plan_id = qsp.plan_id, - query_text_id = qsq.query_text_id, - query_hash = qsq.query_hash, - query_plan_hash = qsp.query_plan_hash, - execution_count = SUM(qsrs.count_executions), - avg_duration_ms = CONVERT(decimal(18,2), AVG(qsrs.avg_duration) / 1000.0), - avg_cpu_time_ms = CONVERT(decimal(18,2), AVG(qsrs.avg_cpu_time) / 1000.0), - avg_logical_reads = CONVERT(decimal(18,2), AVG(qsrs.avg_logical_io_reads)), - avg_logical_writes = CONVERT(decimal(18,2), AVG(qsrs.avg_logical_io_writes)), - avg_physical_reads = CONVERT(decimal(18,2), AVG(qsrs.avg_physical_io_reads)), - avg_rowcount = CONVERT(decimal(18,2), AVG(qsrs.avg_rowcount)), - last_execution_time = MAX(qsrs.last_execution_time), - total_duration = SUM(qsrs.avg_duration * qsrs.count_executions) - FROM sys.query_store_query AS qsq - JOIN sys.query_store_plan AS qsp - ON qsp.query_id = qsq.query_id - JOIN sys.query_store_runtime_stats AS qsrs - ON qsrs.plan_id = qsp.plan_id - WHERE qsrs.last_execution_time >= DATEADD(MINUTE, -60, SYSDATETIME()) - GROUP BY - qsq.query_id, - qsp.plan_id, - qsq.query_text_id, - qsq.query_hash, - qsp.query_plan_hash - ORDER BY - total_duration DESC - ) SELECT - r.query_id, - r.plan_id, + query_id = qsq.query_id, + plan_id = qsp.plan_id, query_text = qst.query_sql_text, - query_hash = CONVERT(varchar(64), r.query_hash, 1), - r.execution_count, - r.avg_duration_ms, - r.avg_cpu_time_ms, - r.avg_logical_reads, - r.avg_logical_writes, - r.avg_physical_reads, - r.avg_rowcount, - r.last_execution_time, - query_plan_hash = CONVERT(varchar(64), r.query_plan_hash, 1) - FROM ranked AS r + query_hash = CONVERT(varchar(64), qsq.query_hash, 1), + execution_count = qsrs.count_executions, + avg_duration_ms = CONVERT(decimal(18,2), qsrs.avg_duration / 1000.0), + avg_cpu_time_ms = CONVERT(decimal(18,2), qsrs.avg_cpu_time / 1000.0), + avg_logical_reads = CONVERT(decimal(18,2), qsrs.avg_logical_io_reads), + avg_logical_writes = CONVERT(decimal(18,2), qsrs.avg_logical_io_writes), + avg_physical_reads = CONVERT(decimal(18,2), qsrs.avg_physical_io_reads), + avg_rowcount = CONVERT(decimal(18,2), qsrs.avg_rowcount), + last_execution_time = qsrs.last_execution_time, + query_plan_hash = CONVERT(varchar(64), qsp.query_plan_hash, 1) + FROM sys.query_store_runtime_stats AS qsrs + JOIN sys.query_store_plan AS qsp + ON qsp.plan_id = qsrs.plan_id + JOIN sys.query_store_query AS qsq + ON qsq.query_id = qsp.query_id JOIN sys.query_store_query_text AS qst - ON qst.query_text_id = r.query_text_id - ORDER BY - r.total_duration DESC - OPTION(RECOMPILE);';"; + ON qst.query_text_id = qsq.query_text_id + WHERE qsrs.last_execution_time > @cutoff_time + OPTION(RECOMPILE);', + N'@cutoff_time datetime2(7)', + @cutoff_time;"; sqlSw.Start(); using var qsCommand = new SqlCommand(qsQuery, sqlConnection); qsCommand.CommandTimeout = CommandTimeoutSeconds; + qsCommand.Parameters.Add(new SqlParameter("@cutoff_time", System.Data.SqlDbType.DateTime2) { Value = cutoffTime }); using var reader = await qsCommand.ExecuteReaderAsync(cancellationToken); sqlSw.Stop(); diff --git a/Lite/Services/RemoteCollectorService.cs b/Lite/Services/RemoteCollectorService.cs index ea1e79a2..c2d82093 100644 --- a/Lite/Services/RemoteCollectorService.cs +++ b/Lite/Services/RemoteCollectorService.cs @@ -527,6 +527,31 @@ protected static int GetServerId(ServerConnection server) return GetDeterministicHashCode(server.ServerName); } + /// + /// Gets the most recent value of a timestamp column from DuckDB for incremental collection. + /// Returns null on first run or if the query fails (caller uses a fallback window). + /// + protected async Task GetLastCollectedTimeAsync( + int serverId, string tableName, string columnName, CancellationToken cancellationToken) + { + try + { + using var conn = _duckDb.CreateConnection(); + await conn.OpenAsync(cancellationToken); + using var cmd = conn.CreateCommand(); + cmd.CommandText = $"SELECT MAX({columnName}) FROM {tableName} WHERE server_id = $1"; + cmd.Parameters.Add(new DuckDB.NET.Data.DuckDBParameter { Value = serverId }); + var result = await cmd.ExecuteScalarAsync(cancellationToken); + if (result is DateTime dt) + return dt; + } + catch + { + /* If DuckDB query fails, caller uses fallback window */ + } + return null; + } + /// /// Safely converts a SQL Server float/real value to decimal. /// Returns 0 for Infinity, NaN, or values outside decimal range.