From d129122477d71f7cbb77b38d8d80662f4f8c8da6 Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Fri, 22 May 2026 11:41:35 -0400 Subject: [PATCH] Skip ring-buffer records missing SystemHealth values in CPU collector (#989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some RING_BUFFER_SCHEDULER_MONITOR records lack a complete SystemHealth block, so the ProcessUtilization / SystemIdle XML values extract as NULL. The Dashboard collector inserts into NOT NULL columns, so a single bad record fails the whole INSERT atomically. Nothing is ever inserted, so @max_sample_time stays NULL, every run rescans the full 7-day window and re-hits the same bad records — the collector never recovers. - install/18: extract ProcessUtilization/SystemIdle once via CROSS APPLY, filter out records where either is NULL. Valid rows now insert, @max_sample_time advances, recovery is immediate. - Lite RemoteCollectorService.Cpu.cs: same CROSS APPLY + NULL filter. Lite's DuckDB columns are nullable so it never hard-failed, but it stored NULL samples that skew the CPU chart. Chose to drop malformed records rather than ISNULL(...,0): a fabricated 0 reads as a real "0% CPU" sample and misleads the charts; a record with no SystemHealth block is not a CPU reading at all. Verified: installed against SQL2022, collector runs clean; synthetic test confirms a record with an empty SystemHealth block is filtered out while a valid record passes. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Lite/Services/RemoteCollectorService.Cpu.cs | 18 ++++++++++---- install/18_collect_cpu_utilization_stats.sql | 25 ++++++++++++++------ 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/Lite/Services/RemoteCollectorService.Cpu.cs b/Lite/Services/RemoteCollectorService.Cpu.cs index a46c0681..fbf3afd9 100644 --- a/Lite/Services/RemoteCollectorService.Cpu.cs +++ b/Lite/Services/RemoteCollectorService.Cpu.cs @@ -58,14 +58,12 @@ drs.end_time DESC SELECT TOP (60) sample_time = DATEADD(SECOND, -((@ms_ticks - t.timestamp) / 1000), SYSDATETIME()), - sqlserver_cpu_utilization = t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer'), + sqlserver_cpu_utilization = x.process_utilization, other_process_cpu_utilization = CASE - WHEN (100 - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer') - - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer')) < 0 + WHEN (100 - x.system_idle - x.process_utilization) < 0 THEN 0 - ELSE 100 - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer') - - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer') + ELSE 100 - x.system_idle - x.process_utilization END FROM ( @@ -75,6 +73,16 @@ record = CONVERT(xml, dorb.record) FROM sys.dm_os_ring_buffers AS dorb WHERE dorb.ring_buffer_type = N'RING_BUFFER_SCHEDULER_MONITOR' ) AS t +CROSS APPLY +( + SELECT + process_utilization = t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer'), + system_idle = t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer') +) AS x +/* Skip ring-buffer records lacking a complete SystemHealth block — their + XML values extract as NULL and would store NULL samples (Issue #989). 
*/ +WHERE x.process_utilization IS NOT NULL +AND x.system_idle IS NOT NULL ORDER BY t.timestamp DESC OPTION(RECOMPILE);"; diff --git a/install/18_collect_cpu_utilization_stats.sql b/install/18_collect_cpu_utilization_stats.sql index 94857503..5563d4ab 100644 --- a/install/18_collect_cpu_utilization_stats.sql +++ b/install/18_collect_cpu_utilization_stats.sql @@ -131,16 +131,12 @@ BEGIN @start_time ), sqlserver_cpu_utilization = - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer'), + x.process_utilization, other_process_cpu_utilization = CASE - WHEN (100 - - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer') - - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer')) < 0 + WHEN (100 - x.system_idle - x.process_utilization) < 0 THEN 0 - ELSE 100 - - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer') - - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer') + ELSE 100 - x.system_idle - x.process_utilization END FROM ( @@ -151,12 +147,27 @@ BEGIN FROM sys.dm_os_ring_buffers AS dorb WHERE dorb.ring_buffer_type = N'RING_BUFFER_SCHEDULER_MONITOR' ) AS t + CROSS APPLY + ( + SELECT + process_utilization = + t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer'), + system_idle = + t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer') + ) AS x WHERE DATEADD ( SECOND, -((@current_ms_ticks - t.timestamp) / 1000), @start_time ) > ISNULL(@max_sample_time, DATEADD(DAY, -7, @start_time)) + /* + Skip ring-buffer records that lack a complete SystemHealth block — + their XML values extract as NULL and would fail the NOT NULL INSERT, + breaking collection until the bad records age out (Issue #989). + */ + AND x.process_utilization IS NOT NULL + AND x.system_idle IS NOT NULL ORDER BY t.timestamp DESC OPTION(RECOMPILE);