From d129122477d71f7cbb77b38d8d80662f4f8c8da6 Mon Sep 17 00:00:00 2001
From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com>
Date: Fri, 22 May 2026 11:41:35 -0400
Subject: [PATCH] Skip ring-buffer records missing SystemHealth values in CPU
collector (#989)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some RING_BUFFER_SCHEDULER_MONITOR records lack a complete SystemHealth
block, so the ProcessUtilization / SystemIdle XML values extract as NULL.
The Dashboard collector inserts into NOT NULL columns, so a single bad
record fails the whole INSERT atomically. Nothing is ever inserted, so
@max_sample_time stays NULL; every run then rescans the full 7-day window
and re-hits the same bad records — the collector never recovers.
- install/18: extract ProcessUtilization/SystemIdle once via CROSS APPLY,
filter out records where either is NULL. Valid rows now insert,
@max_sample_time advances, recovery is immediate.
- Lite RemoteCollectorService.Cpu.cs: same CROSS APPLY + NULL filter.
Lite's DuckDB columns are nullable so it never hard-failed, but it
stored NULL samples that skew the CPU chart.
Chose to drop malformed records rather than ISNULL(...,0): a fabricated
0 reads as a real "0% CPU" sample and misleads the charts; a record
with no SystemHealth block is not a CPU reading at all.
Verified: installed against SQL2022, collector runs clean; synthetic
test confirms a record with an empty SystemHealth element is filtered out
while a valid record passes.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
Lite/Services/RemoteCollectorService.Cpu.cs | 18 ++++++++++----
install/18_collect_cpu_utilization_stats.sql | 25 ++++++++++++++------
2 files changed, 31 insertions(+), 12 deletions(-)
diff --git a/Lite/Services/RemoteCollectorService.Cpu.cs b/Lite/Services/RemoteCollectorService.Cpu.cs
index a46c0681..fbf3afd9 100644
--- a/Lite/Services/RemoteCollectorService.Cpu.cs
+++ b/Lite/Services/RemoteCollectorService.Cpu.cs
@@ -58,14 +58,12 @@ drs.end_time DESC
SELECT TOP (60)
sample_time = DATEADD(SECOND, -((@ms_ticks - t.timestamp) / 1000), SYSDATETIME()),
- sqlserver_cpu_utilization = t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer'),
+ sqlserver_cpu_utilization = x.process_utilization,
other_process_cpu_utilization =
CASE
- WHEN (100 - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer')
- - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer')) < 0
+ WHEN (100 - x.system_idle - x.process_utilization) < 0
THEN 0
- ELSE 100 - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer')
- - t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer')
+ ELSE 100 - x.system_idle - x.process_utilization
END
FROM
(
@@ -75,6 +73,16 @@ record = CONVERT(xml, dorb.record)
FROM sys.dm_os_ring_buffers AS dorb
WHERE dorb.ring_buffer_type = N'RING_BUFFER_SCHEDULER_MONITOR'
) AS t
+CROSS APPLY
+(
+ SELECT
+ process_utilization = t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer'),
+ system_idle = t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer')
+) AS x
+/* Skip ring-buffer records lacking a complete SystemHealth block — their
+ XML values extract as NULL and would store NULL samples (Issue #989). */
+WHERE x.process_utilization IS NOT NULL
+AND x.system_idle IS NOT NULL
ORDER BY t.timestamp DESC
OPTION(RECOMPILE);";
diff --git a/install/18_collect_cpu_utilization_stats.sql b/install/18_collect_cpu_utilization_stats.sql
index 94857503..5563d4ab 100644
--- a/install/18_collect_cpu_utilization_stats.sql
+++ b/install/18_collect_cpu_utilization_stats.sql
@@ -131,16 +131,12 @@ BEGIN
@start_time
),
sqlserver_cpu_utilization =
- t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer'),
+ x.process_utilization,
other_process_cpu_utilization =
CASE
- WHEN (100 -
- t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer') -
- t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer')) < 0
+ WHEN (100 - x.system_idle - x.process_utilization) < 0
THEN 0
- ELSE 100 -
- t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer') -
- t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer')
+ ELSE 100 - x.system_idle - x.process_utilization
END
FROM
(
@@ -151,12 +147,27 @@ BEGIN
FROM sys.dm_os_ring_buffers AS dorb
WHERE dorb.ring_buffer_type = N'RING_BUFFER_SCHEDULER_MONITOR'
) AS t
+ CROSS APPLY
+ (
+ SELECT
+ process_utilization =
+ t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/ProcessUtilization)[1]', 'integer'),
+ system_idle =
+ t.record.value('(Record/SchedulerMonitorEvent/SystemHealth/SystemIdle)[1]', 'integer')
+ ) AS x
WHERE DATEADD
(
SECOND,
-((@current_ms_ticks - t.timestamp) / 1000),
@start_time
) > ISNULL(@max_sample_time, DATEADD(DAY, -7, @start_time))
+ /*
+ Skip ring-buffer records that lack a complete SystemHealth block —
+ their XML values extract as NULL and would fail the NOT NULL INSERT,
+ breaking collection until the bad records age out (Issue #989).
+ */
+ AND x.process_utilization IS NOT NULL
+ AND x.system_idle IS NOT NULL
ORDER BY
t.timestamp DESC
OPTION(RECOMPILE);