From 3e9cf89996a2d144d72f7f30bcde9c56d0a6ea31 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Thu, 7 Mar 2024 12:04:46 +0000 Subject: [PATCH 1/3] Weigh gilknocker Prometheus metric by duration --- distributed/dashboard/components/scheduler.py | 7 ++++--- distributed/http/scheduler/prometheus/core.py | 3 ++- .../http/scheduler/tests/test_scheduler_http.py | 2 +- distributed/http/tests/test_core.py | 4 ++-- distributed/http/worker/prometheus/core.py | 3 ++- .../http/worker/tests/test_worker_http.py | 2 +- distributed/system_monitor.py | 13 ++++++------- docs/source/prometheus.rst | 16 ++++++++++------ 8 files changed, 28 insertions(+), 22 deletions(-) diff --git a/distributed/dashboard/components/scheduler.py b/distributed/dashboard/components/scheduler.py index b9412168234..3d4e9b6fc5c 100644 --- a/distributed/dashboard/components/scheduler.py +++ b/distributed/dashboard/components/scheduler.py @@ -3911,7 +3911,6 @@ def __init__(self, scheduler, **kwargs): @log_errors def update(self): s = self.scheduler - monitor_gil = s.monitor.monitor_gil_contention self.data["values"] = [ s._tick_interval_observed, @@ -3919,11 +3918,13 @@ def update(self): sum(w.metrics["event_loop_interval"] for w in s.workers.values()) / (len(s.workers) or 1), self.gil_contention_workers, - ][:: 1 if monitor_gil else 2] + ][:: 1 if s.monitor.monitor_gil_contention else 2] # Format event loop as time and GIL (if configured) as % self.data["text"] = [ - f"{x * 100:.1f}%" if i % 2 and monitor_gil else format_time(x) + f"{x * 100:.1f}%" + if i % 2 and s.monitor.monitor_gil_contention + else format_time(x) for i, x in enumerate(self.data["values"]) ] update(self.source, self.data) diff --git a/distributed/http/scheduler/prometheus/core.py b/distributed/http/scheduler/prometheus/core.py index adea703f50e..b3ac06b6b22 100644 --- a/distributed/http/scheduler/prometheus/core.py +++ b/distributed/http/scheduler/prometheus/core.py @@ -56,7 +56,8 @@ def collect(self) -> Iterator[GaugeMetricFamily | CounterMetricFamily]: yield CounterMetricFamily( self.build_name("gil_contention"), "GIL contention metric", - value=self.server.monitor._cumulative_gil_contention, + value=self.server.monitor.cumulative_gil_contention, + unit="seconds", ) yield CounterMetricFamily( diff --git a/distributed/http/scheduler/tests/test_scheduler_http.py b/distributed/http/scheduler/tests/test_scheduler_http.py index d51ddf33ead..25ede716fbf 100644 --- a/distributed/http/scheduler/tests/test_scheduler_http.py +++ b/distributed/http/scheduler/tests/test_scheduler_http.py @@ -132,7 +132,7 @@ async def test_prometheus(c, s, a, b): except ImportError: pass # pragma: nocover else: - expected_metrics.add("dask_scheduler_gil_contention") + expected_metrics.add("dask_scheduler_gil_contention_seconds") assert set(active_metrics.keys()) == expected_metrics assert active_metrics["dask_scheduler_clients"].samples[0].value == 1.0 diff --git a/distributed/http/tests/test_core.py b/distributed/http/tests/test_core.py index 7f43d581156..3bccaf11ce7 100644 --- a/distributed/http/tests/test_core.py +++ b/distributed/http/tests/test_core.py @@ -73,8 +73,8 @@ async def test_prometheus_api_doc(c, s, a, _): gil_metrics = set() # Already in worker_metrics except ImportError: gil_metrics = { - "dask_scheduler_gil_contention_total", - "dask_worker_gil_contention_total", + "dask_scheduler_gil_contention_seconds_total", + "dask_worker_gil_contention_seconds_total", } implemented = scheduler_metrics | worker_metrics | crick_metrics | gil_metrics diff --git a/distributed/http/worker/prometheus/core.py b/distributed/http/worker/prometheus/core.py index f62027cf475..5354ee521c0 100644 --- a/distributed/http/worker/prometheus/core.py +++ b/distributed/http/worker/prometheus/core.py @@ -63,7 +63,8 @@ def collect(self) -> Iterator[Metric]: yield CounterMetricFamily( self.build_name("gil_contention"), "GIL contention metric", - value=self.server.monitor._cumulative_gil_contention, + value=self.server.monitor.cumulative_gil_contention, + unit="seconds", ) yield GaugeMetricFamily( diff --git a/distributed/http/worker/tests/test_worker_http.py b/distributed/http/worker/tests/test_worker_http.py index 36213589dd2..8fc56662bf2 100644 --- a/distributed/http/worker/tests/test_worker_http.py +++ b/distributed/http/worker/tests/test_worker_http.py @@ -59,7 +59,7 @@ async def test_prometheus(c, s, a): except ImportError: pass # pragma: nocover else: - expected_metrics.add("dask_worker_gil_contention_total") + expected_metrics.add("dask_worker_gil_contention_seconds_total") try: import crick # noqa: F401 diff --git a/distributed/system_monitor.py b/distributed/system_monitor.py index bccce514ca7..c3f3cec1d1a 100644 --- a/distributed/system_monitor.py +++ b/distributed/system_monitor.py @@ -31,7 +31,7 @@ class SystemMonitor: _last_host_cpu_counters: Any # dynamically-defined psutil namedtuple _last_gil_contention: float # 0-1 value - _cumulative_gil_contention: float + cumulative_gil_contention: float gpu_name: str | None gpu_memory_total: int @@ -114,12 +114,11 @@ def __init__( self.monitor_gil_contention = False else: self.quantities["gil_contention"] = deque(maxlen=maxlen) - self._cumulative_gil_contention = 0.0 + self.cumulative_gil_contention = 0.0 raw_interval = dask.config.get( "distributed.admin.system-monitor.gil.interval", ) - interval = parse_timedelta(raw_interval, default="us") * 1e6 - + interval = parse_timedelta(raw_interval) * 1e6 self._gilknocker = KnockKnock(polling_interval_micros=int(interval)) self._gilknocker.start() @@ -197,10 +196,10 @@ def update(self) -> dict[str, Any]: self._last_host_cpu_counters = host_cpu if self.monitor_gil_contention: - self._last_gil_contention = self._gilknocker.contention_metric - self._cumulative_gil_contention += self._last_gil_contention - result["gil_contention"] = self._last_gil_contention + gil_contention = self._gilknocker.contention_metric self._gilknocker.reset_contention_metric() + result["gil_contention"] = self._last_gil_contention = gil_contention + self.cumulative_gil_contention += duration * gil_contention # Note: WINDOWS constant doesn't work with `mypy --platform win32` if sys.platform != "win32": diff --git a/docs/source/prometheus.rst b/docs/source/prometheus.rst index 8a97993ed72..042e45cd6c7 100644 --- a/docs/source/prometheus.rst +++ b/docs/source/prometheus.rst @@ -26,9 +26,11 @@ dask_scheduler_clients Number of clients connected dask_scheduler_desired_workers Number of workers scheduler needs for task graph -dask_scheduler_gil_contention_total - Value representing cumulative total of GIL contention, - in the form of summed percentages. +dask_scheduler_gil_contention_seconds_total + Value representing cumulative total of _potential_ GIL contention, + in the form of cumulative seconds during which a thread held the GIL locked. + Other threads may or may not have been actually trying to acquire the GIL in the + meantime. .. note:: Requires ``gilknocker`` to be installed, and @@ -128,9 +130,11 @@ dask_worker_tasks Number of tasks at worker dask_worker_threads Number of worker threads -dask_worker_gil_contention_total - Value representing cumulative total GIL contention on worker, - in the form of summed percentages. +dask_worker_gil_contention_seconds_total + Value representing cumulative total of _potential_ GIL contention, + in the form of cumulative seconds during which a thread held the GIL locked. + Other threads may or may not have been actually trying to acquire the GIL in the + meantime. .. note:: Requires ``gilknocker`` to be installed, and From 91d31486d66d04c86ad424d74681498ec2eda473 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Thu, 7 Mar 2024 12:50:28 +0000 Subject: [PATCH 2/3] Fix italics --- docs/source/prometheus.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/prometheus.rst b/docs/source/prometheus.rst index 042e45cd6c7..9dfe8c6e358 100644 --- a/docs/source/prometheus.rst +++ b/docs/source/prometheus.rst @@ -27,7 +27,7 @@ dask_scheduler_clients dask_scheduler_desired_workers Number of workers scheduler needs for task graph dask_scheduler_gil_contention_seconds_total - Value representing cumulative total of _potential_ GIL contention, + Value representing cumulative total of *potential* GIL contention, in the form of cumulative seconds during which a thread held the GIL locked. Other threads may or may not have been actually trying to acquire the GIL in the meantime. @@ -131,7 +131,7 @@ dask_worker_tasks dask_worker_threads Number of worker threads dask_worker_gil_contention_seconds_total - Value representing cumulative total of _potential_ GIL contention, + Value representing cumulative total of *potential* GIL contention, in the form of cumulative seconds during which a thread held the GIL locked. Other threads may or may not have been actually trying to acquire the GIL in the meantime. From 0ce28be10ef828cac65767ae21417d417431141f Mon Sep 17 00:00:00 2001 From: crusaderky Date: Thu, 7 Mar 2024 17:03:44 +0000 Subject: [PATCH 3/3] nit --- docs/source/prometheus.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/prometheus.rst b/docs/source/prometheus.rst index 9dfe8c6e358..8ca3165cd49 100644 --- a/docs/source/prometheus.rst +++ b/docs/source/prometheus.rst @@ -28,7 +28,7 @@ dask_scheduler_desired_workers Number of workers scheduler needs for task graph dask_scheduler_gil_contention_seconds_total Value representing cumulative total of *potential* GIL contention, - in the form of cumulative seconds during which a thread held the GIL locked. + in the form of cumulative seconds during which any thread held the GIL locked. Other threads may or may not have been actually trying to acquire the GIL in the meantime. @@ -132,7 +132,7 @@ dask_worker_threads Number of worker threads dask_worker_gil_contention_seconds_total Value representing cumulative total of *potential* GIL contention, - in the form of cumulative seconds during which a thread held the GIL locked. + in the form of cumulative seconds during which any thread held the GIL locked. Other threads may or may not have been actually trying to acquire the GIL in the meantime.