From f971e7c7e0e47ffc1f2871c8aaf5c8bed8e992b2 Mon Sep 17 00:00:00 2001
From: Gabe Joseph
Date: Fri, 23 Jul 2021 15:37:12 -0800
Subject: [PATCH] Short-circuit root-ish check for many deps

While looking into https://github.com/dask/distributed/issues/5083 I
happened to notice that the dashboard felt very sluggish. I profiled
with py-spy and discovered that the scheduler was spending 20% of
runtime calculating `sum(map(len, group._dependencies)) < 5`! A quick
print statement showed some task groups depended on 25,728 other groups
(each of size 1). We can easily skip those.

I originally had this conditional in
https://github.com/dask/distributed/pull/4967 but we removed it for
simplicity:
https://github.com/dask/distributed/pull/4967#discussion_r661479904;
turns out it was relevant after all!
---
 distributed/scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/distributed/scheduler.py b/distributed/scheduler.py
index 75c6fe259be..2ba9f124161 100644
--- a/distributed/scheduler.py
+++ b/distributed/scheduler.py
@@ -2485,6 +2485,7 @@ def decide_worker(self, ts: TaskState) -> WorkerState:
         if (
             valid_workers is None
             and len(group) > self._total_nthreads * 2
+            and len(group._dependencies) < 5
             and sum(map(len, group._dependencies)) < 5
         ):
             ws: WorkerState = group._last_worker