From 5ce2c91c5b336c4726d41a651560745d5d2f590e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 13 Oct 2023 05:08:01 -0700 Subject: [PATCH] Reenable `test_ucx_config_w_env_var` Some time ago `test_ucx_config_w_env_var` started failing intermittently, and the causes were still unknown. After some investigation, it seems that in certain cases exchanging UCX-Py peer information causes some of the underlying communication calls to never complete, and thus causes a hang that Distributed cannot recover from. With https://github.com/rapidsai/ucx-py/pull/994, UCX-Py now has a timeout on those calls that allows Distributed to catch the timeout and retry establishing the connection, which seems to resolve the problem. --- distributed/comm/tests/test_ucx_config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/distributed/comm/tests/test_ucx_config.py b/distributed/comm/tests/test_ucx_config.py index 74eb8a512e4..155a9dbbc65 100644 --- a/distributed/comm/tests/test_ucx_config.py +++ b/distributed/comm/tests/test_ucx_config.py @@ -108,10 +108,9 @@ async def test_ucx_config(ucx_loop, cleanup): assert ucx_environment == {"UCX_MEMTRACK_DEST": "stdout"} -@pytest.mark.xfail(reason="https://github.com/dask/distributed/issues/5229") def test_ucx_config_w_env_var(ucx_loop, cleanup, loop): env = os.environ.copy() - env["DASK_RMM__POOL_SIZE"] = "1000.00 MB" + env["DASK_DISTRIBUTED__RMM__POOL_SIZE"] = "1000.00 MB" port = str(open_port()) # Using localhost appears to be less flaky than {HOST}. Additionally, this is @@ -136,7 +135,7 @@ def test_ucx_config_w_env_var(ucx_loop, cleanup, loop): ], env=env, ): - with Client(sched_addr, loop=loop, timeout=60) as c: + with Client(sched_addr, loop=loop, timeout=30) as c: while not c.scheduler_info()["workers"]: sleep(0.1)