From 026a7af6a5ec159959bee63e7cebd82bafb33e2a Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 23 Apr 2026 14:51:33 -0700 Subject: [PATCH 01/11] fix: drop taichi xdist fork, use stock pytest-xdist + pytest-timeout The taichi fork of pytest-xdist used os._exit(0) in its retire() method, which killed workers before the execnet channel could flush the test failure report. This made all error messages invisible when running with multiple threads. Stock xdist >= 3.4 already handles worker crashes correctly: it preserves the failure report, restarts the worker, and displays full error details in the terminal summary. The conftest hook now just calls os._exit(1) after a failure, and stock xdist does the rest. Changes: - Replace taichi xdist fork with stock pytest-xdist >= 3.7 + pytest-timeout - Simplify conftest pytest_runtest_logreport hook (no fork-specific APIs) - Remove requirements_test_xdist.txt and all CI references to it - Remove pytest_rerunfailures.works_with_current_xdist version hack - Replace pytest_hardtle with pytest-timeout in run_tests.py --- .github/workflows/scripts_new/linux/4_test.sh | 1 - .../workflows/scripts_new/macosx/4_test.sh | 1 - .../scripts_new/manylinux_wheel/5_test.sh | 1 - .github/workflows/scripts_new/win/3_test.ps1 | 1 - .github/workflows/test_gpu.yml | 3 -- pyproject.toml | 4 +-- requirements_test_xdist.txt | 6 ---- tests/python/conftest.py | 30 +++++-------------- tests/run_tests.py | 2 -- 9 files changed, 10 insertions(+), 39 deletions(-) delete mode 100644 requirements_test_xdist.txt diff --git a/.github/workflows/scripts_new/linux/4_test.sh b/.github/workflows/scripts_new/linux/4_test.sh index b707ff68d5..829f2e2bef 100644 --- a/.github/workflows/scripts_new/linux/4_test.sh +++ b/.github/workflows/scripts_new/linux/4_test.sh @@ -3,7 +3,6 @@ set -ex pip install --group test -pip install -r requirements_test_xdist.txt export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime" ./build/quadrants_cpp_tests --gtest_filter=-AMDGPU.* diff --git a/.github/workflows/scripts_new/macosx/4_test.sh b/.github/workflows/scripts_new/macosx/4_test.sh index 4fc44bf330..53ca92f165 100644 --- a/.github/workflows/scripts_new/macosx/4_test.sh +++ b/.github/workflows/scripts_new/macosx/4_test.sh @@ -3,7 +3,6 @@ set -ex pip install --prefer-binary --group test -pip install -r requirements_test_xdist.txt find . -name '*.bc' ls -lh build/ export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime" diff --git a/.github/workflows/scripts_new/manylinux_wheel/5_test.sh b/.github/workflows/scripts_new/manylinux_wheel/5_test.sh index 7d8215d9a8..fa92bc533c 100644 --- a/.github/workflows/scripts_new/manylinux_wheel/5_test.sh +++ b/.github/workflows/scripts_new/manylinux_wheel/5_test.sh @@ -3,7 +3,6 @@ set -ex pip install --group test -pip install -r requirements_test_xdist.txt # Phase 1: run all tests except torch-dependent ones python tests/run_tests.py -v -r 3 -m "not needs_torch" diff --git a/.github/workflows/scripts_new/win/3_test.ps1 b/.github/workflows/scripts_new/win/3_test.ps1 index 597da17b56..1678204f1f 100644 --- a/.github/workflows/scripts_new/win/3_test.ps1 +++ b/.github/workflows/scripts_new/win/3_test.ps1 @@ -6,7 +6,6 @@ python -c 'import gstaichi as ti; ti.init();' $env:QD_LIB_DIR="python/gstaichi/_lib/runtime" Get-ChildItem -Path build -Recurse pip install --group test -pip install -r requirements_test_xdist.txt # Phase 1: run all tests except torch-dependent ones python .\tests\run_tests.py -v -r 3 -m "not needs_torch" diff --git a/.github/workflows/test_gpu.yml b/.github/workflows/test_gpu.yml index c16c3cf748..388c3bd369 100644 --- a/.github/workflows/test_gpu.yml +++ b/.github/workflows/test_gpu.yml @@ -113,7 +113,6 @@ jobs: - name: install test requirements run: | pip install --group test - pip install -r requirements_test_xdist.txt - name: run tests (without torch) run: | python tests/run_tests.py -r 1 -v --arch cuda -m "not needs_torch" @@ -149,7 +148,6 @@ jobs: - name: install test requirements run: | pip install --group test - pip install -r requirements_test_xdist.txt - name: run tests (without torch) run: | python tests/run_tests.py -r 3 -v --arch vulkan -m "not needs_torch" @@ -180,7 +178,6 @@ jobs: - name: install test requirements run: | pip install --group test - pip install -r requirements_test_xdist.txt - name: run tests (without torch) run: | export QD_AMDGPU_V520=1 diff --git a/pyproject.toml b/pyproject.toml index a0a6223d49..37f4301c4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,8 +76,6 @@ dev = [ "ruamel.yaml", ] test = [ - # You also need to: - # pip install -r requirements_test_xdist.txt "Pillow", "pytest", # 16.0 upgrade broke xfail, caused fatal errors, see @@ -85,6 +83,8 @@ test = [ "pytest-rerunfailures<16.0", "pytest-cov", "pytest-retry", + "pytest-xdist>=3.7.0", + "pytest-timeout", "numpy>=2.0.0", # otherwise, on windows, tries to install 1.26.4, which has no wheel for python 3.13 "psutil", "autograd", diff --git a/requirements_test_xdist.txt b/requirements_test_xdist.txt deleted file mode 100644 index 12b246a1bc..0000000000 --- a/requirements_test_xdist.txt +++ /dev/null @@ -1,6 +0,0 @@ -# This URL format is incompatible with pyproject.toml -# taichi created a fork of pytest-xdist, to handle restarting gpu -# workers when they crash, in order to reset the GPU state. -# TODO: come up with some approach that is compatible with pyproject.toml -# Ticketed here: https://linear.app/genesis-ai-company/issue/CMP-141/clean-up-requirements-test-xdisttxt -git+https://github.com/taichi-dev/pytest-xdist@a3b5ad3038#egg=pytest-xdist diff --git a/tests/python/conftest.py b/tests/python/conftest.py index ba2f621e58..69c826388b 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -1,17 +1,11 @@ import gc +import os import sys import pytest -# rerunfailures use xdist version number to determine if it is compatible -# but we are using a forked version of xdist(with git hash as it's version), -# so we need to override it -import pytest_rerunfailures - import quadrants as qd -pytest_rerunfailures.works_with_current_xdist = lambda: True - @pytest.fixture(autouse=True) def run_gc_after_test(): @@ -86,28 +80,20 @@ def pytest_generate_tests(metafunc): @pytest.hookimpl(trylast=True) def pytest_runtest_logreport(report): """ - Intentionally crash test workers when a test fails. - This is to avoid the failing test leaving a corrupted GPU state for the - following tests. + Kill the xdist worker process after a test failure so it restarts with + clean GPU state. Stock xdist (>= 3.4) automatically restarts the worker + and preserves the failure report for the terminal summary. """ + if not hasattr(pytest, "version_tuple"): + return - interactor = getattr(sys, "xdist_interactor", None) - if not interactor: - # not running under xdist, or xdist is not active, - # or using stock xdist (we need a customized version) + if not os.environ.get("PYTEST_XDIST_WORKER"): return if report.outcome not in ("rerun", "error", "failed"): return - layoff = False - - for _, loc, _ in report.longrepr.chain: - if "CUDA_ERROR_OUT_OF_MEMORY" in loc.message: - layoff = True - break - - interactor.retire(layoff=layoff) + os._exit(1) import importlib diff --git a/tests/run_tests.py b/tests/run_tests.py index a454003002..ed90b64fe0 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -51,8 +51,6 @@ def _test_python(args, default_dir="python"): if args.timeout > 0: pytest_args += [ "--durations=15", - "-p", - "pytest_hardtle", f"--timeout={args.timeout}", ] except AttributeError: From 8c83491dc164673dca88a8cc1e6ba89ef44adeba Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 23 Apr 2026 22:20:12 -0700 Subject: [PATCH 02/11] Keep pytest_hardtle for hard-kill timeouts on native GPU hangs Stock pytest-timeout uses Python-level signals that can't interrupt native CUDA/HIP kernel hangs. pytest_hardtle compiles a C watchdog via CFFI that calls _exit(1) from a signal handler, which always works. Restore `-p pytest_hardtle` (with `-p no:timeout` to suppress stock pytest-timeout) and drop pytest-timeout from pyproject.toml. --- pyproject.toml | 2 +- tests/run_tests.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 37f4301c4b..68d798e72a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,7 +84,7 @@ test = [ "pytest-cov", "pytest-retry", "pytest-xdist>=3.7.0", - "pytest-timeout", + "numpy>=2.0.0", # otherwise, on windows, tries to install 1.26.4, which has no wheel for python 3.13 "psutil", "autograd", diff --git a/tests/run_tests.py b/tests/run_tests.py index ed90b64fe0..36059493cc 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -51,6 +51,10 @@ def _test_python(args, default_dir="python"): if args.timeout > 0: pytest_args += [ "--durations=15", + "-p", + "no:timeout", + "-p", + "pytest_hardtle", f"--timeout={args.timeout}", ] except AttributeError: From 76672b1d86ffe908d14e80847acb3cda3a3a31e7 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 23 Apr 2026 22:35:05 -0700 Subject: [PATCH 03/11] Add docstring to pytest_hardtle explaining why it exists Documents why we use a CFFI-based hard-kill timeout instead of stock pytest-timeout, and why the two cannot be loaded simultaneously. --- tests/pytest_hardtle.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/pytest_hardtle.py b/tests/pytest_hardtle.py index 68b6bf72e4..97f65d6ef0 100644 --- a/tests/pytest_hardtle.py +++ b/tests/pytest_hardtle.py @@ -1,4 +1,14 @@ # -*- coding: utf-8 -*- +# +# Hard-kill timeout plugin (drop-in replacement for pytest-timeout). +# Uses CFFI to compile a native C watchdog that calls _exit(1) on timeout. +# Unlike stock pytest-timeout (which uses Python-level SIGALRM handlers), +# this can kill tests that hang inside native CUDA/HIP kernel calls or +# C extensions that don't release the GIL. +# +# Stock pytest-timeout must be suppressed (`-p no:timeout`) when this +# plugin is loaded, because both register the same hook specs and pytest +# will raise a ValueError on the duplicate. # -- stdlib -- import importlib From f82637749e871b1ae90c05563ed1300940e8ef97 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Thu, 23 Apr 2026 23:29:04 -0700 Subject: [PATCH 04/11] Address review feedback: remove dead guard, add clarifying comments - Remove always-true `hasattr(pytest, "version_tuple")` guard in conftest.py (pytest >= 7.0 is guaranteed by dependency constraints). - Add comments in run_tests.py explaining why we suppress stock pytest-timeout and why pytest_hardtle is preferred (CFFI-based native signal handler can kill GIL-held native GPU hangs). --- tests/python/conftest.py | 3 --- tests/run_tests.py | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/python/conftest.py b/tests/python/conftest.py index 69c826388b..4a6b85916e 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -84,9 +84,6 @@ def pytest_runtest_logreport(report): clean GPU state. Stock xdist (>= 3.4) automatically restarts the worker and preserves the failure report for the terminal summary. """ - if not hasattr(pytest, "version_tuple"): - return - if not os.environ.get("PYTEST_XDIST_WORKER"): return diff --git a/tests/run_tests.py b/tests/run_tests.py index 36059493cc..06227f836e 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -51,8 +51,14 @@ def _test_python(args, default_dir="python"): if args.timeout > 0: pytest_args += [ "--durations=15", + # Suppress stock pytest-timeout if installed — it conflicts + # with pytest_hardtle (both register the same hook specs). "-p", "no:timeout", + # pytest_hardtle uses a CFFI-compiled C watchdog that calls + # _exit(1) from a native signal handler, so it can kill tests + # hung in native GPU calls even when the GIL is held. + # Stock pytest-timeout's signal method cannot do this. "-p", "pytest_hardtle", f"--timeout={args.timeout}", From 431197766c4032c31f397e0dcfd81b7a1d50a263 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 24 Apr 2026 00:19:37 -0700 Subject: [PATCH 05/11] Fix double-counted failures and crash-cap shutdown with stock xdist Stock xdist treats os._exit(1) as a worker crash, which causes: 1. A synthetic "worker crashed" report duplicating the real failure 2. Each exit counting toward --max-worker-restart, eventually shutting down the session and silently dropping remaining tests Fix both by: - Adding a pytest_handlecrashitem hook (firstresult=True) that marks the synthetic crash report as passed, since the real report was already sent before the worker exited - Setting --max-worker-restart=999999 so intentional worker kills don't trigger xdist's crash-cap shutdown --- tests/python/conftest.py | 20 ++++++++++++++++++-- tests/run_tests.py | 11 ++++++++++- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/tests/python/conftest.py b/tests/python/conftest.py index 4a6b85916e..4631434dc3 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -81,8 +81,10 @@ def pytest_generate_tests(metafunc): def pytest_runtest_logreport(report): """ Kill the xdist worker process after a test failure so it restarts with - clean GPU state. Stock xdist (>= 3.4) automatically restarts the worker - and preserves the failure report for the terminal summary. + clean GPU state. The real test report is sent by xdist's own hook + (which runs before this trylast hook) before we exit. The controller's + pytest_handlecrashitem hook below suppresses the synthetic "worker + crashed" duplicate. """ if not os.environ.get("PYTEST_XDIST_WORKER"): return @@ -93,6 +95,20 @@ def pytest_runtest_logreport(report): os._exit(1) +def pytest_handlecrashitem(crashitem, report, sched): + """Suppress the synthetic 'worker crashed while running ...' report. + + When pytest_runtest_logreport above kills a worker via os._exit(1), + stock xdist treats it as a crash and synthesizes a duplicate failure + report. The real report was already sent before the exit, so we + mark the synthetic one as passed to keep it out of the failure summary. + This hook is firstresult=True in xdist, so returning here prevents + the default handler from running. + """ + report.outcome = "passed" + report.longrepr = None + + import importlib import sys diff --git a/tests/run_tests.py b/tests/run_tests.py index 06227f836e..f477021e00 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -81,7 +81,16 @@ def _test_python(args, default_dir="python"): print(f"Due to how pytest-xdist is implemented, the -s option does not work with multiple thread...") else: if int(threads) > 1: - pytest_args += ["-n", str(threads), "--dist=worksteal"] + # We intentionally kill workers on test failure (see conftest.py) + # to reset GPU state. Stock xdist counts each kill toward + # --max-worker-restart and shuts down the session when the cap is + # reached, so we set a very high cap to prevent that. + pytest_args += [ + "-n", + str(threads), + "--dist=worksteal", + "--max-worker-restart=999999", + ] import pytest # pylint: disable=C0415 return int(pytest.main(pytest_args)) From fbd787f30d3dbe1012d5a547db7063e40a4bf9af Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 24 Apr 2026 00:29:21 -0700 Subject: [PATCH 06/11] Add tests for xdist worker retirement hooks Uses pytester to verify in a subprocess that: - pytest_handlecrashitem suppresses synthetic "worker crashed" duplicates (failures counted exactly once, not doubled) - --max-worker-restart=999999 prevents premature session shutdown when many tests fail and workers are intentionally killed Also fix pytest_handlecrashitem to set report.when="teardown" so the suppressed crash report doesn't inflate the passed count. --- tests/python/conftest.py | 1 + tests/test_xdist_worker_retirement.py | 124 ++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 tests/test_xdist_worker_retirement.py diff --git a/tests/python/conftest.py b/tests/python/conftest.py index 4631434dc3..925ff4b32b 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -106,6 +106,7 @@ def pytest_handlecrashitem(crashitem, report, sched): the default handler from running. """ report.outcome = "passed" + report.when = "teardown" report.longrepr = None diff --git a/tests/test_xdist_worker_retirement.py b/tests/test_xdist_worker_retirement.py new file mode 100644 index 0000000000..960a6d120a --- /dev/null +++ b/tests/test_xdist_worker_retirement.py @@ -0,0 +1,124 @@ +"""Tests for the xdist worker retirement hooks in conftest.py. + +Verifies that when a worker is killed via os._exit(1) after a test failure: +1. Failures are not double-counted (no synthetic "worker crashed" report) +2. The session completes even with many failures (--max-worker-restart cap + does not trigger premature shutdown) + +These tests use pytester to run pytest-xdist in a subprocess, so they do +not require GPU hardware. +""" + +import pytest + +pytest_plugins = ["pytester"] + +SUBPROCESS_ARGS = [ + "-p", + "no:retry", + "-p", + "no:rerunfailures", + "-p", + "no:nbmake", + "-p", + "no:timeout", + "-p", + "no:cacheprovider", + "-o", + "addopts=", +] + + +@pytest.fixture +def xdist_project(pytester): + """Write a minimal conftest that reproduces our worker-retirement hooks.""" + pytester.makeconftest( + """ + import os + import pytest + + @pytest.hookimpl(trylast=True) + def pytest_runtest_logreport(report): + if not os.environ.get("PYTEST_XDIST_WORKER"): + return + if report.outcome not in ("error", "failed"): + return + os._exit(1) + + def pytest_handlecrashitem(crashitem, report, sched): + report.outcome = "passed" + report.when = "teardown" + report.longrepr = None + """ + ) + return pytester + + +class TestNoDuplicateFailures: + def test_single_failure_counted_once(self, xdist_project): + """A single failing test should appear exactly once in the summary.""" + xdist_project.makepyfile( + """ + def test_pass(): + pass + + def test_fail(): + assert False, "intentional failure" + """ + ) + result = xdist_project.runpytest_subprocess( + "-n", "2", "--dist=worksteal", *SUBPROCESS_ARGS, "-v" + ) + result.assert_outcomes(passed=1, failed=1) + + def test_multiple_failures_counted_correctly(self, xdist_project): + """Each failing test should be counted exactly once.""" + xdist_project.makepyfile( + """ + import pytest + + @pytest.mark.parametrize("i", range(4)) + def test_fail(i): + assert False, f"failure {i}" + + def test_pass(): + pass + """ + ) + result = xdist_project.runpytest_subprocess( + "-n", + "2", + "--dist=worksteal", + "--max-worker-restart=999999", + *SUBPROCESS_ARGS, + "-v", + ) + result.assert_outcomes(passed=1, failed=4) + + +class TestSessionCompletesWithManyFailures: + def test_no_premature_shutdown(self, xdist_project): + """With a high --max-worker-restart, all tests should run even if many fail.""" + xdist_project.makepyfile( + """ + import pytest + + @pytest.mark.parametrize("i", range(20)) + def test_fail(i): + assert False, f"failure {i}" + + @pytest.mark.parametrize("i", range(5)) + def test_pass(i): + pass + """ + ) + result = xdist_project.runpytest_subprocess( + "-n", + "2", + "--dist=worksteal", + "--max-worker-restart=999999", + *SUBPROCESS_ARGS, + "-v", + ) + result.assert_outcomes(passed=5, failed=20) + assert "maximum crashed workers reached" not in result.stdout.str() From 2353f511c13e8491b4da3592306478b2683c9abb Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Fri, 24 Apr 2026 00:34:18 -0700 Subject: [PATCH 07/11] style: fix black formatting in test_xdist_worker_retirement.py Made-with: Cursor --- tests/test_xdist_worker_retirement.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_xdist_worker_retirement.py b/tests/test_xdist_worker_retirement.py index 960a6d120a..30730d8b4f 100644 --- a/tests/test_xdist_worker_retirement.py +++ b/tests/test_xdist_worker_retirement.py @@ -66,9 +66,7 @@ def test_fail(): assert False, "intentional failure" """ ) - result = xdist_project.runpytest_subprocess( - "-n", "2", "--dist=worksteal", *SUBPROCESS_ARGS, "-v" - ) + result = xdist_project.runpytest_subprocess("-n", "2", "--dist=worksteal", *SUBPROCESS_ARGS, "-v") result.assert_outcomes(passed=1, failed=1) def test_multiple_failures_counted_correctly(self, xdist_project): From b135892ed8e9488a10b817a698d7e757729b8112 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 27 Apr 2026 13:55:55 -0700 Subject: [PATCH 08/11] [Test] Fix pytest_handlecrashitem to distinguish intentional exits from genuine crashes The previous implementation blindly suppressed all crash reports, causing: 1. Double-counted failures (1 failed + 1 passed for the same test) 2. Genuine crashes (segfaults, OOM) silently marked as passed Use a marker-file protocol: workers write a marker before os._exit(1), and the controller only suppresses crash reports when a marker is found. A wrapper hookimpl on pytest_runtest_logreport swallows the suppressed synthetic report so it never reaches the terminal reporter or stats. --- tests/python/conftest.py | 98 +++++++++++++++++++++------ tests/test_xdist_worker_retirement.py | 51 +++++++++++--- 2 files changed, 119 insertions(+), 30 deletions(-) diff --git a/tests/python/conftest.py b/tests/python/conftest.py index 2fbe4c6345..bf189aa5fc 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -1,6 +1,7 @@ import gc import os import sys +import tempfile import pytest @@ -112,37 +113,90 @@ def pytest_generate_tests(metafunc): metafunc.parametrize("req_arch,req_options", [(None, None)], ids=["none"]) -@pytest.hookimpl(trylast=True) -def pytest_runtest_logreport(report): - """ - Kill the xdist worker process after a test failure so it restarts with - clean GPU state. The real test report is sent by xdist's own hook - (which runs before this trylast hook) before we exit. The controller's - pytest_handlecrashitem hook below suppresses the synthetic "worker - crashed" duplicate. +def _exit_marker_dir(): + """Temp directory shared between xdist controller and workers for intentional-exit markers.""" + return os.environ.get("_QD_XDIST_EXIT_MARKER_DIR") + + +def pytest_configure(config): + """On the xdist controller, create a temp directory for intentional-exit markers. + + Workers inherit the ``_QD_XDIST_EXIT_MARKER_DIR`` env var and use the same directory. """ - if not os.environ.get("PYTEST_XDIST_WORKER"): + if os.environ.get("PYTEST_XDIST_WORKER"): + return + if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"): return + d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}") + os.makedirs(d, exist_ok=True) + os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d - if report.outcome not in ("rerun", "error", "failed"): + +def pytest_unconfigure(config): + """Clean up the marker directory at session end.""" + if os.environ.get("PYTEST_XDIST_WORKER"): return + d = _exit_marker_dir() + if d and os.path.isdir(d): + import shutil + + shutil.rmtree(d, ignore_errors=True) + + +@pytest.hookimpl(wrapper=True, tryfirst=True) +def pytest_runtest_logreport(report): + """Handle xdist worker retirement and crash-report suppression. - os._exit(1) + On the controller: swallow synthetic crash reports that were already marked for suppression by + pytest_handlecrashitem. + + On workers: after a test failure, write an intentional-exit marker and kill the process so it + restarts with clean GPU state. The real test report is sent by inner hooks (including xdist's + report-forwarding hook) during ``yield`` before we exit. + """ + if getattr(report, "_qd_suppress", False): + return None + + result = yield + + if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"): + d = _exit_marker_dir() + if d: + worker_id = os.environ["PYTEST_XDIST_WORKER"] + try: + with open(os.path.join(d, worker_id), "w") as f: + f.write(report.nodeid) + except OSError: + pass + os._exit(1) + + return result def pytest_handlecrashitem(crashitem, report, sched): - """Suppress the synthetic 'worker crashed while running ...' report. - - When pytest_runtest_logreport above kills a worker via os._exit(1), - stock xdist treats it as a crash and synthesizes a duplicate failure - report. The real report was already sent before the exit, so we - mark the synthetic one as passed to keep it out of the failure summary. - This hook is firstresult=True in xdist, so returning here prevents - the default handler from running. + """Suppress the synthetic crash report only for intentional ``os._exit(1)`` exits. + + When a worker is killed intentionally (to reset GPU state after a failure), it writes a marker + file before exiting. If the marker exists, we flag the synthetic report for suppression and + return a truthy value to stop the firstresult hook chain. Genuine crashes (segfaults, OOM, + etc.) have no marker, so their reports pass through unmodified. """ - report.outcome = "passed" - report.when = "teardown" - report.longrepr = None + d = _exit_marker_dir() + if not d: + return + node = getattr(report, "node", None) + if not node: + return + worker_id = node.gateway.id + marker = os.path.join(d, worker_id) + if not os.path.exists(marker): + return + try: + os.unlink(marker) + except OSError: + pass + report._qd_suppress = True + return True import importlib diff --git a/tests/test_xdist_worker_retirement.py b/tests/test_xdist_worker_retirement.py index 30730d8b4f..e91ad431fb 100644 --- a/tests/test_xdist_worker_retirement.py +++ b/tests/test_xdist_worker_retirement.py @@ -35,20 +35,55 @@ def xdist_project(pytester): pytester.makeconftest( """ import os + import tempfile import pytest - @pytest.hookimpl(trylast=True) - def pytest_runtest_logreport(report): - if not os.environ.get("PYTEST_XDIST_WORKER"): + def _exit_marker_dir(): + return os.environ.get("_QD_XDIST_EXIT_MARKER_DIR") + + def pytest_configure(config): + if os.environ.get("PYTEST_XDIST_WORKER"): return - if report.outcome not in ("error", "failed"): + if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"): return - os._exit(1) + d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}") + os.makedirs(d, exist_ok=True) + os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d + + @pytest.hookimpl(wrapper=True, tryfirst=True) + def pytest_runtest_logreport(report): + if getattr(report, "_qd_suppress", False): + return None + result = yield + if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("error", "failed"): + d = _exit_marker_dir() + if d: + worker_id = os.environ["PYTEST_XDIST_WORKER"] + try: + with open(os.path.join(d, worker_id), "w") as f: + f.write(report.nodeid) + except OSError: + pass + os._exit(1) + return result def pytest_handlecrashitem(crashitem, report, sched): - report.outcome = "passed" - report.when = "teardown" - report.longrepr = None + d = _exit_marker_dir() + if not d: + return + node = getattr(report, "node", None) + if not node: + return + worker_id = node.gateway.id + marker = os.path.join(d, worker_id) + if not os.path.exists(marker): + return + try: + os.unlink(marker) + except OSError: + pass + report._qd_suppress = True + return True """ ) return pytester From 7fa8f34f7036e0685e21d278f580e52868e6401c Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 27 Apr 2026 13:56:16 -0700 Subject: [PATCH 09/11] [Test] Move xdist worker retirement tests into tests/python/ for CI discovery CI runs pytest rooted at tests/python/ via run_tests.py, so tests placed as siblings of that directory are never collected. --- tests/{ => python}/test_xdist_worker_retirement.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{ => python}/test_xdist_worker_retirement.py (100%) diff --git a/tests/test_xdist_worker_retirement.py b/tests/python/test_xdist_worker_retirement.py similarity index 100% rename from tests/test_xdist_worker_retirement.py rename to tests/python/test_xdist_worker_retirement.py From 55dac704788bb30e9530638cc092e94c18386e83 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 27 Apr 2026 14:02:33 -0700 Subject: [PATCH 10/11] [Test] Remove duplicate import sys in conftest.py The top-level import at line 3 already covers all usages. --- tests/python/conftest.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python/conftest.py b/tests/python/conftest.py index bf189aa5fc..6b9f302204 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -200,9 +200,6 @@ def pytest_handlecrashitem(crashitem, report, sched): import importlib -import sys - -import pytest @pytest.fixture From f63261e64a2b5b1967623152d2a320a566cf7773 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 27 Apr 2026 14:12:00 -0700 Subject: [PATCH 11/11] [CI] Remove stale requirements_test_xdist.txt reference from linux.yml; scrub PYTEST_XDIST_WORKER in pytester fixture to prevent env var leak The linux.yml test-cuda job still referenced the deleted requirements_test_xdist.txt. Also, pytester propagates the outer worker's PYTEST_XDIST_WORKER into inner subprocesses, causing the inner controller to misidentify itself as a worker and os._exit(1). --- .github/workflows/linux.yml | 1 - tests/python/test_xdist_worker_retirement.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index abca17810f..fd71349f45 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -86,7 +86,6 @@ jobs: - name: Install test requirements run: | pip install --group test - pip install -r requirements_test_xdist.txt - name: Run CUDA tests with coverage run: | bash .github/workflows/scripts_new/linux/4_test_cuda.sh diff --git a/tests/python/test_xdist_worker_retirement.py b/tests/python/test_xdist_worker_retirement.py index e91ad431fb..c471db3a2e 100644 --- a/tests/python/test_xdist_worker_retirement.py +++ b/tests/python/test_xdist_worker_retirement.py @@ -30,8 +30,9 @@ @pytest.fixture -def xdist_project(pytester): +def xdist_project(pytester, monkeypatch): """Write a minimal conftest that reproduces our worker-retirement hooks.""" + monkeypatch.delenv("PYTEST_XDIST_WORKER", raising=False) pytester.makeconftest( """ import os