diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index abca17810f..fd71349f45 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -86,7 +86,6 @@ jobs:
       - name: Install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: Run CUDA tests with coverage
         run: |
           bash .github/workflows/scripts_new/linux/4_test_cuda.sh
diff --git a/.github/workflows/scripts_new/linux/4_test.sh b/.github/workflows/scripts_new/linux/4_test.sh
index 630dc34783..5b7b604011 100644
--- a/.github/workflows/scripts_new/linux/4_test.sh
+++ b/.github/workflows/scripts_new/linux/4_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --group test
-pip install -r requirements_test_xdist.txt
 
 export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
 ./build/quadrants_cpp_tests --gtest_filter=-AMDGPU.*
diff --git a/.github/workflows/scripts_new/macosx/4_test.sh b/.github/workflows/scripts_new/macosx/4_test.sh
index 71037e2471..d5fa680770 100644
--- a/.github/workflows/scripts_new/macosx/4_test.sh
+++ b/.github/workflows/scripts_new/macosx/4_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --prefer-binary --group test
-pip install -r requirements_test_xdist.txt
 find . -name '*.bc'
 ls -lh build/
 export QD_LIB_DIR="$(python -c 'import quadrants as ti; print(ti.__path__[0])' | tail -n 1)/_lib/runtime"
diff --git a/.github/workflows/scripts_new/manylinux_wheel/5_test.sh b/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
index b14b9d7778..f045c2ccc7 100644
--- a/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
+++ b/.github/workflows/scripts_new/manylinux_wheel/5_test.sh
@@ -3,7 +3,6 @@
 set -ex
 
 pip install --group test
-pip install -r requirements_test_xdist.txt
 
 # Phase 1: run all tests except torch-dependent ones
 python tests/run_tests.py -v -r 1 -m "not needs_torch"
diff --git a/.github/workflows/scripts_new/win/3_test.ps1 b/.github/workflows/scripts_new/win/3_test.ps1
index c7eae72395..9ebc71e437 100644
--- a/.github/workflows/scripts_new/win/3_test.ps1
+++ b/.github/workflows/scripts_new/win/3_test.ps1
@@ -6,7 +6,6 @@
 python -c 'import gstaichi as ti; ti.init();'
 $env:QD_LIB_DIR="python/gstaichi/_lib/runtime"
 Get-ChildItem -Path build -Recurse
 pip install --group test
-pip install -r requirements_test_xdist.txt
 # Phase 1: run all tests except torch-dependent ones
 python .\tests\run_tests.py -v -r 1 -m "not needs_torch"
diff --git a/.github/workflows/test_gpu.yml b/.github/workflows/test_gpu.yml
index d98ff9f013..28fef6774b 100644
--- a/.github/workflows/test_gpu.yml
+++ b/.github/workflows/test_gpu.yml
@@ -113,7 +113,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           python tests/run_tests.py -r 1 -v --arch cuda -m "not needs_torch"
@@ -158,7 +157,6 @@ jobs:
       - name: install test requirements
         run: |
           pip install --group test
-          pip install -r requirements_test_xdist.txt
       - name: run tests (without torch)
         run: |
           python tests/run_tests.py -r 1 -v --arch vulkan -m "not needs_torch"
@@ -189,7 +187,6 @@ jobs:
      - name: install test requirements
        run: |
          pip install --group test
-         pip install -r requirements_test_xdist.txt
      - name: run tests (without torch)
        run: |
          export QD_AMDGPU_V520=1
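Aside (not part of the patch): every script above now installs test dependencies only via `pip install --group test`, i.e. a PEP 735 dependency group declared in pyproject.toml below (requires a recent pip), which is what lets the forked-xdist requirements file be deleted. A minimal, hypothetical post-install sanity check; the `packaging` import and the `>=3.7.0` floor are assumptions that simply mirror the pin added below:

# Hypothetical sanity check, not part of this patch: after `pip install
# --group test`, upstream pytest-xdist (a normal PEP 440 version) should be
# installed rather than the old taichi-dev fork (which used a git hash as
# its version).
from importlib.metadata import version
from packaging.version import Version

v = version("pytest-xdist")
print(f"pytest-xdist {v}")
# Version() raises InvalidVersion on a non-PEP-440 string such as a bare git
# hash, so this line both enforces the floor and catches a leftover fork.
assert Version(v) >= Version("3.7.0")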
diff --git a/pyproject.toml b/pyproject.toml
index 4c600131e2..0d1b33385f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,8 +76,6 @@ dev = [
     "ruamel.yaml",
 ]
 test = [
-    # You also need to:
-    # pip install -r requirements_test_xdist.txt
     "Pillow",
     "pytest",
     # 16.0 upgrade broke xfail, caused fatal errors, see
@@ -85,6 +83,8 @@ test = [
     "pytest-rerunfailures<16.0",
     "pytest-cov",
     "pytest-retry",
+    "pytest-xdist>=3.7.0",
+    "numpy>=2.0.0", # otherwise, on windows, tries to install 1.26.4, which has no wheel for python 3.13
     "psutil",
     "autograd",
diff --git a/requirements_test_xdist.txt b/requirements_test_xdist.txt
deleted file mode 100644
index 12b246a1bc..0000000000
--- a/requirements_test_xdist.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-# This URL format is incompatible with pyproject.toml
-# taichi created a fork of pytest-xdist, to handle restarting gpu
-# workers when they crash, in order to reset the GPU state.
-# TODO: come up with some approach that is compatible with pyproject.toml
-# Ticketed here: https://linear.app/genesis-ai-company/issue/CMP-141/clean-up-requirements-test-xdisttxt
-git+https://github.com/taichi-dev/pytest-xdist@a3b5ad3038#egg=pytest-xdist
diff --git a/tests/pytest_hardtle.py b/tests/pytest_hardtle.py
index 68b6bf72e4..97f65d6ef0 100644
--- a/tests/pytest_hardtle.py
+++ b/tests/pytest_hardtle.py
@@ -1,4 +1,14 @@
 # -*- coding: utf-8 -*-
+#
+# Hard-kill timeout plugin (drop-in replacement for pytest-timeout).
+# Uses CFFI to compile a native C watchdog that calls _exit(1) on timeout.
+# Unlike stock pytest-timeout (which uses Python-level SIGALRM handlers),
+# this can kill tests that hang inside native CUDA/HIP kernel calls or
+# C extensions that don't release the GIL.
+#
+# Stock pytest-timeout must be suppressed (`-p no:timeout`) when this
+# plugin is loaded, because both register the same hook specs and pytest
+# will raise a ValueError on the duplicate.
 
 # -- stdlib --
 import importlib
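Aside (not part of the patch): a minimal sketch of the native-watchdog mechanism the new header comment describes, assuming CFFI's out-of-line API, a POSIX toolchain, and pthreads. The module and function names here are invented for illustration; the real pytest_hardtle.py additionally wires this into pytest's timeout hooks:

# Hypothetical, simplified illustration of a CFFI-compiled hard-kill watchdog.
# Assumptions: cffi installed, POSIX (pthread/unistd.h), a working C compiler.
import cffi

ffibuilder = cffi.FFI()
ffibuilder.cdef("void start_watchdog(int seconds); void cancel_watchdog(void);")
ffibuilder.set_source(
    "_hard_watchdog_demo",  # invented module name
    r"""
    #include <pthread.h>
    #include <unistd.h>

    static volatile int armed = 0;

    static void *watch(void *arg) {
        sleep((unsigned)(size_t)arg);
        if (armed)
            _exit(1);  /* native exit: fires even if the GIL is held */
        return NULL;
    }

    void start_watchdog(int seconds) {
        pthread_t tid;
        armed = 1;
        pthread_create(&tid, NULL, watch, (void *)(size_t)seconds);
        pthread_detach(tid);
    }

    void cancel_watchdog(void) { armed = 0; }
    """,
    libraries=["pthread"],
)

if __name__ == "__main__":
    ffibuilder.compile(verbose=True)  # builds the _hard_watchdog_demo extension

A consumer would then call `from _hard_watchdog_demo import lib; lib.start_watchdog(seconds)` before each test and `lib.cancel_watchdog()` on completion. Because `_exit(1)` runs on a native thread, it fires even while a hung C extension holds the GIL, which is exactly the case where a Python-level SIGALRM handler never gets a chance to run.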
+ """ + if os.environ.get("PYTEST_XDIST_WORKER"): return + if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"): + return + d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}") + os.makedirs(d, exist_ok=True) + os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d + - if report.outcome not in ("rerun", "error", "failed"): +def pytest_unconfigure(config): + """Clean up the marker directory at session end.""" + if os.environ.get("PYTEST_XDIST_WORKER"): return + d = _exit_marker_dir() + if d and os.path.isdir(d): + import shutil - layoff = False + shutil.rmtree(d, ignore_errors=True) - chain = getattr(getattr(report, "longrepr", None), "chain", None) - if chain: - for _, loc, _ in chain: - msg = getattr(loc, "message", "") if loc else "" - if "CUDA_ERROR_OUT_OF_MEMORY" in msg: - layoff = True - break - # Don't call interactor.retire() — it uses os._exit(0) which kills - # the process before execnet's IO thread can flush the channel buffer. - # The test failure report (queued by xdist's own hook, which ran before - # this trylast hook) would be lost, hiding all error messages. - interactor.sendevent("workerretire", layoff=layoff) - time.sleep(0.2) - os._exit(0) +@pytest.hookimpl(wrapper=True, tryfirst=True) +def pytest_runtest_logreport(report): + """Handle xdist worker retirement and crash-report suppression. + On the controller: swallow synthetic crash reports that were already marked for suppression by + pytest_handlecrashitem. -import importlib -import sys + On workers: after a test failure, write an intentional-exit marker and kill the process so it + restarts with clean GPU state. The real test report is sent by inner hooks (including xdist's + report-forwarding hook) during ``yield`` before we exit. + """ + if getattr(report, "_qd_suppress", False): + return None -import pytest + result = yield + + if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"): + d = _exit_marker_dir() + if d: + worker_id = os.environ["PYTEST_XDIST_WORKER"] + try: + with open(os.path.join(d, worker_id), "w") as f: + f.write(report.nodeid) + except OSError: + pass + os._exit(1) + + return result + + +def pytest_handlecrashitem(crashitem, report, sched): + """Suppress the synthetic crash report only for intentional ``os._exit(1)`` exits. + + When a worker is killed intentionally (to reset GPU state after a failure), it writes a marker + file before exiting. If the marker exists, we flag the synthetic report for suppression and + return a truthy value to stop the firstresult hook chain. Genuine crashes (segfaults, OOM, + etc.) have no marker, so their reports pass through unmodified. + """ + d = _exit_marker_dir() + if not d: + return + node = getattr(report, "node", None) + if not node: + return + worker_id = node.gateway.id + marker = os.path.join(d, worker_id) + if not os.path.exists(marker): + return + try: + os.unlink(marker) + except OSError: + pass + report._qd_suppress = True + return True + + +import importlib @pytest.fixture diff --git a/tests/python/test_xdist_worker_retirement.py b/tests/python/test_xdist_worker_retirement.py new file mode 100644 index 0000000000..c471db3a2e --- /dev/null +++ b/tests/python/test_xdist_worker_retirement.py @@ -0,0 +1,158 @@ +"""Tests for the xdist worker retirement hooks in conftest.py. + +Verifies that when a worker is killed via os._exit(1) after a test failure: +1. Failures are not double-counted (no synthetic "worker crashed" report) +2. 
diff --git a/tests/python/test_xdist_worker_retirement.py b/tests/python/test_xdist_worker_retirement.py
new file mode 100644
index 0000000000..c471db3a2e
--- /dev/null
+++ b/tests/python/test_xdist_worker_retirement.py
@@ -0,0 +1,158 @@
+"""Tests for the xdist worker retirement hooks in conftest.py.
+
+Verifies that when a worker is killed via os._exit(1) after a test failure:
+
+1. Failures are not double-counted (no synthetic "worker crashed" report)
+2. The session completes even with many failures (--max-worker-restart cap
+   does not trigger premature shutdown)
+
+These tests use pytester to run pytest-xdist in a subprocess, so they do
+not require GPU hardware.
+"""
+
+import pytest
+
+pytest_plugins = ["pytester"]
+
+SUBPROCESS_ARGS = [
+    "-p",
+    "no:retry",
+    "-p",
+    "no:rerunfailures",
+    "-p",
+    "no:nbmake",
+    "-p",
+    "no:timeout",
+    "-p",
+    "no:cacheprovider",
+    "-o",
+    "addopts=",
+]
+
+
+@pytest.fixture
+def xdist_project(pytester, monkeypatch):
+    """Write a minimal conftest that reproduces our worker-retirement hooks."""
+    monkeypatch.delenv("PYTEST_XDIST_WORKER", raising=False)
+    pytester.makeconftest(
+        """
+        import os
+        import tempfile
+        import pytest
+
+        def _exit_marker_dir():
+            return os.environ.get("_QD_XDIST_EXIT_MARKER_DIR")
+
+        def pytest_configure(config):
+            if os.environ.get("PYTEST_XDIST_WORKER"):
+                return
+            if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"):
+                return
+            d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}")
+            os.makedirs(d, exist_ok=True)
+            os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d
+
+        @pytest.hookimpl(wrapper=True, tryfirst=True)
+        def pytest_runtest_logreport(report):
+            if getattr(report, "_qd_suppress", False):
+                return None
+            result = yield
+            if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("error", "failed"):
+                d = _exit_marker_dir()
+                if d:
+                    worker_id = os.environ["PYTEST_XDIST_WORKER"]
+                    try:
+                        with open(os.path.join(d, worker_id), "w") as f:
+                            f.write(report.nodeid)
+                    except OSError:
+                        pass
+                    os._exit(1)
+            return result
+
+        def pytest_handlecrashitem(crashitem, report, sched):
+            d = _exit_marker_dir()
+            if not d:
+                return
+            node = getattr(report, "node", None)
+            if not node:
+                return
+            worker_id = node.gateway.id
+            marker = os.path.join(d, worker_id)
+            if not os.path.exists(marker):
+                return
+            try:
+                os.unlink(marker)
+            except OSError:
+                pass
+            report._qd_suppress = True
+            return True
+        """
+    )
+    return pytester
+
+
+class TestNoDuplicateFailures:
+    def test_single_failure_counted_once(self, xdist_project):
+        """A single failing test should appear exactly once in the summary."""
+        xdist_project.makepyfile(
+            """
+            def test_pass():
+                pass
+
+            def test_fail():
+                assert False, "intentional failure"
+            """
+        )
+        result = xdist_project.runpytest_subprocess("-n", "2", "--dist=worksteal", *SUBPROCESS_ARGS, "-v")
+        result.assert_outcomes(passed=1, failed=1)
+
+    def test_multiple_failures_counted_correctly(self, xdist_project):
+        """Each failing test should be counted exactly once."""
+        xdist_project.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("i", range(4))
+            def test_fail(i):
+                assert False, f"failure {i}"
+
+            def test_pass():
+                pass
+            """
+        )
+        result = xdist_project.runpytest_subprocess(
+            "-n",
+            "2",
+            "--dist=worksteal",
+            "--max-worker-restart=999999",
+            *SUBPROCESS_ARGS,
+            "-v",
+        )
+        result.assert_outcomes(passed=1, failed=4)
+
+
+class TestSessionCompletesWithManyFailures:
+    def test_no_premature_shutdown(self, xdist_project):
+        """With a high --max-worker-restart, all tests should run even if many fail."""
+        xdist_project.makepyfile(
+            """
+            import pytest
+
+            @pytest.mark.parametrize("i", range(20))
+            def test_fail(i):
+                assert False, f"failure {i}"
+
+            @pytest.mark.parametrize("i", range(5))
+            def test_pass(i):
+                pass
+            """
+        )
+        result = xdist_project.runpytest_subprocess(
+            "-n",
+            "2",
+            "--dist=worksteal",
+            "--max-worker-restart=999999",
+            *SUBPROCESS_ARGS,
+            "-v",
+        )
+        result.assert_outcomes(passed=5, failed=20)
+        assert "maximum crashed workers reached" not in result.stdout.str()
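Aside (not part of the patch): the suite above only exercises intentional exits. A hypothetical companion test, reusing the `xdist_project` fixture and `SUBPROCESS_ARGS` from the file above, could pin down the other half of the contract: a genuine crash writes no marker file, so `pytest_handlecrashitem` must leave the synthetic failure report visible. Whether `failed=1` is exactly what stock xdist reports for a one-test crash is an assumption here, not something this patch asserts:

# Hypothetical extra test (assumes the fixture and constants defined in
# test_xdist_worker_retirement.py above).
def test_genuine_crash_not_suppressed(xdist_project):
    xdist_project.makepyfile(
        """
        import os

        def test_hard_crash():
            # Simulates a segfault: the worker dies mid-test, so the
            # logreport hook never runs and no marker file is written.
            os._exit(1)
        """
    )
    result = xdist_project.runpytest_subprocess("-n", "1", *SUBPROCESS_ARGS)
    # With no marker file, the synthetic "worker crashed" report should pass
    # through unsuppressed and be counted as a failure.
    result.assert_outcomes(failed=1)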
diff --git a/tests/run_tests.py b/tests/run_tests.py
index da7df93f9a..cf2349933b 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -87,7 +87,16 @@ def _test_python(args, default_dir="python"):
         print(f"Due to how pytest-xdist is implemented, the -s option does not work with multiple thread...")
     else:
         if int(threads) > 1:
-            pytest_args += ["-n", str(threads), "--dist=worksteal"]
+            # We intentionally kill workers on test failure (see conftest.py)
+            # to reset GPU state. Stock xdist counts each kill toward
+            # --max-worker-restart and shuts down the session when the cap is
+            # reached, so we set a very high cap to prevent that.
+            pytest_args += [
+                "-n",
+                str(threads),
+                "--dist=worksteal",
+                "--max-worker-restart=999999",
+            ]
 
     import pytest  # pylint: disable=C0415
     return int(pytest.main(pytest_args))
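Aside (not part of the patch): for threads=4 the branch above builds an invocation equivalent to the sketch below. The "tests/python" target is illustrative only; run_tests.py assembles the full argument list (markers, arch filters, etc.) elsewhere:

# Hypothetical equivalent of what _test_python() hands to pytest for
# threads=4; "tests/python" stands in for the real, longer argument list.
import pytest

pytest_args = [
    "tests/python",
    "-n", "4",                       # four xdist workers
    "--dist=worksteal",              # idle workers steal queued tests
    "--max-worker-restart=999999",   # intentional kills must not end the session
]
raise SystemExit(int(pytest.main(pytest_args)))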