-
Notifications
You must be signed in to change notification settings - Fork 17
[Test] Drop taichi xdist fork, use stock pytest-xdist #556
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
026a7af
ff62e25
55ebcb7
8c83491
76672b1
f826377
4311977
fbd787f
2353f51
efc7d4b
b135892
7fa8f34
55dac70
f63261e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,19 +1,12 @@ | ||
| import gc | ||
| import os | ||
| import sys | ||
|
claude[bot] marked this conversation as resolved.
|
||
| import time | ||
| import tempfile | ||
|
|
||
| import pytest | ||
|
|
||
| # rerunfailures use xdist version number to determine if it is compatible | ||
| # but we are using a forked version of xdist(with git hash as it's version), | ||
| # so we need to override it | ||
| import pytest_rerunfailures | ||
|
|
||
| import quadrants as qd | ||
|
|
||
| pytest_rerunfailures.works_with_current_xdist = lambda: True | ||
|
|
||
|
|
||
| @pytest.fixture(autouse=True) | ||
| def run_gc_after_test(): | ||
|
|
@@ -120,43 +113,93 @@ def pytest_generate_tests(metafunc): | |
| metafunc.parametrize("req_arch,req_options", [(None, None)], ids=["none"]) | ||
|
|
||
|
|
||
| @pytest.hookimpl(trylast=True) | ||
| def pytest_runtest_logreport(report): | ||
| """ | ||
| Retire test workers when a test fails, to avoid the failing test | ||
| leaving a corrupted GPU state for the following tests. | ||
| """ | ||
| def _exit_marker_dir(): | ||
| """Temp directory shared between xdist controller and workers for intentional-exit markers.""" | ||
| return os.environ.get("_QD_XDIST_EXIT_MARKER_DIR") | ||
|
|
||
|
|
||
| interactor = getattr(sys, "xdist_interactor", None) | ||
| if not interactor: | ||
| def pytest_configure(config): | ||
| """On the xdist controller, create a temp directory for intentional-exit markers. | ||
|
|
||
| Workers inherit the ``_QD_XDIST_EXIT_MARKER_DIR`` env var and use the same directory. | ||
| """ | ||
| if os.environ.get("PYTEST_XDIST_WORKER"): | ||
| return | ||
| if os.environ.get("_QD_XDIST_EXIT_MARKER_DIR"): | ||
| return | ||
| d = os.path.join(tempfile.gettempdir(), f"qd_xdist_exits_{os.getpid()}") | ||
| os.makedirs(d, exist_ok=True) | ||
| os.environ["_QD_XDIST_EXIT_MARKER_DIR"] = d | ||
|
|
||
|
|
||
| if report.outcome not in ("rerun", "error", "failed"): | ||
| def pytest_unconfigure(config): | ||
| """Clean up the marker directory at session end.""" | ||
| if os.environ.get("PYTEST_XDIST_WORKER"): | ||
| return | ||
| d = _exit_marker_dir() | ||
| if d and os.path.isdir(d): | ||
| import shutil | ||
|
|
||
| layoff = False | ||
| shutil.rmtree(d, ignore_errors=True) | ||
|
Comment on lines
+135
to
+143
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🟡 Latent regression of the duplicate-failure protection on the second […]
Extended reasoning...
What the bug is
How it manifests
run_count = 1
if args.with_offline_cache:
run_count += args.rerun_with_offline_cache # default 1, so run_count = 2
...
for _ in range(run_count):
ret = _test_python(args) # calls pytest.main(...)
if ret == 5: ret = 0
if ret != 0: exit(ret)
When […]
Step-by-step proof
Why existing code doesn't prevent it
The two early-return guards in […]
Scope and severity
CI lanes do not exercise […]
Fix
One line in […]:
os.environ.pop('_QD_XDIST_EXIT_MARKER_DIR', None)
This is the symmetric counterpart to the […] |
||
|
|
||
| chain = getattr(getattr(report, "longrepr", None), "chain", None) | ||
| if chain: | ||
| for _, loc, _ in chain: | ||
| msg = getattr(loc, "message", "") if loc else "" | ||
| if "CUDA_ERROR_OUT_OF_MEMORY" in msg: | ||
| layoff = True | ||
| break | ||
|
|
||
| # Don't call interactor.retire() — it uses os._exit(0) which kills | ||
| # the process before execnet's IO thread can flush the channel buffer. | ||
| # The test failure report (queued by xdist's own hook, which ran before | ||
| # this trylast hook) would be lost, hiding all error messages. | ||
| interactor.sendevent("workerretire", layoff=layoff) | ||
| time.sleep(0.2) | ||
| os._exit(0) | ||
| @pytest.hookimpl(wrapper=True, tryfirst=True) | ||
| def pytest_runtest_logreport(report): | ||
| """Handle xdist worker retirement and crash-report suppression. | ||
|
|
||
| On the controller: swallow synthetic crash reports that were already marked for suppression by | ||
| pytest_handlecrashitem. | ||
|
|
||
| import importlib | ||
| import sys | ||
| On workers: after a test failure, write an intentional-exit marker and kill the process so it | ||
| restarts with clean GPU state. The real test report is sent by inner hooks (including xdist's | ||
| report-forwarding hook) during ``yield`` before we exit. | ||
| """ | ||
| if getattr(report, "_qd_suppress", False): | ||
| return None | ||
|
|
||
| import pytest | ||
| result = yield | ||
|
|
||
| if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"): | ||
| d = _exit_marker_dir() | ||
| if d: | ||
| worker_id = os.environ["PYTEST_XDIST_WORKER"] | ||
| try: | ||
| with open(os.path.join(d, worker_id), "w") as f: | ||
| f.write(report.nodeid) | ||
| except OSError: | ||
| pass | ||
| os._exit(1) | ||
|
|
||
| return result | ||
|
Comment on lines
145
to
+173
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🔴 The wrapper hook at tests/python/conftest.py:146-173 (@pytest.hookimpl(wrapper=True, tryfirst=True)) has [an early `return None` before its `yield`; the rest of this summary was truncated in extraction]
Extended reasoning...
What the bug is
The hook at […]:
@pytest.hookimpl(wrapper=True, tryfirst=True)
def pytest_runtest_logreport(report):
if getattr(report, "_qd_suppress", False):
return None # <-- early return BEFORE yield
result = yield
if os.environ.get("PYTEST_XDIST_WORKER") and report.outcome in ("rerun", "error", "failed"):
...
os._exit(1)
return result
A […]
How the bad path is triggered
The PR introduces two cooperating pieces:
The trigger chain (verified against
Why the existing code does not prevent it
Impact
Every xdist run with a worker-dying-after-failure (which is the central path of this PR's design) hits the bug on the first such failure and aborts the test loop with INTERNALERROR. With N>=2 failing tests under […] It also means the new regression tests at […]
Step-by-step concrete proof
Setup: pytest 9.x + pytest-xdist 3.8+ + pluggy 1.6, exact PR conftest, three tests of which two always fail, run with […]
This was independently reproduced by three verifiers using the exact PR code with stock pytest+xdist+pluggy.
How to fix
Two clean options:
The same identical bug exists in the embedded conftest at |
||
|
|
||
|
|
||
| def pytest_handlecrashitem(crashitem, report, sched): | ||
| """Suppress the synthetic crash report only for intentional ``os._exit(1)`` exits. | ||
|
|
||
| When a worker is killed intentionally (to reset GPU state after a failure), it writes a marker | ||
| file before exiting. If the marker exists, we flag the synthetic report for suppression and | ||
| return a truthy value to stop the firstresult hook chain. Genuine crashes (segfaults, OOM, | ||
| etc.) have no marker, so their reports pass through unmodified. | ||
| """ | ||
| d = _exit_marker_dir() | ||
| if not d: | ||
| return | ||
| node = getattr(report, "node", None) | ||
| if not node: | ||
| return | ||
| worker_id = node.gateway.id | ||
| marker = os.path.join(d, worker_id) | ||
| if not os.path.exists(marker): | ||
| return | ||
| try: | ||
| os.unlink(marker) | ||
| except OSError: | ||
| pass | ||
| report._qd_suppress = True | ||
| return True | ||
|
|
||
|
|
||
| import importlib | ||
|
|
||
|
|
||
| @pytest.fixture | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.